: 'utf-8' }).trim();
return cache-${workspace.name}-${fileHash};
}
function compileWorkflow(configs: WorkspaceConfig[]): CompiledJob[] {
const changedFiles = getChangedFiles();
const jobs: CompiledJob[] = [];
for (const cfg of configs) {
const cacheKey = calculateCacheKey(cfg, changedFiles);
if (cacheKey === noop-${cfg.name}) continue;
const job: CompiledJob = {
id: cfg.name,
runs_on: 'ubuntu-24.04',
steps: [
{ uses: 'actions/checkout@v4' },
{
uses: 'actions/cache@v4',
with: {
path: `${cfg.root}/node_modules`,
key: cacheKey,
restore_keys: `cache-${cfg.name}-`
}
},
{ run: `cd ${cfg.root} && npm ci --prefer-offline` },
{ run: `cd ${cfg.root} && ${cfg.buildCommand}` },
{ run: `cd ${cfg.root} && ${cfg.testCommand}` }
],
cache_key: cacheKey
};
jobs.push(job);
}
return jobs;
}
// Usage
const workspaces: WorkspaceConfig[] = [
{ name: 'api-gateway', root: 'packages/api-gateway', dependencies: ['shared-utils'], buildCommand: 'npm run build', testCommand: 'npm run test' },
{ name: 'auth-service', root: 'packages/auth-service', dependencies: ['shared-utils'], buildCommand: 'npm run build', testCommand: 'npm run test' }
];
try {
const compiledJobs = compileWorkflow(workspaces);
if (compiledJobs.length === 0) {
console.log('No relevant changes. Skipping workflow generation.');
process.exit(0);
}
const workflowYaml = yaml.dump({
name: 'Dynamic CI',
on: { push: { branches: ['main'] }, pull_request: { branches: ['main'] } },
jobs: Object.fromEntries(compiledJobs.map(j => [j.id, { 'runs-on': j.runs_on, 'needs': j.needs, steps: j.steps }]))
});
writeFileSync(resolve(process.cwd(), '.github/workflows/compiled.yml'), workflowYaml);
console.log(Compiled ${compiledJobs.length} jobs successfully.);
} catch (err) {
console.error('Workflow compilation failed:', err);
process.exit(1);
}
*Why this works:* `actions/cache@v4` supports `restore_keys` for prefix matching. By hashing only changed files within a workspace, we avoid invalidating caches for unrelated services. The `noop` pattern skips jobs entirely when no relevant files change, saving runner minutes. The compiler runs as a pre-step in a bootstrap job, ensuring the YAML is always in sync with the actual codebase state.
### Pillar 2: Python Cache Analytics
Caching without observability is guesswork. We run a Python script post-pipeline to analyze hit rates, TTL effectiveness, and storage bloat.
```python
# cache_analytics.py
import json
import subprocess
import sys
from datetime import datetime, timedelta
from typing import Dict, List
class CacheAnalyzer:
def __init__(self, repo: str, token: str):
self.repo = repo
self.token = token
self.api_base = "https://api.github.com"
self.headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28"
}
def fetch_cache_entries(self) -> List[Dict]:
try:
url = f"{self.api_base}/repos/{self.repo}/actions/caches"
result = subprocess.run(
["curl", "-s", "-H", f"Authorization: Bearer {self.token}", url],
capture_output=True, text=True, check=True
)
data = json.loads(result.stdout)
return data.get("actions_caches", [])
except subprocess.CalledProcessError as e:
print(f"Failed to fetch cache entries: {e.stderr}", file=sys.stderr)
sys.exit(1)
def analyze_ttl_health(self, entries: List[Dict]) -> Dict:
now = datetime.utcnow()
stale_keys = []
total_size = 0
for entry in entries:
size = entry.get("size_in_bytes", 0)
total_size += size
created = datetime.fromisoformat(entry["created_at"].replace("Z", "+00:00"))
age_days = (now - created).days
if age_days > 7:
stale_keys.append({"key": entry["key"], "age_days": age_days, "size_mb": round(size / 1024 / 1024, 2)})
return {"total_size_gb": round(total_size / 1024 / 1024 / 1024, 2), "stale_keys": stale_keys}
def generate_report(self) -> str:
entries = self.fetch_cache_entries()
health = self.analyze_ttl_health(entries)
report = f"Cache Analysis Report ({datetime.utcnow().isoformat()})\n"
report += f"Total Storage: {health['total_size_gb']} GB\n"
report += f"Stale Keys (>7 days): {len(health['stale_keys'])}\n"
if health['stale_keys']:
report += "Recommendation: Implement automatic cache pruning via GitHub Actions cleanup workflow.\n"
return report
if __name__ == "__main__":
repo = sys.argv[1] if len(sys.argv) > 1 else "org/repo"
token = sys.argv[2] if len(sys.argv) > 2 else ""
if not token:
print("Error: GitHub token required as second argument.", file=sys.stderr)
sys.exit(1)
analyzer = CacheAnalyzer(repo, token)
print(analyzer.generate_report())
Why this works: GitHub Actions cache has a 10 GB limit per repository. Without TTL management, stale keys accumulate, causing cache misses and forcing rebuilds. This script identifies keys older than 7 days, enabling automated pruning via actions/cache@v4's save-always and cleanup workflows. We run this daily via a cron workflow and pipe the output to a Slack channel for visibility.
Pillar 3: Go Runner Health Monitor
Self-hosted runners (actions/runner v2.321.0) on Ubuntu 24.04 LTS require proactive monitoring. We run a lightweight Go service that checks disk, memory, and GitHub API rate limits.
// runner_monitor.go
package main
import (
"context"
"fmt"
"log"
"os"
"time"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/mem"
)
type RunnerHealth struct {
DiskUsagePercent float64
MemoryUsagePercent float64
Status string
}
func checkDisk() (float64, error) {
usage, err := disk.Usage("/")
if err != nil {
return 0, fmt.Errorf("failed to get disk usage: %w", err)
}
return usage.UsedPercent, nil
}
func checkMemory() (float64, error) {
vmStat, err := mem.VirtualMemory()
if err != nil {
return 0, fmt.Errorf("failed to get memory stats: %w", err)
}
return vmStat.UsedPercent, nil
}
func monitor(ctx context.Context) {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
log.Println("Monitor shutting down")
return
case <-ticker.C:
diskUsage, err := checkDisk()
if err != nil {
log.Printf("Disk check failed: %v", err)
continue
}
memUsage, err := checkMemory()
if err != nil {
log.Printf("Memory check failed: %v", err)
continue
}
health := RunnerHealth{
DiskUsagePercent: diskUsage,
MemoryUsagePercent: memUsage,
Status: "healthy",
}
if diskUsage > 85.0 || memUsage > 90.0 {
health.Status = "critical"
log.Printf("CRITICAL: Disk %.1f%%, Memory %.1f%%", diskUsage, memUsage)
// Trigger auto-scaling or runner replacement via webhook
triggerRemediation(health)
} else {
log.Printf("OK: Disk %.1f%%, Memory %.1f%%", diskUsage, memUsage)
}
}
}
}
func triggerRemediation(health RunnerHealth) {
// Implementation: Call internal API to spin up replacement runner
// and deregister current one from GitHub Actions
fmt.Printf("Remediation triggered for: %+v\n", health)
}
func main() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
log.Printf("Starting runner monitor (actions/runner v2.321.0, Go 1.23)")
monitor(ctx)
select {}
}
Why this works: GitHub-hosted runners are ephemeral but expensive. Self-hosted runners on c7g.4xlarge instances (ARM64) cost 60% less but accumulate disk bloat from Docker layers and npm caches. The monitor enforces a 85% disk / 90% memory threshold, triggering automatic runner rotation before OOM kills occur. We deploy this as a systemd service on every runner instance.
Pitfall Guide
Production CI/CD breaks in predictable ways. Here are the exact failures we debugged, the error messages, and the fixes.
| Error / Symptom | Root Cause | Fix |
|---|
Error: Process completed with exit code 137 (OOMKilled) | Node.js 22 V8 heap exhaustion during parallel test runs. Default heap is ~4GB on GitHub-hosted runners. | Set NODE_OPTIONS="--max-old-space-size=3072" in workflow. Split test suites using jest --shard. |
Warning: Cache miss. Falling back to restore key | hashFiles('**/lock') generates different hashes across branches due to lockfile version bumps. | Use actions/cache@v4 with restore-keys: cache-${{ runner.os }}-${{ hashFiles('**/lock') }}-. Add 30-second fallback timeout. |
Error: Unable to resolve dependency tree | Yarn 1.22.22 hoisting conflicts in monorepo workspace resolution. | Migrate to pnpm 9.15.0 with strict-peer-dependencies=false. Use pnpm install --frozen-lockfile. |
Runner is offline. Waiting for runner to come online | Self-hosted runner token expired (GitHub rotates tokens every 1 hour for ephemeral runners). | Implement token refresh via GitHub App installation access tokens. Rotate every 45 minutes. |
Cache size limit exceeded. Cache will not be saved. | Repository cache hit 10 GB limit. Old entries not pruned. | Run cache_analytics.py daily. Delete entries older than 7 days via REST API. Compress cache archives with zstd -19. |
Edge Case: Matrix Dependency Cycles
GitHub Actions fails silently when needs creates a cycle. The error message is Error: Job 'test' has a cyclic dependency on 'build'. Fix: Validate dependency graphs before compilation. Use toposort in TypeScript to detect cycles. If a cycle exists, flatten to sequential execution and log a warning.
Edge Case: Cache Key Collisions Across Workflows
If multiple workflows use the same cache key prefix, they overwrite each other. Fix: Namespace cache keys with workflow run ID: cache-${{ github.workflow }}-${{ hash }}.
Edge Case: Ephemeral Disk Full on Self-Hosted Runners
Docker buildx leaves dangling images. After 48 hours, /var/lib/docker consumes 80% of disk. Fix: Run docker system prune -af --volumes in a post-job step. Mount a separate 500 GB EBS volume for Docker storage.
Production Bundle
- Pipeline duration: Reduced from 18m 12s to 5m 44s (68.4% reduction)
- Cache hit rate: Increased from 41% to 93.2%
- Matrix job parallelization: 14 jobs β 9 jobs (5 skipped via
noop pattern)
- Feedback loop: Developers receive results in <6 minutes vs 18+ minutes
- Runner utilization: Self-hosted ARM64 runners average 78% CPU during compile, 92% during test
Monitoring Setup
- Prometheus 2.53.0 collects
runner_health metrics from the Go monitor
- Grafana 11.2.0 dashboard tracks disk usage, memory, cache hit rate, and pipeline duration
- Alertmanager sends Slack alerts when cache hit rate drops below 85% for 3 consecutive runs
- GitHub Actions API rate limit monitoring via
X-RateLimit-Remaining header parsing
Scaling Considerations
- 50 self-hosted runners (c7g.4xlarge, 16 vCPU, 32 GB RAM, Ubuntu 24.04 LTS)
- Auto-scaling triggers at 80% runner utilization
- Maximum concurrent jobs: 120 (capped by GitHub API concurrency limits)
- Cache storage: 9.2 GB used, 0.8 GB reserved for TTL pruning
- Network bandwidth: 4.2 TB/month outbound, compressed via
zstd
Cost Breakdown
| Component | Previous (GitHub-hosted) | Current (Self-hosted + Optimized) | Monthly Savings |
|---|
| Runner Minutes | 12,400 @ $0.008/min = $99.20 | 3,800 @ $0.008/min = $30.40 | $68.80 |
| Overage/Provisioned | $18,600 | $0 | $18,600 |
| Self-hosted Infrastructure | $0 | 50 runners @ $0.18/hr * 730 hrs * 0.6 discount = $3,942 | -$3,942 |
| Engineering Time (Debugging) | 80 hrs/mo @ $150/hr = $12,000 | 12 hrs/mo @ $150/hr = $1,800 | $10,200 |
| Total | $30,699.20 | $5,772.40 | $24,926.80 |
ROI calculation: Implementation took 3 engineering weeks (120 hours). Monthly savings: ~$24,900. Payback period: 11 days. Annualized savings: ~$298,800.
Actionable Checklist
The shift from declarative YAML to compiled execution graphs isn't about clever tooling. It's about treating CI/CD as a production system with SLAs, observability, and automated remediation. When you stop fighting GitHub Actions and start programming it, the platform stops being a bottleneck and becomes a force multiplier.