diff --git a/.github/workflows/performance-monitor.yml b/.github/workflows/performance-monitor.yml
new file mode 100644
index 00000000..db34cd61
--- /dev/null
+++ b/.github/workflows/performance-monitor.yml
@@ -0,0 +1,137 @@
+name: Performance Monitor
+
+on:
+  schedule:
+    # Run weekly on Mondays at 06:00 UTC
+    - cron: "0 6 * * 1"
+  workflow_dispatch:
+    inputs:
+      iterations:
+        description: "Number of iterations per metric"
+        required: false
+        default: "5"
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  benchmark:
+    name: Run Performance Benchmarks
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Build
+        run: npm run build
+
+      - name: Install AWF locally
+        run: |
+          sudo tee /usr/local/bin/awf > /dev/null <<'WRAPPER'
+          #!/bin/bash
+          exec node "${{ github.workspace }}/dist/cli.js" "$@"
+          WRAPPER
+          sudo chmod +x /usr/local/bin/awf
+
+      - name: Run benchmarks
+        id: benchmark
+        # Forward the workflow_dispatch input; scheduled runs use the default of 5.
+        env:
+          BENCHMARK_ITERATIONS: ${{ github.event.inputs.iterations || '5' }}
+        run: |
+          # stdout is the machine-readable JSON report; stderr carries progress
+          # logs and must NOT be merged into the file (2>&1 would corrupt the
+          # JSON and break the jq parsing in the steps below).
+          npx tsx scripts/ci/benchmark-performance.ts > benchmark-results.json || true
+          cat benchmark-results.json
+
+      - name: Upload results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results-${{ github.sha }}
+          path: benchmark-results.json
+          retention-days: 90
+
+      - name: Check for regressions
+        id: check
+        run: |
+          if [ ! -f benchmark-results.json ]; then
+            echo "No benchmark results found"
+            exit 1
+          fi
+
+          REGRESSIONS=$(jq -r '.regressions | length' benchmark-results.json)
+          echo "regression_count=$REGRESSIONS" >> "$GITHUB_OUTPUT"
+
+          if [ "$REGRESSIONS" -gt 0 ]; then
+            echo "## ⚠️ Performance Regressions Detected" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            jq -r '.regressions[]' benchmark-results.json | while read -r line; do
+              echo "- $line" >> "$GITHUB_STEP_SUMMARY"
+            done
+          else
+            echo "## ✅ All Metrics Within Thresholds" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "### Results" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "| Metric | Mean | Median | P95 | P99 | Target | Critical |" >> "$GITHUB_STEP_SUMMARY"
+          echo "|--------|------|--------|-----|-----|--------|----------|" >> "$GITHUB_STEP_SUMMARY"
+
+          jq -r '.results[] as $r | .thresholds[$r.metric] as $t |
+            "| \($r.metric) | \($r.mean)\($r.unit) | \($r.median)\($r.unit) | \($r.p95)\($r.unit) | \($r.p99)\($r.unit) | \($t.target // "N/A")\(if $t then $r.unit else "" end) | \($t.critical // "N/A")\(if $t then $r.unit else "" end) |"' \
+            benchmark-results.json >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Create regression issue
+        if: steps.check.outputs.regression_count != '0'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const report = JSON.parse(fs.readFileSync('benchmark-results.json', 'utf-8'));
+
+            const body = [
+              `## Performance Regression Detected`,
+              ``,
+              `**Commit:** ${report.commitSha}`,
+              `**Timestamp:** ${report.timestamp}`,
+              `**Iterations:** ${report.iterations}`,
+              ``,
+              `### Regressions`,
+              ``,
+              ...report.regressions.map(r => `- ${r}`),
+              ``,
+              `### Full Results`,
+              ``,
+              `| Metric | Mean | Median | P95 | P99 |`,
+              `|--------|------|--------|-----|-----|`,
+              ...report.results.map(r =>
+                `| ${r.metric} | ${r.mean}${r.unit} | ${r.median}${r.unit} | ${r.p95}${r.unit} | ${r.p99}${r.unit} |`
+              ),
+              ``,
+              `<details><summary>Raw JSON</summary>`,
+              ``,
+              '```json',
+              JSON.stringify(report, null, 2),
+              '```',
+              `</details>`,
+            ].join('\n');
+
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: `Performance regression detected (${new Date().toISOString().split('T')[0]})`,
+              body,
+              labels: ['performance', 'needs-investigation'],
+            });
diff --git a/package.json b/package.json
index 1b80d7c2..e34a639a 100644
--- a/package.json
+++ b/package.json
@@ -22,7 +22,8 @@
     "prepare": "husky",
     "docs:dev": "cd docs-site && npm run dev",
     "docs:build": "cd docs-site && npm run build",
-    "docs:preview": "cd docs-site && npm run preview"
+    "docs:preview": "cd docs-site && npm run preview",
+    "benchmark": "npx tsx scripts/ci/benchmark-performance.ts"
   },
   "keywords": [
     "agentic-workflows",
diff --git a/scripts/ci/benchmark-performance.ts b/scripts/ci/benchmark-performance.ts
new file mode 100644
index 00000000..d0112cb6
--- /dev/null
+++ b/scripts/ci/benchmark-performance.ts
@@ -0,0 +1,295 @@
+#!/usr/bin/env npx tsx
+/**
+ * Performance benchmark script for AWF (Agentic Workflow Firewall).
+ *
+ * Measures key metrics:
+ * - Container startup (cold & warm)
+ * - Squid HTTP / HTTPS proxy latency
+ * - Memory footprint
+ * - Docker network creation time
+ *
+ * Outputs structured JSON with mean, median, p95, p99 per metric.
+ */ + +import { execSync, ExecSyncOptions } from "child_process"; + +// ── Configuration ────────────────────────────────────────────────── + +const ITERATIONS = 5; +const AWF_CMD = "sudo awf"; +const ALLOWED_DOMAIN = "api.github.com"; +const CLEANUP_CMD = "sudo docker compose down -v 2>/dev/null; sudo docker rm -f awf-squid awf-agent 2>/dev/null; sudo docker network prune -f 2>/dev/null"; + +interface BenchmarkResult { + metric: string; + unit: string; + values: number[]; + mean: number; + median: number; + p95: number; + p99: number; +} + +interface BenchmarkReport { + timestamp: string; + commitSha: string; + iterations: number; + results: BenchmarkResult[]; + thresholds: Record; + regressions: string[]; +} + +// ── Thresholds (milliseconds or MB) ─────────────────────────────── + +const THRESHOLDS: Record = { + "container_startup_cold": { target: 15000, critical: 20000 }, + "container_startup_warm": { target: 5000, critical: 8000 }, + "squid_https_latency": { target: 100, critical: 200 }, + "memory_footprint_mb": { target: 500, critical: 1024 }, + "docker_network_creation": { target: 2000, critical: 5000 }, +}; + +// ── Helpers ──────────────────────────────────────────────────────── + +function exec(cmd: string, opts?: ExecSyncOptions): string { + return execSync(cmd, { encoding: "utf-8", timeout: 120_000, ...opts }).trim(); +} + +function timeMs(fn: () => void): number { + const start = performance.now(); + fn(); + return Math.round(performance.now() - start); +} + +function stats(values: number[]): Pick { + const sorted = [...values].sort((a, b) => a - b); + const n = sorted.length; + return { + mean: Math.round(sorted.reduce((a, b) => a + b, 0) / n), + median: sorted[Math.floor(n / 2)], + p95: sorted[Math.min(Math.floor(n * 0.95), n - 1)], + p99: sorted[Math.min(Math.floor(n * 0.99), n - 1)], + }; +} + +function cleanup(): void { + try { + execSync(CLEANUP_CMD, { stdio: "ignore", timeout: 30_000 }); + } catch { + // best-effort + } +} + +// ── Benchmarks 
───────────────────────────────────────────────────── + +function benchmarkColdStart(): BenchmarkResult { + console.error(" Benchmarking cold container startup..."); + const values: number[] = []; + + for (let i = 0; i < ITERATIONS; i++) { + cleanup(); + // Remove cached images to force cold pull + try { + execSync("sudo docker rmi ghcr.io/github/gh-aw-firewall/squid:latest ghcr.io/github/gh-aw-firewall/agent:latest 2>/dev/null", { stdio: "ignore", timeout: 30_000 }); + } catch { + // images may not exist + } + + const ms = timeMs(() => { + exec(`${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- echo ok`, { stdio: "ignore" }); + }); + values.push(ms); + console.error(` Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`); + } + + return { metric: "container_startup_cold", unit: "ms", values, ...stats(values) }; +} + +function benchmarkWarmStart(): BenchmarkResult { + console.error(" Benchmarking warm container startup..."); + const values: number[] = []; + + // Ensure images are pulled + cleanup(); + try { + exec(`${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- echo warmup`, { stdio: "ignore" }); + } catch { + // warmup + } + + for (let i = 0; i < ITERATIONS; i++) { + cleanup(); + const ms = timeMs(() => { + exec(`${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- echo ok`, { stdio: "ignore" }); + }); + values.push(ms); + console.error(` Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`); + } + + return { metric: "container_startup_warm", unit: "ms", values, ...stats(values) }; +} + +function benchmarkHttpsLatency(): BenchmarkResult { + console.error(" Benchmarking HTTPS latency through Squid..."); + const values: number[] = []; + + for (let i = 0; i < ITERATIONS; i++) { + cleanup(); + try { + // Use curl's time_total to measure end-to-end HTTPS request latency + const output = exec( + `${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- ` + + `curl -fsS -o /dev/null -w '%{time_total}' https://${ALLOWED_DOMAIN}/zen` + 
); + const seconds = parseFloat(output); + if (!isNaN(seconds)) { + values.push(Math.round(seconds * 1000)); + } + } catch { + console.error(` Iteration ${i + 1}/${ITERATIONS}: failed (skipped)`); + continue; + } + console.error(` Iteration ${i + 1}/${ITERATIONS}: ${values[values.length - 1]}ms`); + } + + if (values.length === 0) { + values.push(0); + } + + return { metric: "squid_https_latency", unit: "ms", values, ...stats(values) }; +} + +function benchmarkMemory(): BenchmarkResult { + console.error(" Benchmarking memory footprint..."); + const values: number[] = []; + + for (let i = 0; i < ITERATIONS; i++) { + cleanup(); + // Start containers, measure memory, then stop + try { + // Run a sleep command so containers stay up, then check memory + const output = exec( + `${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error --keep-containers -- ` + + `echo measuring_memory` + ); + // Get memory stats for both containers + const squidMem = exec( + "sudo docker stats awf-squid --no-stream --format '{{.MemUsage}}' 2>/dev/null || echo '0MiB'" + ); + const agentMem = exec( + "sudo docker stats awf-agent --no-stream --format '{{.MemUsage}}' 2>/dev/null || echo '0MiB'" + ); + + // Parse memory values (format: "123.4MiB / 7.773GiB") + const parseMb = (s: string): number => { + const match = s.match(/([\d.]+)\s*(MiB|GiB|KiB)/i); + if (!match) return 0; + const val = parseFloat(match[1]); + const unit = match[2].toLowerCase(); + if (unit === "gib") return val * 1024; + if (unit === "kib") return val / 1024; + return val; + }; + + const totalMb = Math.round(parseMb(squidMem) + parseMb(agentMem)); + values.push(totalMb); + console.error(` Iteration ${i + 1}/${ITERATIONS}: ${totalMb}MB (squid: ${squidMem}, agent: ${agentMem})`); + } catch { + console.error(` Iteration ${i + 1}/${ITERATIONS}: failed (skipped)`); + } + cleanup(); + } + + if (values.length === 0) { + values.push(0); + } + + return { metric: "memory_footprint_mb", unit: "MB", values, ...stats(values) }; +} 
+ +function benchmarkNetworkCreation(): BenchmarkResult { + console.error(" Benchmarking Docker network creation..."); + const values: number[] = []; + + for (let i = 0; i < ITERATIONS; i++) { + const netName = `awf-bench-net-${i}`; + try { + execSync(`sudo docker network rm ${netName} 2>/dev/null`, { stdio: "ignore" }); + } catch { + // may not exist + } + const ms = timeMs(() => { + exec(`sudo docker network create --subnet=172.${31 + i}.0.0/24 ${netName}`, { stdio: "ignore" }); + }); + values.push(ms); + console.error(` Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`); + try { + execSync(`sudo docker network rm ${netName} 2>/dev/null`, { stdio: "ignore" }); + } catch { + // best-effort cleanup + } + } + + return { metric: "docker_network_creation", unit: "ms", values, ...stats(values) }; +} + +// ── Main ─────────────────────────────────────────────────────────── + +async function main(): Promise { + const commitSha = exec("git rev-parse HEAD"); + console.error(`AWF Performance Benchmark`); + console.error(` Commit: ${commitSha}`); + console.error(` Iterations: ${ITERATIONS}`); + console.error(""); + + const results: BenchmarkResult[] = []; + + results.push(benchmarkNetworkCreation()); + results.push(benchmarkWarmStart()); + results.push(benchmarkColdStart()); + results.push(benchmarkHttpsLatency()); + results.push(benchmarkMemory()); + + // Final cleanup + cleanup(); + + // Check for regressions against critical thresholds + const regressions: string[] = []; + for (const r of results) { + const threshold = THRESHOLDS[r.metric]; + if (threshold && r.p95 > threshold.critical) { + regressions.push( + `${r.metric}: p95=${r.p95}${r.unit} exceeds critical threshold of ${threshold.critical}${r.unit}` + ); + } + } + + const report: BenchmarkReport = { + timestamp: new Date().toISOString(), + commitSha, + iterations: ITERATIONS, + results, + thresholds: THRESHOLDS, + regressions, + }; + + // Output JSON to stdout + console.log(JSON.stringify(report, null, 2)); + + if 
(regressions.length > 0) { + console.error(""); + console.error("⚠️ Performance regressions detected:"); + for (const r of regressions) { + console.error(` - ${r}`); + } + process.exit(1); + } else { + console.error(""); + console.error("✅ All metrics within acceptable thresholds."); + } +} + +main().catch((err) => { + console.error("Benchmark failed:", err); + process.exit(1); +});