diff --git a/.github/workflows/performance-monitor.yml b/.github/workflows/performance-monitor.yml
new file mode 100644
index 00000000..db34cd61
--- /dev/null
+++ b/.github/workflows/performance-monitor.yml
@@ -0,0 +1,132 @@
+name: Performance Monitor
+
+on:
+  schedule:
+    # Run weekly on Mondays at 06:00 UTC
+    - cron: "0 6 * * 1"
+  workflow_dispatch:
+    inputs:
+      iterations:
+        description: "Number of iterations per metric"
+        required: false
+        default: "5"
+
+# Least-privilege token: read the repo, and open issues when a
+# performance regression is detected.
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  benchmark:
+    name: Run Performance Benchmarks
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Build
+        run: npm run build
+
+      # Expose the built CLI as a global `awf` command so the benchmark
+      # script can invoke it the same way end users do (via sudo).
+      - name: Install AWF locally
+        run: |
+          sudo tee /usr/local/bin/awf > /dev/null <<'WRAPPER'
+          #!/bin/bash
+          exec node "${{ github.workspace }}/dist/cli.js" "$@"
+          WRAPPER
+          sudo chmod +x /usr/local/bin/awf
+      - name: Run benchmarks
+        id: benchmark
+        env:
+          # Wire the workflow_dispatch input through to the script;
+          # scheduled runs have no inputs, so fall back to 5.
+          BENCHMARK_ITERATIONS: ${{ inputs.iterations || '5' }}
+        run: |
+          # Only stdout (the JSON report) goes into the results file.
+          # Redirecting 2>&1 here would mix the script's stderr progress
+          # logs into the file and break the jq parsing in later steps.
+          npx tsx scripts/ci/benchmark-performance.ts > benchmark-results.json || true
+          cat benchmark-results.json
+
+      # Keep raw results around for 90 days so regressions can be bisected.
+      - name: Upload results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results-${{ github.sha }}
+          path: benchmark-results.json
+          retention-days: 90
+
+      - name: Check for regressions
+        id: check
+        run: |
+          if [ ! -f benchmark-results.json ]; then
+            echo "No benchmark results found"
+            exit 1
+          fi
+
+          # Expose the regression count so the issue-creation step can gate on it.
+          REGRESSIONS=$(jq -r '.regressions | length' benchmark-results.json)
+          echo "regression_count=$REGRESSIONS" >> "$GITHUB_OUTPUT"
+
+          if [ "$REGRESSIONS" -gt 0 ]; then
+            echo "## ⚠️ Performance Regressions Detected" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            jq -r '.regressions[]' benchmark-results.json | while read -r line; do
+              echo "- $line" >> "$GITHUB_STEP_SUMMARY"
+            done
+          else
+            echo "## ✅ All Metrics Within Thresholds" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "### Results" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "| Metric | Mean | Median | P95 | P99 | Target | Critical |" >> "$GITHUB_STEP_SUMMARY"
+          echo "|--------|------|--------|-----|-----|--------|----------|" >> "$GITHUB_STEP_SUMMARY"
+
+          # For each result row, look up its threshold ('.' is still the report
+          # root inside the 'as' bindings) and render one markdown table row.
+          jq -r '.results[] as $r | .thresholds[$r.metric] as $t |
+            "| \($r.metric) | \($r.mean)\($r.unit) | \($r.median)\($r.unit) | \($r.p95)\($r.unit) | \($r.p99)\($r.unit) | \($t.target // "N/A")\(if $t then $r.unit else "" end) | \($t.critical // "N/A")\(if $t then $r.unit else "" end) |"' \
+            benchmark-results.json >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Create regression issue
+        if: steps.check.outputs.regression_count != '0'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const report = JSON.parse(fs.readFileSync('benchmark-results.json', 'utf-8'));
+
+            const body = [
+              `## Performance Regression Detected`,
+              ``,
+              `**Commit:** ${report.commitSha}`,
+              `**Timestamp:** ${report.timestamp}`,
+              `**Iterations:** ${report.iterations}`,
+              ``,
+              `### Regressions`,
+              ``,
+              ...report.regressions.map(r => `- ${r}`),
+              ``,
+              `### Full Results`,
+              ``,
+              `| Metric | Mean | Median | P95 | P99 |`,
+              `|--------|------|--------|-----|-----|`,
+              ...report.results.map(r =>
+                `| ${r.metric} | ${r.mean}${r.unit} | ${r.median}${r.unit} | ${r.p95}${r.unit} | ${r.p99}${r.unit} |`
+              ),
+              ``,
+              // Collapse the raw report so the issue stays readable.
+              `<details><summary>Raw JSON</summary>`,
+              ``,
+              '```json',
+              JSON.stringify(report, null, 2),
+              '```',
+              `</details>`,
+            ].join('\n');
+
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: `Performance regression detected (${new Date().toISOString().split('T')[0]})`,
+              body,
+              labels: ['performance', 'needs-investigation'],
+            });
diff --git a/package.json b/package.json
index 1b80d7c2..e34a639a 100644
--- a/package.json
+++ b/package.json
@@ -22,7 +22,8 @@
"prepare": "husky",
"docs:dev": "cd docs-site && npm run dev",
"docs:build": "cd docs-site && npm run build",
- "docs:preview": "cd docs-site && npm run preview"
+ "docs:preview": "cd docs-site && npm run preview",
+ "benchmark": "npx tsx scripts/ci/benchmark-performance.ts"
},
"keywords": [
"agentic-workflows",
diff --git a/scripts/ci/benchmark-performance.ts b/scripts/ci/benchmark-performance.ts
new file mode 100644
index 00000000..d0112cb6
--- /dev/null
+++ b/scripts/ci/benchmark-performance.ts
@@ -0,0 +1,295 @@
+#!/usr/bin/env npx tsx
+/**
+ * Performance benchmark script for AWF (Agentic Workflow Firewall).
+ *
+ * Measures key metrics:
+ * - Container startup (cold & warm)
+ * - Squid HTTP / HTTPS proxy latency
+ * - Memory footprint
+ * - Docker network creation time
+ *
+ * Outputs structured JSON with mean, median, p95, p99 per metric on stdout;
+ * all progress logging goes to stderr.
+ */
+
+import { execSync, ExecSyncOptions } from "child_process";
+
+// ── Configuration ──────────────────────────────────────────────────
+
+// Iteration count is overridable via BENCHMARK_ITERATIONS (wired to the
+// workflow_dispatch "iterations" input); falls back to 5 on missing/bad input.
+const ITERATIONS = Math.max(1, Number.parseInt(process.env.BENCHMARK_ITERATIONS ?? "", 10) || 5);
+const AWF_CMD = "sudo awf";
+const ALLOWED_DOMAIN = "api.github.com";
+const CLEANUP_CMD = "sudo docker compose down -v 2>/dev/null; sudo docker rm -f awf-squid awf-agent 2>/dev/null; sudo docker network prune -f 2>/dev/null";
+
+/** Aggregated sample statistics for a single metric. */
+interface BenchmarkResult {
+  metric: string;
+  unit: string;
+  values: number[];
+  mean: number;
+  median: number;
+  p95: number;
+  p99: number;
+}
+
+/** Target (desired) and critical (regression) bounds for one metric. */
+interface MetricThreshold {
+  target: number;
+  critical: number;
+}
+
+/** Top-level JSON report emitted on stdout. */
+interface BenchmarkReport {
+  timestamp: string;
+  commitSha: string;
+  iterations: number;
+  results: BenchmarkResult[];
+  thresholds: Record<string, MetricThreshold>;
+  regressions: string[];
+}
+
+// ── Thresholds (milliseconds or MB) ───────────────────────────────
+
+const THRESHOLDS: Record<string, MetricThreshold> = {
+  "container_startup_cold": { target: 15000, critical: 20000 },
+  "container_startup_warm": { target: 5000, critical: 8000 },
+  "squid_https_latency": { target: 100, critical: 200 },
+  "memory_footprint_mb": { target: 500, critical: 1024 },
+  "docker_network_creation": { target: 2000, critical: 5000 },
+};
+
+// ── Helpers ────────────────────────────────────────────────────────
+
+/**
+ * Run a shell command synchronously and return its trimmed stdout.
+ *
+ * When callers pass `stdio: "ignore"`, execSync returns null at runtime;
+ * the previous unconditional `.trim()` would then throw a TypeError, so
+ * we return "" in that case instead.
+ */
+function exec(cmd: string, opts?: ExecSyncOptions): string {
+  const out = execSync(cmd, { encoding: "utf-8", timeout: 120_000, ...opts });
+  return typeof out === "string" ? out.trim() : "";
+}
+
+/** Execute `fn` and return its wall-clock duration in whole milliseconds. */
+function timeMs(fn: () => void): number {
+  const start = performance.now();
+  fn();
+  return Math.round(performance.now() - start);
+}
+
+/**
+ * Compute mean/median/p95/p99 over a non-empty sample set.
+ * Percentile indices are clamped to the last element for small n.
+ */
+function stats(values: number[]): Pick<BenchmarkResult, "mean" | "median" | "p95" | "p99"> {
+  const sorted = [...values].sort((a, b) => a - b);
+  const n = sorted.length;
+  return {
+    mean: Math.round(sorted.reduce((a, b) => a + b, 0) / n),
+    median: sorted[Math.floor(n / 2)],
+    p95: sorted[Math.min(Math.floor(n * 0.95), n - 1)],
+    p99: sorted[Math.min(Math.floor(n * 0.99), n - 1)],
+  };
+}
+
+/**
+ * Best-effort teardown of compose services, benchmark containers, and
+ * stale networks. Errors are deliberately swallowed: the resources may
+ * simply not exist between iterations.
+ */
+function cleanup(): void {
+  try {
+    execSync(CLEANUP_CMD, { stdio: "ignore", timeout: 30_000 });
+  } catch {
+    // best-effort
+  }
+}
+
+// ── Benchmarks ─────────────────────────────────────────────────────
+
+/**
+ * Time a full AWF run after removing the cached container images, so each
+ * iteration includes the image pull ("cold" start).
+ */
+function benchmarkColdStart(): BenchmarkResult {
+  console.error("  Benchmarking cold container startup...");
+  const values: number[] = [];
+
+  for (let i = 0; i < ITERATIONS; i++) {
+    cleanup();
+    // Remove cached images to force a cold pull on the next run.
+    try {
+      execSync("sudo docker rmi ghcr.io/github/gh-aw-firewall/squid:latest ghcr.io/github/gh-aw-firewall/agent:latest 2>/dev/null", { stdio: "ignore", timeout: 30_000 });
+    } catch {
+      // images may not exist
+    }
+
+    const ms = timeMs(() => {
+      exec(`${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- echo ok`, { stdio: "ignore" });
+    });
+    values.push(ms);
+    console.error(`    Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`);
+  }
+
+  return { metric: "container_startup_cold", unit: "ms", values, ...stats(values) };
+}
+
+/**
+ * Time a full AWF run with container images already pulled ("warm" start).
+ * A throwaway warmup run ensures the images are cached first.
+ */
+function benchmarkWarmStart(): BenchmarkResult {
+  console.error("  Benchmarking warm container startup...");
+  const values: number[] = [];
+
+  // Ensure images are pulled before timing anything.
+  cleanup();
+  try {
+    exec(`${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- echo warmup`, { stdio: "ignore" });
+  } catch {
+    // warmup failure is non-fatal; the timed runs below will surface it
+  }
+
+  for (let i = 0; i < ITERATIONS; i++) {
+    cleanup();
+    const ms = timeMs(() => {
+      exec(`${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- echo ok`, { stdio: "ignore" });
+    });
+    values.push(ms);
+    console.error(`    Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`);
+  }
+
+  return { metric: "container_startup_warm", unit: "ms", values, ...stats(values) };
+}
+
+/**
+ * Measure end-to-end HTTPS request latency through the Squid proxy using
+ * curl's %{time_total}. Failed or unparsable iterations are skipped rather
+ * than recorded (previously the per-iteration success log also ran when
+ * parseFloat produced NaN, printing a stale value).
+ */
+function benchmarkHttpsLatency(): BenchmarkResult {
+  console.error("  Benchmarking HTTPS latency through Squid...");
+  const values: number[] = [];
+
+  for (let i = 0; i < ITERATIONS; i++) {
+    cleanup();
+    try {
+      // Use curl's time_total to measure end-to-end HTTPS request latency
+      const output = exec(
+        `${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error -- ` +
+          `curl -fsS -o /dev/null -w '%{time_total}' https://${ALLOWED_DOMAIN}/zen`
+      );
+      const seconds = parseFloat(output);
+      if (Number.isNaN(seconds)) {
+        console.error(`    Iteration ${i + 1}/${ITERATIONS}: unparsable output (skipped)`);
+        continue;
+      }
+      const ms = Math.round(seconds * 1000);
+      values.push(ms);
+      console.error(`    Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`);
+    } catch {
+      console.error(`    Iteration ${i + 1}/${ITERATIONS}: failed (skipped)`);
+    }
+  }
+
+  // Keep stats() well-defined even if every request failed.
+  if (values.length === 0) {
+    values.push(0);
+  }
+
+  return { metric: "squid_https_latency", unit: "ms", values, ...stats(values) };
+}
+
+/**
+ * Measure the combined resident memory of the squid and agent containers.
+ * Runs AWF with --keep-containers so the containers stay up long enough
+ * for `docker stats` to sample them, then tears everything down.
+ */
+function benchmarkMemory(): BenchmarkResult {
+  console.error("  Benchmarking memory footprint...");
+  const values: number[] = [];
+
+  for (let i = 0; i < ITERATIONS; i++) {
+    cleanup();
+    try {
+      // Keep containers alive after the command so docker stats can sample
+      // them (the command's own output is irrelevant here).
+      exec(
+        `${AWF_CMD} --allow-domains ${ALLOWED_DOMAIN} --log-level error --keep-containers -- ` +
+          `echo measuring_memory`
+      );
+      // Get memory stats for both containers.
+      const squidMem = exec(
+        "sudo docker stats awf-squid --no-stream --format '{{.MemUsage}}' 2>/dev/null || echo '0MiB'"
+      );
+      const agentMem = exec(
+        "sudo docker stats awf-agent --no-stream --format '{{.MemUsage}}' 2>/dev/null || echo '0MiB'"
+      );
+
+      // Parse the "used" part of docker's "123.4MiB / 7.773GiB" format,
+      // normalizing everything to MiB.
+      const parseMb = (s: string): number => {
+        const match = s.match(/([\d.]+)\s*(MiB|GiB|KiB)/i);
+        if (!match) return 0;
+        const val = parseFloat(match[1]);
+        const unit = match[2].toLowerCase();
+        if (unit === "gib") return val * 1024;
+        if (unit === "kib") return val / 1024;
+        return val;
+      };
+
+      const totalMb = Math.round(parseMb(squidMem) + parseMb(agentMem));
+      values.push(totalMb);
+      console.error(`    Iteration ${i + 1}/${ITERATIONS}: ${totalMb}MB (squid: ${squidMem}, agent: ${agentMem})`);
+    } catch {
+      console.error(`    Iteration ${i + 1}/${ITERATIONS}: failed (skipped)`);
+    }
+    cleanup();
+  }
+
+  // Keep stats() well-defined even if every iteration failed.
+  if (values.length === 0) {
+    values.push(0);
+  }
+
+  return { metric: "memory_footprint_mb", unit: "MB", values, ...stats(values) };
+}
+
+/**
+ * Time `docker network create` for a throwaway bridge network per iteration.
+ * Each network gets a unique RFC 1918 subnet; the previous 172.(31+i)
+ * scheme left private address space (172.32+ is public) for every
+ * iteration after the first.
+ */
+function benchmarkNetworkCreation(): BenchmarkResult {
+  console.error("  Benchmarking Docker network creation...");
+  const values: number[] = [];
+
+  for (let i = 0; i < ITERATIONS; i++) {
+    const netName = `awf-bench-net-${i}`;
+    try {
+      execSync(`sudo docker network rm ${netName} 2>/dev/null`, { stdio: "ignore" });
+    } catch {
+      // may not exist
+    }
+    const ms = timeMs(() => {
+      exec(`sudo docker network create --subnet=10.${210 + i}.0.0/24 ${netName}`, { stdio: "ignore" });
+    });
+    values.push(ms);
+    console.error(`    Iteration ${i + 1}/${ITERATIONS}: ${ms}ms`);
+    try {
+      execSync(`sudo docker network rm ${netName} 2>/dev/null`, { stdio: "ignore" });
+    } catch {
+      // best-effort cleanup
+    }
+  }
+
+  return { metric: "docker_network_creation", unit: "ms", values, ...stats(values) };
+}
+
+// ── Main ───────────────────────────────────────────────────────────
+
+/**
+ * Run every benchmark, emit the JSON report on stdout, and exit non-zero
+ * when any metric's p95 exceeds its critical threshold.
+ */
+async function main(): Promise<void> {
+  const commitSha = exec("git rev-parse HEAD");
+  console.error(`AWF Performance Benchmark`);
+  console.error(`  Commit: ${commitSha}`);
+  console.error(`  Iterations: ${ITERATIONS}`);
+  console.error("");
+
+  const results: BenchmarkResult[] = [];
+
+  results.push(benchmarkNetworkCreation());
+  results.push(benchmarkWarmStart());
+  results.push(benchmarkColdStart());
+  results.push(benchmarkHttpsLatency());
+  results.push(benchmarkMemory());
+
+  // Final cleanup
+  cleanup();
+
+  // Flag any metric whose p95 exceeds its critical threshold.
+  const regressions: string[] = [];
+  for (const r of results) {
+    const threshold = THRESHOLDS[r.metric];
+    if (threshold && r.p95 > threshold.critical) {
+      regressions.push(
+        `${r.metric}: p95=${r.p95}${r.unit} exceeds critical threshold of ${threshold.critical}${r.unit}`
+      );
+    }
+  }
+
+  const report: BenchmarkReport = {
+    timestamp: new Date().toISOString(),
+    commitSha,
+    iterations: ITERATIONS,
+    results,
+    thresholds: THRESHOLDS,
+    regressions,
+  };
+
+  // Machine-readable report goes to stdout (the CI workflow redirects it to
+  // a file); all human-readable progress stays on stderr.
+  console.log(JSON.stringify(report, null, 2));
+
+  if (regressions.length > 0) {
+    console.error("");
+    console.error("⚠️ Performance regressions detected:");
+    for (const r of regressions) {
+      console.error(`  - ${r}`);
+    }
+    process.exit(1);
+  } else {
+    console.error("");
+    console.error("✅ All metrics within acceptable thresholds.");
+  }
+}
+
+// Entry point: any unhandled failure is logged and exits non-zero so CI
+// treats the run as failed.
+main().catch((err) => {
+  console.error("Benchmark failed:", err);
+  process.exit(1);
+});