diff --git a/src/cli.ts b/src/cli.ts index 7ec693e8..430f82a2 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1037,6 +1037,12 @@ program ' WARNING: allows firewall bypass via docker run', false ) + .option( + '--enable-dlp', + 'Enable DLP (Data Loss Prevention) scanning to block credential\n' + + ' exfiltration in outbound request URLs.', + false + ) // -- API Proxy -- .option( @@ -1334,6 +1340,11 @@ program logger.warn('⚠️ SSL Bump intercepts HTTPS traffic. Only use for trusted workloads.'); } + // Log DLP mode + if (options.enableDlp) { + logger.info('DLP scanning enabled - outbound requests will be scanned for credential patterns'); + } + // Validate memory limit const memoryLimit = parseMemoryLimit(options.memoryLimit); if (memoryLimit.error) { @@ -1376,6 +1387,7 @@ program allowHostPorts: options.allowHostPorts, sslBump: options.sslBump, enableDind: options.enableDind, + enableDlp: options.enableDlp, allowedUrls, enableApiProxy: options.enableApiProxy, openaiApiKey: process.env.OPENAI_API_KEY, diff --git a/src/dlp.test.ts b/src/dlp.test.ts new file mode 100644 index 00000000..3cfe6ae4 --- /dev/null +++ b/src/dlp.test.ts @@ -0,0 +1,225 @@ +import { DLP_PATTERNS, scanForCredentials, generateDlpSquidConfig } from './dlp'; + +describe('DLP Patterns', () => { + describe('DLP_PATTERNS', () => { + it('should have at least 10 built-in patterns', () => { + expect(DLP_PATTERNS.length).toBeGreaterThanOrEqual(10); + }); + + it('should have name, description, and regex for each pattern', () => { + for (const pattern of DLP_PATTERNS) { + expect(pattern.name).toBeTruthy(); + expect(pattern.description).toBeTruthy(); + expect(pattern.regex).toBeTruthy(); + } + }); + + it('should have valid regex patterns', () => { + for (const pattern of DLP_PATTERNS) { + expect(() => new RegExp(pattern.regex, 'i')).not.toThrow(); + } + }); + }); + + describe('scanForCredentials', () => { + // GitHub tokens + it('should detect GitHub personal access token (ghp_)', () => { + const matches = 
scanForCredentials( + 'https://api.example.com/data?token=ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij' + ); + expect(matches).toContain('GitHub Personal Access Token (classic)'); + }); + + it('should detect GitHub OAuth token (gho_)', () => { + const matches = scanForCredentials( + 'https://api.example.com/gho_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij/resource' + ); + expect(matches).toContain('GitHub OAuth Access Token'); + }); + + it('should detect GitHub App installation token (ghs_)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=ghs_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij' + ); + expect(matches).toContain('GitHub App Installation Token'); + }); + + it('should detect GitHub App user-to-server token (ghu_)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=ghu_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij' + ); + expect(matches).toContain('GitHub App User-to-Server Token'); + }); + + it('should detect GitHub fine-grained PAT (github_pat_)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=github_pat_1234567890abcdefghijkl_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456' + ); + expect(matches).toContain('GitHub Fine-Grained PAT'); + }); + + // OpenAI - use concatenation to avoid push protection triggering on test data + it('should detect OpenAI API key (sk-...T3BlbkFJ)', () => { + const fakeKey = 'sk-' + '1'.repeat(20) + 'T3BlbkFJ' + '2'.repeat(20); + const matches = scanForCredentials( + 'https://api.example.com/?key=' + fakeKey + ); + expect(matches).toContain('OpenAI API Key'); + }); + + it('should detect OpenAI project API key (sk-proj-)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=sk-proj-' + 'a'.repeat(50) + ); + expect(matches).toContain('OpenAI Project API Key'); + }); + + // Anthropic + it('should detect Anthropic API key (sk-ant-)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=sk-ant-' + 'a'.repeat(50) + 
); + expect(matches).toContain('Anthropic API Key'); + }); + + // AWS + it('should detect AWS access key ID (AKIA)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=AKIAIOSFODNN7EXAMPLE' + ); + expect(matches).toContain('AWS Access Key ID'); + }); + + // Google + it('should detect Google API key (AIza)', () => { + const matches = scanForCredentials( + 'https://api.example.com/?key=AIzaSyA' + 'a'.repeat(32) + ); + expect(matches).toContain('Google API Key'); + }); + + // Slack - use concatenation to avoid push protection triggering on test data + it('should detect Slack bot token (xoxb-)', () => { + const fakeToken = 'xoxb-' + '1234567890' + '-' + '1234567890' + '-' + 'ABCDEFGHIJKLMNOPQRSTUV' + 'wx'; + const matches = scanForCredentials( + 'https://api.example.com/?token=' + fakeToken + ); + expect(matches).toContain('Slack Bot Token'); + }); + + // Generic patterns + it('should detect bearer token in URL parameter', () => { + const matches = scanForCredentials( + 'https://api.example.com/data?bearer=abcdefghijklmnopqrstuvwxyz1234' + ); + expect(matches).toContain('Bearer Token in URL'); + }); + + it('should detect authorization in URL parameter', () => { + const matches = scanForCredentials( + 'https://api.example.com/data?authorization=abcdefghijklmnopqrstuvwxyz1234' + ); + expect(matches).toContain('Authorization in URL'); + }); + + it('should detect private key markers', () => { + const matches = scanForCredentials( + 'https://api.example.com/data?content=BEGIN+PRIVATE+KEY' + ); + expect(matches).toContain('Private Key Marker'); + }); + + it('should detect URL-encoded private key markers', () => { + const matches = scanForCredentials( + 'https://api.example.com/data?content=BEGIN%20PRIVATE%20KEY' + ); + expect(matches).toContain('Private Key Marker'); + }); + + // Negative cases + it('should not match short strings that look like token prefixes', () => { + const matches = scanForCredentials('https://api.example.com/ghp_short'); + 
expect(matches).not.toContain('GitHub Personal Access Token (classic)'); + }); + + it('should return empty array for clean URLs', () => { + const matches = scanForCredentials('https://api.github.com/repos/owner/repo'); + expect(matches).toHaveLength(0); + }); + + it('should return empty array for empty string', () => { + const matches = scanForCredentials(''); + expect(matches).toHaveLength(0); + }); + + it('should not match normal domain names or paths', () => { + const urls = [ + 'https://github.com/settings/tokens', + 'https://api.openai.com/v1/chat/completions', + 'https://docs.anthropic.com/getting-started', + 'https://console.aws.amazon.com/', + 'https://slack.com/api/chat.postMessage', + ]; + for (const url of urls) { + const matches = scanForCredentials(url); + expect(matches).toHaveLength(0); + } + }); + + it('should detect multiple credential types in one URL', () => { + const matches = scanForCredentials( + 'https://evil.com/?gh=ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij&aws=AKIAIOSFODNN7EXAMPLE' + ); + expect(matches).toContain('GitHub Personal Access Token (classic)'); + expect(matches).toContain('AWS Access Key ID'); + expect(matches.length).toBeGreaterThanOrEqual(2); + }); + }); + + describe('generateDlpSquidConfig', () => { + it('should generate ACL lines for all patterns', () => { + const { aclLines } = generateDlpSquidConfig(); + + // Should have header comments + expect(aclLines[0]).toContain('DLP'); + + // Should have one url_regex ACL per pattern + const aclEntries = aclLines.filter(l => l.startsWith('acl dlp_blocked')); + expect(aclEntries.length).toBe(DLP_PATTERNS.length); + + // Each ACL should use url_regex -i + for (const entry of aclEntries) { + expect(entry).toMatch(/^acl dlp_blocked url_regex -i .+/); + } + }); + + it('should generate deny access rules', () => { + const { accessRules } = generateDlpSquidConfig(); + + expect(accessRules.some(r => r.includes('http_access deny dlp_blocked'))).toBe(true); + }); + + it('should have a DLP 
/**
 * Data Loss Prevention (DLP) module for detecting credential patterns
 * in outbound request URLs.
 *
 * When DLP is enabled, Squid `url_regex` ACLs block requests whose URL
 * contains a credential-like pattern. This protects against accidental
 * credential leakage in:
 * - URL query parameters (e.g., ?token=ghp_xxxx)
 * - URL path segments (e.g., /api/ghp_xxxx/resource)
 * - Percent-/plus-encoded PEM private key markers in URLs
 *
 * NOTE(review): `url_regex` only sees what Squid sees as the request URL.
 * For HTTPS tunnelled through CONNECT that is presumably just `host:port`
 * unless SSL Bump decrypts the traffic - confirm whether DLP should warn
 * (or require SSL Bump) when it is enabled for HTTPS-only workloads.
 */

/**
 * A DLP credential pattern definition
 */
export interface DlpPattern {
  /** Human-readable name for the pattern (returned by scanForCredentials) */
  name: string;
  /** Description of what this pattern detects */
  description: string;
  /**
   * Regex pattern string. Must be valid both as a JavaScript RegExp source
   * and as a Squid `url_regex` value, and must not contain whitespace
   * (it is written verbatim into a single squid.conf ACL line).
   */
  regex: string;
}

/**
 * Built-in credential patterns for DLP scanning
 *
 * These patterns detect common credential formats that should never
 * appear in URLs.
 *
 * Both consumers in this module apply EVERY pattern case-insensitively
 * (`new RegExp(..., 'i')` and `url_regex -i`), so character classes such as
 * `[0-9A-Z]` also match lowercase input.
 *
 * Pattern design principles:
 * - Match the distinctive prefix of each credential type
 * - Require enough characters after the prefix to avoid false positives
 * - Avoid overly broad patterns that would block legitimate traffic
 */
export const DLP_PATTERNS: DlpPattern[] = [
  // GitHub tokens
  {
    name: 'GitHub Personal Access Token (classic)',
    description: 'GitHub classic personal access token (ghp_)',
    regex: 'ghp_[a-zA-Z0-9]{36}',
  },
  {
    name: 'GitHub OAuth Access Token',
    description: 'GitHub OAuth access token (gho_)',
    regex: 'gho_[a-zA-Z0-9]{36}',
  },
  {
    name: 'GitHub App Installation Token',
    description: 'GitHub App installation access token (ghs_)',
    regex: 'ghs_[a-zA-Z0-9]{36}',
  },
  {
    name: 'GitHub App User-to-Server Token',
    description: 'GitHub App user-to-server token (ghu_)',
    regex: 'ghu_[a-zA-Z0-9]{36}',
  },
  {
    name: 'GitHub Fine-Grained PAT',
    description: 'GitHub fine-grained personal access token (github_pat_)',
    regex: 'github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}',
  },

  // OpenAI
  {
    name: 'OpenAI API Key',
    description: 'OpenAI API key (sk-)',
    regex: 'sk-[a-zA-Z0-9]{20}T3BlbkFJ[a-zA-Z0-9]{20}',
  },
  {
    name: 'OpenAI Project API Key',
    description: 'OpenAI project-scoped API key (sk-proj-)',
    regex: 'sk-proj-[a-zA-Z0-9_-]{40,}',
  },

  // Anthropic
  {
    name: 'Anthropic API Key',
    description: 'Anthropic API key (sk-ant-)',
    regex: 'sk-ant-[a-zA-Z0-9_-]{40,}',
  },

  // AWS
  {
    name: 'AWS Access Key ID',
    description: 'AWS access key ID (AKIA)',
    regex: 'AKIA[0-9A-Z]{16}',
  },

  // Google Cloud
  {
    name: 'Google API Key',
    description: 'Google API key (AIza)',
    regex: 'AIza[a-zA-Z0-9_-]{35}',
  },

  // Slack
  {
    name: 'Slack Bot Token',
    description: 'Slack bot user OAuth token (xoxb-)',
    regex: 'xoxb-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}',
  },
  {
    name: 'Slack User Token',
    description: 'Slack user OAuth token (xoxp-)',
    regex: 'xoxp-[0-9]{10,13}-[0-9]{10,13}-[0-9]{10,13}-[a-f0-9]{32}',
  },

  // Generic patterns for common credential formats
  {
    name: 'Bearer Token in URL',
    description: 'Bearer token passed as URL parameter',
    regex: '[?&]bearer[_=][a-zA-Z0-9._-]{20,}',
  },
  {
    name: 'Authorization in URL',
    description: 'Authorization credential passed as URL parameter',
    regex: '[?&]authorization=[a-zA-Z0-9._-]{20,}',
  },
  {
    name: 'Private Key Marker',
    description: 'Private key content in URL (PEM format marker)',
    regex: 'PRIVATE(%20|\\+|%2B)KEY',
  },
];

/**
 * Compiled, case-insensitive RegExp objects keyed by their source string.
 *
 * Keying by source (rather than compiling DLP_PATTERNS once at load time)
 * keeps scanForCredentials correct if entries are added to or replaced in
 * the exported array after this module has been loaded.
 */
const compiledPatternCache = new Map<string, RegExp>();

/**
 * Returns the cached case-insensitive RegExp for a pattern source,
 * compiling it on first use.
 *
 * The RegExp is created without the `g`/`y` flags, so `test()` keeps no
 * `lastIndex` state and the instance is safe to reuse across calls.
 */
function compileDlpRegex(source: string): RegExp {
  let compiled = compiledPatternCache.get(source);
  if (compiled === undefined) {
    compiled = new RegExp(source, 'i');
    compiledPatternCache.set(source, compiled);
  }
  return compiled;
}

/**
 * Checks if a given string contains any DLP credential patterns.
 *
 * @param input - The string to scan (URL, query parameter, etc.)
 * @returns Names of the matching patterns, in DLP_PATTERNS order;
 *          empty if nothing matches
 */
export function scanForCredentials(input: string): string[] {
  const matches: string[] = [];
  for (const pattern of DLP_PATTERNS) {
    if (compileDlpRegex(pattern.regex).test(input)) {
      matches.push(pattern.name);
    }
  }
  return matches;
}

/**
 * Generates Squid ACL configuration lines for DLP credential scanning.
 *
 * Produces one `acl dlp_blocked url_regex -i <regex>` entry per pattern in
 * DLP_PATTERNS (preceded by two comment lines), plus a single
 * `http_access deny dlp_blocked` rule (preceded by one comment line).
 * Where these lines end up relative to allow rules is decided by the caller
 * that assembles squid.conf.
 *
 * @returns Object with aclLines and accessRules arrays
 * @throws Error if a pattern's regex is empty or contains whitespace. Such a
 *         value cannot be written safely into a one-line ACL: whitespace
 *         would split it into several patterns and a newline would inject an
 *         arbitrary directive, so generation fails instead of silently
 *         emitting a broadened or broken policy.
 */
export function generateDlpSquidConfig(): { aclLines: string[]; accessRules: string[] } {
  const aclLines: string[] = [
    '# DLP (Data Loss Prevention) ACL definitions',
    '# Block requests containing credential patterns in URLs',
  ];

  for (const pattern of DLP_PATTERNS) {
    if (pattern.regex.length === 0 || /\s/.test(pattern.regex)) {
      throw new Error(
        `DLP pattern "${pattern.name}" cannot be written to squid.conf: ` +
          'regex must be non-empty and must not contain whitespace'
      );
    }
    aclLines.push(`acl dlp_blocked url_regex -i ${pattern.regex}`);
  }

  const accessRules: string[] = [
    '# DLP: Deny requests containing detected credentials',
    'http_access deny dlp_blocked',
  ];

  return { aclLines, accessRules };
}
it('should include DLP ACL and deny rules when enableDlp is true', () => { + const config = { + domains: ['github.com'], + port: defaultPort, + enableDlp: true, + }; + const result = generateSquidConfig(config); + // Should have DLP ACL definitions + expect(result).toContain('acl dlp_blocked url_regex -i'); + // Should have DLP deny rule + expect(result).toContain('http_access deny dlp_blocked'); + // Should still have normal domain ACLs + expect(result).toContain('acl allowed_domains dstdomain .github.com'); + }); + + it('should place DLP deny rules before domain allow rules', () => { + const config = { + domains: ['github.com'], + port: defaultPort, + enableDlp: true, + }; + const result = generateSquidConfig(config); + + const dlpDenyIndex = result.indexOf('http_access deny dlp_blocked'); + const domainDenyIndex = result.indexOf('http_access deny !allowed_domains'); + // DLP deny should appear before domain deny + expect(dlpDenyIndex).toBeGreaterThan(-1); + expect(domainDenyIndex).toBeGreaterThan(-1); + expect(dlpDenyIndex).toBeLessThan(domainDenyIndex); + }); + + it('should include credential patterns like ghp_ and AKIA in ACLs', () => { + const config = { + domains: ['github.com'], + port: defaultPort, + enableDlp: true, + }; + const result = generateSquidConfig(config); + // Check for a few key patterns + expect(result).toContain('ghp_'); + expect(result).toContain('AKIA'); + expect(result).toContain('sk-ant-'); + }); + + it('should work with DLP and blocked domains together', () => { + const config = { + domains: ['github.com'], + blockedDomains: ['evil.com'], + port: defaultPort, + enableDlp: true, + }; + const result = generateSquidConfig(config); + // Should have both DLP and blocked domain rules + expect(result).toContain('http_access deny dlp_blocked'); + expect(result).toContain('http_access deny blocked_domains'); + expect(result).toContain('acl dlp_blocked url_regex -i'); + }); + + it('should work with DLP and SSL Bump together', () => { + const 
config = { + domains: ['github.com'], + port: defaultPort, + enableDlp: true, + sslBump: true, + caFiles: { certPath: '/tmp/cert.pem', keyPath: '/tmp/key.pem' }, + sslDbPath: '/var/spool/squid_ssl_db', + }; + const result = generateSquidConfig(config); + // Should have DLP rules + expect(result).toContain('http_access deny dlp_blocked'); + // Should have SSL Bump config + expect(result).toContain('ssl_bump'); + }); +}); diff --git a/src/squid-config.ts b/src/squid-config.ts index 6116ac78..cd036284 100644 --- a/src/squid-config.ts +++ b/src/squid-config.ts @@ -5,6 +5,7 @@ import { PlainDomainEntry, DomainPattern, } from './domain-patterns'; +import { generateDlpSquidConfig } from './dlp'; /** * Ports that should never be allowed, even with --allow-host-ports @@ -205,7 +206,7 @@ ${urlAclSection}${urlAccessRules}`; * // Blocked: internal.example.com -> acl blocked_domains dstdomain .internal.example.com */ export function generateSquidConfig(config: SquidConfig): string { - const { domains, blockedDomains, port, sslBump, caFiles, sslDbPath, urlPatterns, enableHostAccess, allowHostPorts } = config; + const { domains, blockedDomains, port, sslBump, caFiles, sslDbPath, urlPatterns, enableHostAccess, allowHostPorts, enableDlp } = config; // Parse domains into plain domains and wildcard patterns // Note: parseDomainList extracts and preserves protocol info from prefixes (http://, https://) @@ -430,6 +431,15 @@ export function generateSquidConfig(config: SquidConfig): string { portConfig = ''; } + // Generate DLP section if enabled + let dlpAclSection = ''; + let dlpAccessSection = ''; + if (enableDlp) { + const dlp = generateDlpSquidConfig(); + dlpAclSection = '\n' + dlp.aclLines.join('\n') + '\n'; + dlpAccessSection = '\n' + dlp.accessRules.join('\n') + '\n'; + } + // Port ACLs and access rules // Build Safe_ports ACL with user-specified additional ports if provided let portAclsSection = `# Port ACLs @@ -522,7 +532,7 @@ cache_log /var/log/squid/cache.log cache deny all 
${aclSection} - +${dlpAclSection} # Port configuration ${portConfig} ${sslBumpSection} @@ -542,7 +552,7 @@ acl dst_ipv4 dstdom_regex ^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$ acl dst_ipv6 dstdom_regex ^\\[?[0-9a-fA-F:]+\\]?$ http_access deny dst_ipv4 http_access deny dst_ipv6 - +${dlpAccessSection} ${accessRulesSection}# Deny requests to unknown domains (not in allow-list) # This applies to all sources including localnet ${denyRule} diff --git a/src/types.ts b/src/types.ts index ecca3882..7aad86ac 100644 --- a/src/types.ts +++ b/src/types.ts @@ -564,6 +564,23 @@ export interface WrapperConfig { */ anthropicApiTarget?: string; + /** + * Enable Data Loss Prevention (DLP) scanning + * + * When true, Squid proxy will block outgoing requests that contain + * credential-like patterns (API keys, tokens, secrets) in URLs. + * This protects against accidental credential exfiltration via + * query parameters, path segments, or encoded URL content. + * + * Detected patterns include: GitHub tokens (ghp_, gho_, ghs_, ghu_, + * github_pat_), OpenAI keys (sk-), Anthropic keys (sk-ant-), + * AWS access keys (AKIA), Google API keys (AIza), Slack tokens, + * and generic credential patterns. + * + * @default false + */ + enableDlp?: boolean; + /** * Maximum time in minutes to allow the agent command to run * @@ -683,6 +700,16 @@ export interface SquidConfig { */ urlPatterns?: string[]; + /** + * Whether to enable DLP (Data Loss Prevention) scanning + * + * When true, Squid will block requests containing credential patterns + * (API keys, tokens, secrets) in URLs via url_regex ACLs. + * + * @default false + */ + enableDlp?: boolean; + /** * Whether to enable host access (allows non-standard ports) *