diff --git a/plugins/llm-security/CLAUDE.md b/plugins/llm-security/CLAUDE.md index 24c93a5..f5423af 100644 --- a/plugins/llm-security/CLAUDE.md +++ b/plugins/llm-security/CLAUDE.md @@ -70,7 +70,7 @@ formula; resolution deferred to Batch B. | `pre-install-supply-chain.mjs` | PreToolUse | `Bash` | Block compromised packages across ALL ecosystems. Bash evasion normalization before gate matching | | `pre-write-pathguard.mjs` | PreToolUse | `Write` | Block writes to .env, .ssh/, .aws/, credentials, settings | | `post-mcp-verify.mjs` | PostToolUse | — (all) | Injection scan on ALL tool output (incl. MEDIUM patterns, HITL traps, sub-agent spawn, NL indirection, cognitive load, hybrid P2SQL/recursive/XSS). HTML content trap detection. Bash-specific: secrets/URLs/size. MCP: description drift detection (MCP05), per-tool volume tracking | -| `post-session-guard.mjs` | PostToolUse | — (all) | Runtime trifecta detection (Rule of Two). Sliding window (20 calls) + 100-call long-horizon. MCP-concentrated trifecta (same server = elevated severity). Sensitive path + exfil detection. Slow-burn trifecta (legs >50 calls apart = MEDIUM). Behavioral drift detection (Jensen-Shannon divergence). CaMeL-inspired data flow tagging (SHA-256 provenance tracking, output→input linking). Mode: `LLM_SECURITY_TRIFECTA_MODE=block\|warn\|off` (default: warn). Cumulative data volume tracking (100KB/500KB/1MB thresholds). Sub-agent delegation tracking (Task/Agent tools): escalation-after-input advisory when delegation occurs within 5 calls of untrusted input (DeepMind Agent Traps kat. 4) | +| `post-session-guard.mjs` | PostToolUse | — (all) | Runtime trifecta detection (Rule of Two). Sliding window (20 calls) + 100-call long-horizon. MCP-concentrated trifecta (same server = elevated severity). Sensitive path + exfil detection. Slow-burn trifecta (legs >50 calls apart = MEDIUM). Behavioral drift detection (Jensen-Shannon divergence). 
CaMeL-inspired data flow tagging (SHA-256 provenance tracking, output→input linking). Mode: `LLM_SECURITY_TRIFECTA_MODE=block\|warn\|off` (default: warn). Cumulative data volume tracking (100KB/500KB/1MB thresholds). Sub-agent delegation tracking (Task/Agent tools): escalation-after-input advisory when delegation occurs within `LLM_SECURITY_ESCALATION_WINDOW` calls (default 5) of untrusted input (DeepMind Agent Traps kat. 4); secondary 20-call MEDIUM advisory catches slow-burn variants outside the primary window (E17, v7.2.0) | | `update-check.mjs` | UserPromptSubmit | — | Checks for newer versions (max 1x/24h, cached). Disable: `LLM_SECURITY_UPDATE_CHECK=off` | > `pre-install-supply-chain.mjs` covers 7 package managers: npm/yarn/pnpm, pip/pip3/uv, brew, docker, go, cargo, gem. Per-ecosystem blocklists, age gate (<72h), npm audit (critical=block, high=warn), PyPI API inspection, Levenshtein typosquat detection, Docker image verification. diff --git a/plugins/llm-security/hooks/scripts/post-session-guard.mjs b/plugins/llm-security/hooks/scripts/post-session-guard.mjs index 333adf1..0b91194 100644 --- a/plugins/llm-security/hooks/scripts/post-session-guard.mjs +++ b/plugins/llm-security/hooks/scripts/post-session-guard.mjs @@ -61,7 +61,17 @@ const DRIFT_THRESHOLD = 0.25; const DRIFT_SAMPLE_SIZE = 20; // Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4) -const DELEGATION_ESCALATION_WINDOW = 5; // calls after input_source +// E17 (v7.2.0): primary window configurable via LLM_SECURITY_ESCALATION_WINDOW +// (default 5). Secondary 20-call window emits MEDIUM advisory for delegation +// in the [primary, 20]-call range. Both reference an input_source; the +// secondary catches slow-burn variants where the attacker waits past the +// primary window before delegating. 
+const DELEGATION_ESCALATION_WINDOW = (() => { + const envVal = parseInt(process.env.LLM_SECURITY_ESCALATION_WINDOW, 10); + if (Number.isFinite(envVal) && envVal > 0) return envVal; + return getPolicyValue('trifecta', 'escalation_window', 5); +})(); +const DELEGATION_ESCALATION_WINDOW_MEDIUM = 20; // secondary longer-window advisory // Rule of Two enforcement mode: block | warn | off (env var takes precedence over policy) const policyTrifectaMode = getPolicyValue('trifecta', 'mode', 'warn'); @@ -452,25 +462,46 @@ function formatWarning(evidence, mcpInfo, isSensitiveExfil) { * Check for escalation-after-input: delegation within DELEGATION_ESCALATION_WINDOW * calls of an input_source. Untrusted content consumed shortly before spawning a * sub-agent may indicate the model is being manipulated into delegating dangerous work. - * @param {object[]} entries — recent window (20-call) + * + * E17 (v7.2.0): returns a `tier` indicating which window matched. + * - `'primary'` — input within DELEGATION_ESCALATION_WINDOW calls (default 5). + * Existing MEDIUM advisory. + * - `'secondary'` — input within DELEGATION_ESCALATION_WINDOW_MEDIUM calls + * (20) but outside the primary window. New, slow-burn variant — + * also MEDIUM but with a different message. + * - `null` (when detected=false) — no input source within either window. 
+ * + * @param {object[]} entries — recent state entries ending with the current one; the caller reads max(WINDOW_SIZE, DELEGATION_ESCALATION_WINDOW_MEDIUM + 5) entries, not the 100-call long-horizon window * @param {{ classes: string[] }} currentEntry — the entry just appended - * @returns {{ detected: boolean, inputDetail: string }} + * @returns {{ detected: boolean, inputDetail: string, tier: 'primary'|'secondary'|null }} */ function checkEscalationAfterInput(entries, currentEntry) { if (!currentEntry.classes.includes('delegation')) { - return { detected: false, inputDetail: '' }; + return { detected: false, inputDetail: '', tier: null }; } - // Walk backwards through the last DELEGATION_ESCALATION_WINDOW entries - // looking for an input_source const toolEntries = entries.filter(e => !e.type); - const recentN = toolEntries.slice(-(DELEGATION_ESCALATION_WINDOW + 1), -1); // exclude current - for (const entry of recentN) { + // Look at the last DELEGATION_ESCALATION_WINDOW_MEDIUM entries before + // current (excluding current). Iterate from newest (closest to delegation) + // to oldest, so we report tier=primary if a match is in the inner window. NOTE(review): limit is fixed at DELEGATION_ESCALATION_WINDOW_MEDIUM, so a primary window configured above it is effectively capped. + const limit = DELEGATION_ESCALATION_WINDOW_MEDIUM; + const slice = toolEntries.slice(-(limit + 1), -1); // exclude current + // Walk newest-to-oldest. Index from the end: distance 1 = most recent. + for (let i = slice.length - 1; i >= 0; i--) { + const entry = slice[i]; if ((entry.classes || []).includes('input_source')) { - return { detected: true, inputDetail: entry.detail || entry.tool || 'unknown' }; + // distance: how many tool calls between input_source and current + // delegation. distance=1 means input is directly before delegation. + const distance = slice.length - i; + const tier = distance <= DELEGATION_ESCALATION_WINDOW ? 
'primary' : 'secondary'; + return { + detected: true, + inputDetail: entry.detail || entry.tool || 'unknown', + tier, + }; } } - return { detected: false, inputDetail: '' }; + return { detected: false, inputDetail: '', tier: null }; } /** @@ -486,9 +517,25 @@ function hasEscalationWarning(entries) { * Format the escalation-after-input warning. * @param {string} delegationDetail — what the delegation was for * @param {string} inputDetail — what input source preceded it + * @param {'primary'|'secondary'} tier — which window matched (E17, v7.2.0) * @returns {string} */ -function formatEscalationWarning(delegationDetail, inputDetail) { +function formatEscalationWarning(delegationDetail, inputDetail, tier = 'primary') { + if (tier === 'secondary') { + return ( + 'SECURITY ADVISORY (session-guard): Slow-burn escalation-after-input detected [MEDIUM] — ' + + 'sub-agent delegation in the slow-burn window after untrusted input.\n\n' + + `A Task/Agent delegation occurred within ${DELEGATION_ESCALATION_WINDOW_MEDIUM} calls (` + + `but outside the ${DELEGATION_ESCALATION_WINDOW}-call primary window) of untrusted input:\n` + + ` Input source: ${inputDetail}\n` + + ` Delegation: ${delegationDetail}\n\n` + + 'This is a slower variant of the escalation-after-input pattern. The wider window\n' + + 'catches attackers who deliberately wait past the primary window before delegating,\n' + + `and surfaces patterns that the primary ${DELEGATION_ESCALATION_WINDOW}-call window cannot. Review whether this\n` + + 'delegation is expected and appropriately scoped.\n' + + 'Configure window via LLM_SECURITY_ESCALATION_WINDOW env var (default 5).' 
+ ); + } return ( 'SECURITY ADVISORY (session-guard): Escalation-after-input detected [MEDIUM] — ' + 'sub-agent delegation shortly after untrusted input.\n\n' + @@ -498,7 +545,8 @@ function formatEscalationWarning(delegationDetail, inputDetail) { 'Untrusted content (web pages, MCP tool output) may be influencing the model\n' + 'to spawn sub-agents with capabilities beyond the original task scope.\n' + 'This is a known attack vector (DeepMind AI Agent Traps, Category 4).\n' + - 'Review whether this delegation is expected and appropriately scoped.' + 'Review whether this delegation is expected and appropriately scoped.\n' + + 'Configure window via LLM_SECURITY_ESCALATION_WINDOW env var (default 5).' ); } @@ -850,18 +898,22 @@ if (!(classes.length === 1 && (classes[0] === 'neutral' || classes[0] === 'deleg } } -// --- Escalation-after-input detection (delegation within 5 calls of input_source) --- +// --- Escalation-after-input detection (E17 v7.2.0: primary + secondary window) --- +// Primary window: DELEGATION_ESCALATION_WINDOW (default 5, env-configurable). +// Secondary window: DELEGATION_ESCALATION_WINDOW_MEDIUM (20). Slow-burn variant +// emits MEDIUM advisory with a different message. Read enough entries to cover +// the secondary window. 
if (classes.includes('delegation')) { - const window = readLastEntries(stateFile, WINDOW_SIZE); - const escalation = checkEscalationAfterInput(window, entry); - if (escalation.detected && !hasEscalationWarning(window)) { - messages.push(formatEscalationWarning(detail, escalation.inputDetail)); - appendEntry(stateFile, { type: 'escalation_warning', ts: Date.now() }); + const escalationWindow = readLastEntries(stateFile, Math.max(WINDOW_SIZE, DELEGATION_ESCALATION_WINDOW_MEDIUM + 5)); + const escalation = checkEscalationAfterInput(escalationWindow, entry); + if (escalation.detected && !hasEscalationWarning(escalationWindow)) { + messages.push(formatEscalationWarning(detail, escalation.inputDetail, escalation.tier)); + appendEntry(stateFile, { type: 'escalation_warning', ts: Date.now(), tier: escalation.tier }); writeAuditEvent({ event_type: 'escalation_after_input', severity: 'medium', source: 'post-session-guard', - details: { tool: detail, input_source: escalation.inputDetail }, + details: { tool: detail, input_source: escalation.inputDetail, tier: escalation.tier }, owasp: ['ASI01'], action_taken: 'warned', }); diff --git a/plugins/llm-security/tests/hooks/post-session-guard.test.mjs b/plugins/llm-security/tests/hooks/post-session-guard.test.mjs index 48d9f4d..61a8376 100644 --- a/plugins/llm-security/tests/hooks/post-session-guard.test.mjs +++ b/plugins/llm-security/tests/hooks/post-session-guard.test.mjs @@ -960,12 +960,15 @@ describe('post-session-guard — escalation-after-input (S4)', () => { } finally { teardown(); } }); - it('does NOT trigger when input_source is >5 calls ago', async () => { + it('does NOT trigger when input_source is >20 calls ago (outside both windows, E17 v7.2.0)', async () => { + // Pre-E17 this test asserted >5 calls ago. After E17 the secondary + // 20-call MEDIUM advisory catches input within [primary, 20]; only + // input >20 calls ago is a true negative. 
setup(); try { const entries = []; entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://example.com')); - for (let i = 0; i < 8; i++) { + for (let i = 0; i < 25; i++) { entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt')); } writeStateFile(entries); @@ -978,7 +981,7 @@ describe('post-session-guard — escalation-after-input (S4)', () => { const advisory = parseAdvisory(result.stdout); if (advisory) { assert.ok(!advisory.systemMessage.includes('Escalation-after-input'), - 'should NOT trigger escalation when input is >5 calls ago'); + 'should NOT trigger when input is >20 calls ago (outside secondary window)'); } } finally { teardown(); } }); @@ -1103,6 +1106,143 @@ describe('post-session-guard — escalation-after-input (S4)', () => { assert.equal(result.code, 0, 'escalation should never block (MEDIUM only)'); } finally { teardown(); } }); + + // ------------------------------------------------------------------------- + // E17 (v7.2.0) — configurable primary window + secondary 20-call advisory + // ------------------------------------------------------------------------- + + it('E17 — secondary window catches delegation 6-20 calls after input (slow-burn)', async () => { + setup(); + try { + const entries = []; + entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com')); + // 8 Read calls — input is 9 calls before Task. Primary window (5) is + // exceeded; secondary window (20) still catches it. 
+ for (let i = 0; i < 8; i++) { + entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt')); + } + writeStateFile(entries); + + const result = await runHook(SCRIPT, payload({ + toolName: 'Task', + toolInput: { description: 'Slow-burn delegation' }, + })); + assert.equal(result.code, 0); + const advisory = parseAdvisory(result.stdout); + assert.ok(advisory, 'should emit secondary-window advisory'); + assert.ok( + advisory.systemMessage.includes('Slow-burn') || + advisory.systemMessage.includes('slow-burn'), + `expected slow-burn message, got: ${advisory.systemMessage.slice(0, 200)}`, + ); + } finally { teardown(); } + }); + + it('E17 — secondary window boundary: exactly 20 calls triggers advisory', async () => { + setup(); + try { + const entries = []; + entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com')); + // 19 Read calls — input is 20 calls before Task. At the boundary. + for (let i = 0; i < 19; i++) { + entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt')); + } + writeStateFile(entries); + + const result = await runHook(SCRIPT, payload({ + toolName: 'Task', + toolInput: { description: 'Boundary test' }, + })); + assert.equal(result.code, 0); + const advisory = parseAdvisory(result.stdout); + assert.ok(advisory, 'should detect at exactly the 20-call boundary'); + } finally { teardown(); } + }); + + it('E17 — primary advisory still fires within first 5 calls (regression guard)', async () => { + setup(); + try { + const entries = []; + entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com')); + entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt')); + writeStateFile(entries); + + const result = await runHook(SCRIPT, payload({ + toolName: 'Task', + toolInput: { description: 'Fast escalation' }, + })); + assert.equal(result.code, 0); + const advisory = parseAdvisory(result.stdout); + assert.ok(advisory, 'primary advisory must still fire'); + // Primary message — NOT 
slow-burn + assert.ok( + !advisory.systemMessage.includes('Slow-burn') && !advisory.systemMessage.includes('slow-burn'), + `expected primary (not slow-burn) message, got: ${advisory.systemMessage.slice(0, 200)}`, + ); + assert.ok( + advisory.systemMessage.includes('Escalation-after-input'), + `expected primary escalation message, got: ${advisory.systemMessage.slice(0, 200)}`, + ); + } finally { teardown(); } + }); + + it('E17 — LLM_SECURITY_ESCALATION_WINDOW=3 narrows primary window', async () => { + setup(); + try { + const entries = []; + entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com')); + // 3 Read calls — input is 4 calls before Task. + // With default window=5 → primary advisory. + // With env=3 → outside primary, inside secondary (slow-burn advisory). + for (let i = 0; i < 3; i++) { + entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt')); + } + writeStateFile(entries); + + const { runHookWithEnv } = await import('./hook-helper.mjs'); + const result = await runHookWithEnv(SCRIPT, payload({ + toolName: 'Task', + toolInput: { description: 'env-overridden window' }, + }), { LLM_SECURITY_ESCALATION_WINDOW: '3' }); + assert.equal(result.code, 0); + const advisory = parseAdvisory(result.stdout); + assert.ok(advisory, 'should still emit advisory'); + // With narrowed primary, a 4-call distance falls into the secondary window. + assert.ok( + advisory.systemMessage.includes('Slow-burn') || + advisory.systemMessage.includes('slow-burn'), + `expected slow-burn (since 4 > narrowed primary=3), got: ${advisory.systemMessage.slice(0, 200)}`, + ); + } finally { teardown(); } + }); + + it('E17 — LLM_SECURITY_ESCALATION_WINDOW=8 expands primary window', async () => { + setup(); + try { + const entries = []; + entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com')); + // 6 Read calls — input is 7 calls before Task. + // With default window=5 → outside primary, inside secondary (slow-burn). 
+ // With env=8 → inside primary (primary advisory). + for (let i = 0; i < 6; i++) { + entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt')); + } + writeStateFile(entries); + + const { runHookWithEnv } = await import('./hook-helper.mjs'); + const result = await runHookWithEnv(SCRIPT, payload({ + toolName: 'Task', + toolInput: { description: 'env-expanded window' }, + }), { LLM_SECURITY_ESCALATION_WINDOW: '8' }); + assert.equal(result.code, 0); + const advisory = parseAdvisory(result.stdout); + assert.ok(advisory, 'should emit advisory'); + assert.ok( + !advisory.systemMessage.includes('Slow-burn') && !advisory.systemMessage.includes('slow-burn'), + `expected primary message (7 ≤ env=8), got: ${advisory.systemMessage.slice(0, 200)}`, + ); + } finally { teardown(); } + }); }); // ---------------------------------------------------------------------------