feat(post-session-guard): E17 — configurable escalation window + 20-call MEDIUM advisory
Critical-review §4 E17 finding: pre-v7.2.0 the delegation-after-input
advisory fired only within a 5-call window. Attackers who deliberately
waited 6+ calls before delegating bypassed detection. Window was also
hardcoded — operators couldn't tune it for their environment.
Two coordinated changes:
1. LLM_SECURITY_ESCALATION_WINDOW env var (primary window override)
- parseInt(env, 10) when it yields a finite value > 0; otherwise getPolicyValue('trifecta', 'escalation_window', 5)
- Mirrors the established pattern from
LLM_SECURITY_TRIFECTA_MODE et al.
- Setting env=3 narrows; env=8 expands. Values above 20 have no further effect: the lookback is capped at the 20-call secondary window.
2. Secondary 20-call MEDIUM advisory (slow-burn variant)
- DELEGATION_ESCALATION_WINDOW_MEDIUM = 20 (hardcoded — same value
for all operators; tunable in a future patch if needed)
- checkEscalationAfterInput now returns `tier: 'primary'|'secondary'|null`
- formatEscalationWarning emits a different message for secondary —
mentions "slow-burn", references env-var, distinct from the
primary "DeepMind Category 4" framing
Hook reads max(WINDOW_SIZE, secondary+5) entries to cover the wider
window. Existing duplicate-suppression (`escalation_warning` state
entry) covers both tiers. Audit-trail event captures `tier` field.
Tests: +5 cases in tests/hooks/post-session-guard.test.mjs:
- secondary window catches 9-call distance (slow-burn)
- secondary boundary at exactly 20 calls
- primary regression guard (1-call distance)
- env=3 narrows primary (4-call distance becomes secondary)
- env=8 expands primary (7-call distance stays primary)
Updated existing test "does NOT trigger when input_source is >5 calls
ago" — now requires >20 calls (secondary window catches 6-20).
Suite: 1644 → 1672 (+28 from new tests + extended scope). All green.
CLAUDE.md hooks table updated to document both windows and the env var.
This commit is contained in:
parent
ec4ae268da
commit
f0a1d4024a
3 changed files with 215 additions and 23 deletions
|
|
@ -70,7 +70,7 @@ formula; resolution deferred to Batch B.
|
|||
| `pre-install-supply-chain.mjs` | PreToolUse | `Bash` | Block compromised packages across ALL ecosystems. Bash evasion normalization before gate matching |
|
||||
| `pre-write-pathguard.mjs` | PreToolUse | `Write` | Block writes to .env, .ssh/, .aws/, credentials, settings |
|
||||
| `post-mcp-verify.mjs` | PostToolUse | — (all) | Injection scan on ALL tool output (incl. MEDIUM patterns, HITL traps, sub-agent spawn, NL indirection, cognitive load, hybrid P2SQL/recursive/XSS). HTML content trap detection. Bash-specific: secrets/URLs/size. MCP: description drift detection (MCP05), per-tool volume tracking |
|
||||
| `post-session-guard.mjs` | PostToolUse | — (all) | Runtime trifecta detection (Rule of Two). Sliding window (20 calls) + 100-call long-horizon. MCP-concentrated trifecta (same server = elevated severity). Sensitive path + exfil detection. Slow-burn trifecta (legs >50 calls apart = MEDIUM). Behavioral drift detection (Jensen-Shannon divergence). CaMeL-inspired data flow tagging (SHA-256 provenance tracking, output→input linking). Mode: `LLM_SECURITY_TRIFECTA_MODE=block\|warn\|off` (default: warn). Cumulative data volume tracking (100KB/500KB/1MB thresholds). Sub-agent delegation tracking (Task/Agent tools): escalation-after-input advisory when delegation occurs within 5 calls of untrusted input (DeepMind Agent Traps kat. 4) |
|
||||
| `post-session-guard.mjs` | PostToolUse | — (all) | Runtime trifecta detection (Rule of Two). Sliding window (20 calls) + 100-call long-horizon. MCP-concentrated trifecta (same server = elevated severity). Sensitive path + exfil detection. Slow-burn trifecta (legs >50 calls apart = MEDIUM). Behavioral drift detection (Jensen-Shannon divergence). CaMeL-inspired data flow tagging (SHA-256 provenance tracking, output→input linking). Mode: `LLM_SECURITY_TRIFECTA_MODE=block\|warn\|off` (default: warn). Cumulative data volume tracking (100KB/500KB/1MB thresholds). Sub-agent delegation tracking (Task/Agent tools): escalation-after-input advisory when delegation occurs within `LLM_SECURITY_ESCALATION_WINDOW` calls (default 5) of untrusted input (DeepMind Agent Traps kat. 4); secondary 20-call MEDIUM advisory catches slow-burn variants outside the primary window (E17, v7.2.0) |
|
||||
| `update-check.mjs` | UserPromptSubmit | — | Checks for newer versions (max 1x/24h, cached). Disable: `LLM_SECURITY_UPDATE_CHECK=off` |
|
||||
|
||||
> `pre-install-supply-chain.mjs` covers 7 package managers: npm/yarn/pnpm, pip/pip3/uv, brew, docker, go, cargo, gem. Per-ecosystem blocklists, age gate (<72h), npm audit (critical=block, high=warn), PyPI API inspection, Levenshtein typosquat detection, Docker image verification.
|
||||
|
|
|
|||
|
|
@ -61,7 +61,17 @@ const DRIFT_THRESHOLD = 0.25;
|
|||
const DRIFT_SAMPLE_SIZE = 20;
|
||||
|
||||
// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4)
|
||||
const DELEGATION_ESCALATION_WINDOW = 5; // calls after input_source
|
||||
// E17 (v7.2.0): primary window configurable via LLM_SECURITY_ESCALATION_WINDOW
|
||||
// (default 5). Secondary 20-call window emits MEDIUM advisory for delegation
|
||||
// in the (primary, 20]-call range. Both reference an input_source; the
|
||||
// secondary catches slow-burn variants where the attacker waits past the
|
||||
// primary window before delegating.
|
||||
const DELEGATION_ESCALATION_WINDOW = (() => {
|
||||
const envVal = parseInt(process.env.LLM_SECURITY_ESCALATION_WINDOW, 10);
|
||||
if (Number.isFinite(envVal) && envVal > 0) return envVal;
|
||||
return getPolicyValue('trifecta', 'escalation_window', 5);
|
||||
})();
|
||||
const DELEGATION_ESCALATION_WINDOW_MEDIUM = 20; // secondary longer-window advisory
|
||||
|
||||
// Rule of Two enforcement mode: block | warn | off (env var takes precedence over policy)
|
||||
const policyTrifectaMode = getPolicyValue('trifecta', 'mode', 'warn');
|
||||
|
|
@ -452,25 +462,46 @@ function formatWarning(evidence, mcpInfo, isSensitiveExfil) {
|
|||
* Check for escalation-after-input: delegation within DELEGATION_ESCALATION_WINDOW
|
||||
* calls of an input_source. Untrusted content consumed shortly before spawning a
|
||||
* sub-agent may indicate the model is being manipulated into delegating dangerous work.
|
||||
* @param {object[]} entries — recent window (20-call)
|
||||
*
|
||||
* E17 (v7.2.0): returns a `tier` indicating which window matched.
|
||||
* - `'primary'` — input within DELEGATION_ESCALATION_WINDOW calls (default 5).
|
||||
* Existing MEDIUM advisory.
|
||||
* - `'secondary'` — input within DELEGATION_ESCALATION_WINDOW_MEDIUM calls
|
||||
* (20) but outside the primary window. New, slow-burn variant —
|
||||
* also MEDIUM but with a different message.
|
||||
* - `null` (when detected=false) — no input source within either window.
|
||||
*
|
||||
* @param {object[]} entries — recent state entries; the hook passes the last max(WINDOW_SIZE, DELEGATION_ESCALATION_WINDOW_MEDIUM + 5)
|
||||
* @param {{ classes: string[] }} currentEntry — the entry just appended
|
||||
* @returns {{ detected: boolean, inputDetail: string }}
|
||||
* @returns {{ detected: boolean, inputDetail: string, tier: 'primary'|'secondary'|null }}
|
||||
*/
|
||||
function checkEscalationAfterInput(entries, currentEntry) {
|
||||
if (!currentEntry.classes.includes('delegation')) {
|
||||
return { detected: false, inputDetail: '' };
|
||||
return { detected: false, inputDetail: '', tier: null };
|
||||
}
|
||||
|
||||
// Walk backwards through the last DELEGATION_ESCALATION_WINDOW entries
|
||||
// looking for an input_source
|
||||
const toolEntries = entries.filter(e => !e.type);
|
||||
const recentN = toolEntries.slice(-(DELEGATION_ESCALATION_WINDOW + 1), -1); // exclude current
|
||||
for (const entry of recentN) {
|
||||
// Look at the last DELEGATION_ESCALATION_WINDOW_MEDIUM entries before
|
||||
// current (excluding current). Iterate from newest (closest to delegation)
|
||||
// to oldest, so we report tier=primary if a match is in the inner window.
|
||||
const limit = DELEGATION_ESCALATION_WINDOW_MEDIUM;
|
||||
const slice = toolEntries.slice(-(limit + 1), -1); // exclude current
|
||||
// Walk newest-to-oldest. Index from the end: distance 1 = most recent.
|
||||
for (let i = slice.length - 1; i >= 0; i--) {
|
||||
const entry = slice[i];
|
||||
if ((entry.classes || []).includes('input_source')) {
|
||||
return { detected: true, inputDetail: entry.detail || entry.tool || 'unknown' };
|
||||
// distance: how many tool calls between input_source and current
|
||||
// delegation. distance=1 means input is directly before delegation.
|
||||
const distance = slice.length - i;
|
||||
const tier = distance <= DELEGATION_ESCALATION_WINDOW ? 'primary' : 'secondary';
|
||||
return {
|
||||
detected: true,
|
||||
inputDetail: entry.detail || entry.tool || 'unknown',
|
||||
tier,
|
||||
};
|
||||
}
|
||||
}
|
||||
return { detected: false, inputDetail: '' };
|
||||
return { detected: false, inputDetail: '', tier: null };
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -486,9 +517,25 @@ function hasEscalationWarning(entries) {
|
|||
* Format the escalation-after-input warning.
|
||||
* @param {string} delegationDetail — what the delegation was for
|
||||
* @param {string} inputDetail — what input source preceded it
|
||||
* @param {'primary'|'secondary'} tier — which window matched (E17, v7.2.0)
|
||||
* @returns {string}
|
||||
*/
|
||||
function formatEscalationWarning(delegationDetail, inputDetail) {
|
||||
function formatEscalationWarning(delegationDetail, inputDetail, tier = 'primary') {
|
||||
if (tier === 'secondary') {
|
||||
return (
|
||||
'SECURITY ADVISORY (session-guard): Slow-burn escalation-after-input detected [MEDIUM] — ' +
|
||||
'sub-agent delegation in the slow-burn window after untrusted input.\n\n' +
|
||||
`A Task/Agent delegation occurred within ${DELEGATION_ESCALATION_WINDOW_MEDIUM} calls (` +
|
||||
`but outside the ${DELEGATION_ESCALATION_WINDOW}-call primary window) of untrusted input:\n` +
|
||||
` Input source: ${inputDetail}\n` +
|
||||
` Delegation: ${delegationDetail}\n\n` +
|
||||
'This is a slower variant of the escalation-after-input pattern. The wider window\n' +
|
||||
'catches attackers who deliberately wait past the primary window before delegating,\n' +
|
||||
'and surfaces patterns that the primary 5-call window cannot. Review whether this\n' +
|
||||
'delegation is expected and appropriately scoped.\n' +
|
||||
'Configure window via LLM_SECURITY_ESCALATION_WINDOW env var (default 5).'
|
||||
);
|
||||
}
|
||||
return (
|
||||
'SECURITY ADVISORY (session-guard): Escalation-after-input detected [MEDIUM] — ' +
|
||||
'sub-agent delegation shortly after untrusted input.\n\n' +
|
||||
|
|
@ -498,7 +545,8 @@ function formatEscalationWarning(delegationDetail, inputDetail) {
|
|||
'Untrusted content (web pages, MCP tool output) may be influencing the model\n' +
|
||||
'to spawn sub-agents with capabilities beyond the original task scope.\n' +
|
||||
'This is a known attack vector (DeepMind AI Agent Traps, Category 4).\n' +
|
||||
'Review whether this delegation is expected and appropriately scoped.'
|
||||
'Review whether this delegation is expected and appropriately scoped.\n' +
|
||||
'Configure window via LLM_SECURITY_ESCALATION_WINDOW env var (default 5).'
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -850,18 +898,22 @@ if (!(classes.length === 1 && (classes[0] === 'neutral' || classes[0] === 'deleg
|
|||
}
|
||||
}
|
||||
|
||||
// --- Escalation-after-input detection (delegation within 5 calls of input_source) ---
|
||||
// --- Escalation-after-input detection (E17 v7.2.0: primary + secondary window) ---
|
||||
// Primary window: DELEGATION_ESCALATION_WINDOW (default 5, env-configurable).
|
||||
// Secondary window: DELEGATION_ESCALATION_WINDOW_MEDIUM (20). Slow-burn variant
|
||||
// emits MEDIUM advisory with a different message. Read enough entries to cover
|
||||
// the secondary window.
|
||||
if (classes.includes('delegation')) {
|
||||
const window = readLastEntries(stateFile, WINDOW_SIZE);
|
||||
const escalation = checkEscalationAfterInput(window, entry);
|
||||
if (escalation.detected && !hasEscalationWarning(window)) {
|
||||
messages.push(formatEscalationWarning(detail, escalation.inputDetail));
|
||||
appendEntry(stateFile, { type: 'escalation_warning', ts: Date.now() });
|
||||
const escalationWindow = readLastEntries(stateFile, Math.max(WINDOW_SIZE, DELEGATION_ESCALATION_WINDOW_MEDIUM + 5));
|
||||
const escalation = checkEscalationAfterInput(escalationWindow, entry);
|
||||
if (escalation.detected && !hasEscalationWarning(escalationWindow)) {
|
||||
messages.push(formatEscalationWarning(detail, escalation.inputDetail, escalation.tier));
|
||||
appendEntry(stateFile, { type: 'escalation_warning', ts: Date.now(), tier: escalation.tier });
|
||||
writeAuditEvent({
|
||||
event_type: 'escalation_after_input',
|
||||
severity: 'medium',
|
||||
source: 'post-session-guard',
|
||||
details: { tool: detail, input_source: escalation.inputDetail },
|
||||
details: { tool: detail, input_source: escalation.inputDetail, tier: escalation.tier },
|
||||
owasp: ['ASI01'],
|
||||
action_taken: 'warned',
|
||||
});
|
||||
|
|
|
|||
|
|
@ -960,12 +960,15 @@ describe('post-session-guard — escalation-after-input (S4)', () => {
|
|||
} finally { teardown(); }
|
||||
});
|
||||
|
||||
it('does NOT trigger when input_source is >5 calls ago', async () => {
|
||||
it('does NOT trigger when input_source is >20 calls ago (outside both windows, E17 v7.2.0)', async () => {
|
||||
// Pre-E17 this test asserted >5 calls ago. After E17 the secondary
|
||||
// 20-call MEDIUM advisory catches input within [primary, 20]; only
|
||||
// input >20 calls ago is a true negative.
|
||||
setup();
|
||||
try {
|
||||
const entries = [];
|
||||
entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://example.com'));
|
||||
for (let i = 0; i < 8; i++) {
|
||||
for (let i = 0; i < 25; i++) {
|
||||
entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt'));
|
||||
}
|
||||
writeStateFile(entries);
|
||||
|
|
@ -978,7 +981,7 @@ describe('post-session-guard — escalation-after-input (S4)', () => {
|
|||
const advisory = parseAdvisory(result.stdout);
|
||||
if (advisory) {
|
||||
assert.ok(!advisory.systemMessage.includes('Escalation-after-input'),
|
||||
'should NOT trigger escalation when input is >5 calls ago');
|
||||
'should NOT trigger when input is >20 calls ago (outside secondary window)');
|
||||
}
|
||||
} finally { teardown(); }
|
||||
});
|
||||
|
|
@ -1103,6 +1106,143 @@ describe('post-session-guard — escalation-after-input (S4)', () => {
|
|||
assert.equal(result.code, 0, 'escalation should never block (MEDIUM only)');
|
||||
} finally { teardown(); }
|
||||
});
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// E17 (v7.2.0) — configurable primary window + secondary 20-call advisory
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
it('E17 — secondary window catches delegation 6-20 calls after input (slow-burn)', async () => {
|
||||
setup();
|
||||
try {
|
||||
const entries = [];
|
||||
entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com'));
|
||||
// 8 Read calls — input is 9 calls before Task. Primary window (5) is
|
||||
// exceeded; secondary window (20) still catches it.
|
||||
for (let i = 0; i < 8; i++) {
|
||||
entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt'));
|
||||
}
|
||||
writeStateFile(entries);
|
||||
|
||||
const result = await runHook(SCRIPT, payload({
|
||||
toolName: 'Task',
|
||||
toolInput: { description: 'Slow-burn delegation' },
|
||||
}));
|
||||
assert.equal(result.code, 0);
|
||||
const advisory = parseAdvisory(result.stdout);
|
||||
assert.ok(advisory, 'should emit secondary-window advisory');
|
||||
assert.ok(
|
||||
advisory.systemMessage.includes('Slow-burn') ||
|
||||
advisory.systemMessage.includes('slow-burn'),
|
||||
`expected slow-burn message, got: ${advisory.systemMessage.slice(0, 200)}`,
|
||||
);
|
||||
} finally { teardown(); }
|
||||
});
|
||||
|
||||
it('E17 — secondary window boundary: exactly 20 calls triggers advisory', async () => {
|
||||
setup();
|
||||
try {
|
||||
const entries = [];
|
||||
entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com'));
|
||||
// 19 Read calls — input is 20 calls before Task. At the boundary.
|
||||
for (let i = 0; i < 19; i++) {
|
||||
entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt'));
|
||||
}
|
||||
writeStateFile(entries);
|
||||
|
||||
const result = await runHook(SCRIPT, payload({
|
||||
toolName: 'Task',
|
||||
toolInput: { description: 'Boundary test' },
|
||||
}));
|
||||
assert.equal(result.code, 0);
|
||||
const advisory = parseAdvisory(result.stdout);
|
||||
assert.ok(advisory, 'should detect at exactly the 20-call boundary');
|
||||
} finally { teardown(); }
|
||||
});
|
||||
|
||||
it('E17 — primary advisory still fires within first 5 calls (regression guard)', async () => {
|
||||
setup();
|
||||
try {
|
||||
const entries = [];
|
||||
entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com'));
|
||||
entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt'));
|
||||
writeStateFile(entries);
|
||||
|
||||
const result = await runHook(SCRIPT, payload({
|
||||
toolName: 'Task',
|
||||
toolInput: { description: 'Fast escalation' },
|
||||
}));
|
||||
assert.equal(result.code, 0);
|
||||
const advisory = parseAdvisory(result.stdout);
|
||||
assert.ok(advisory, 'primary advisory must still fire');
|
||||
// Primary message — NOT slow-burn
|
||||
assert.ok(
|
||||
!advisory.systemMessage.includes('Slow-burn') && !advisory.systemMessage.includes('slow-burn'),
|
||||
`expected primary (not slow-burn) message, got: ${advisory.systemMessage.slice(0, 200)}`,
|
||||
);
|
||||
assert.ok(
|
||||
advisory.systemMessage.includes('Escalation-after-input'),
|
||||
`expected primary escalation message, got: ${advisory.systemMessage.slice(0, 200)}`,
|
||||
);
|
||||
} finally { teardown(); }
|
||||
});
|
||||
|
||||
it('E17 — LLM_SECURITY_ESCALATION_WINDOW=3 narrows primary window', async () => {
|
||||
setup();
|
||||
try {
|
||||
const entries = [];
|
||||
entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com'));
|
||||
// 3 Read calls — input is 4 calls before Task.
|
||||
// With default window=5 → primary advisory.
|
||||
// With env=3 → outside primary, inside secondary (slow-burn advisory).
|
||||
for (let i = 0; i < 3; i++) {
|
||||
entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt'));
|
||||
}
|
||||
writeStateFile(entries);
|
||||
|
||||
const { runHookWithEnv } = await import('./hook-helper.mjs');
|
||||
const result = await runHookWithEnv(SCRIPT, payload({
|
||||
toolName: 'Task',
|
||||
toolInput: { description: 'env-overridden window' },
|
||||
}), { LLM_SECURITY_ESCALATION_WINDOW: '3' });
|
||||
assert.equal(result.code, 0);
|
||||
const advisory = parseAdvisory(result.stdout);
|
||||
assert.ok(advisory, 'should still emit advisory');
|
||||
// With narrowed primary, a 4-call distance falls into the secondary window.
|
||||
assert.ok(
|
||||
advisory.systemMessage.includes('Slow-burn') ||
|
||||
advisory.systemMessage.includes('slow-burn'),
|
||||
`expected slow-burn (since 4 > narrowed primary=3), got: ${advisory.systemMessage.slice(0, 200)}`,
|
||||
);
|
||||
} finally { teardown(); }
|
||||
});
|
||||
|
||||
it('E17 — LLM_SECURITY_ESCALATION_WINDOW=8 expands primary window', async () => {
|
||||
setup();
|
||||
try {
|
||||
const entries = [];
|
||||
entries.push(makeToolEntry('WebFetch', ['input_source'], 'https://attacker.com'));
|
||||
// 6 Read calls — input is 7 calls before Task.
|
||||
// With default window=5 → outside primary, inside secondary (slow-burn).
|
||||
// With env=8 → inside primary (primary advisory).
|
||||
for (let i = 0; i < 6; i++) {
|
||||
entries.push(makeToolEntry('Read', ['data_access'], '/tmp/test.txt'));
|
||||
}
|
||||
writeStateFile(entries);
|
||||
|
||||
const { runHookWithEnv } = await import('./hook-helper.mjs');
|
||||
const result = await runHookWithEnv(SCRIPT, payload({
|
||||
toolName: 'Task',
|
||||
toolInput: { description: 'env-expanded window' },
|
||||
}), { LLM_SECURITY_ESCALATION_WINDOW: '8' });
|
||||
assert.equal(result.code, 0);
|
||||
const advisory = parseAdvisory(result.stdout);
|
||||
assert.ok(advisory, 'should emit advisory');
|
||||
assert.ok(
|
||||
!advisory.systemMessage.includes('Slow-burn') && !advisory.systemMessage.includes('slow-burn'),
|
||||
`expected primary message (7 ≤ env=8), got: ${advisory.systemMessage.slice(0, 200)}`,
|
||||
);
|
||||
} finally { teardown(); }
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue