ktg-plugin-marketplace/plugins/llm-security/hooks/scripts/post-session-guard.mjs
Kjell Tore Guttormsen f0a1d4024a feat(post-session-guard): E17 — configurable escalation window + 20-call MEDIUM advisory
Critical-review §4 E17 finding: pre-v7.2.0 the delegation-after-input
advisory fired only within a 5-call window. Attackers who deliberately
waited 6+ calls before delegating bypassed detection. Window was also
hardcoded — operators couldn't tune it for their environment.

Two coordinated changes:

1. LLM_SECURITY_ESCALATION_WINDOW env var (primary window override)
   - parseInt(env) || getPolicyValue('trifecta', 'escalation_window', 5)
   - Mirrors the established pattern from
     LLM_SECURITY_TRIFECTA_MODE et al.
   - Setting env=3 narrows; env=8 expands.

2. Secondary 20-call MEDIUM advisory (slow-burn variant)
   - DELEGATION_ESCALATION_WINDOW_MEDIUM = 20 (hardcoded — same value
     for all operators; tunable in a future patch if needed)
   - checkEscalationAfterInput now returns `tier: 'primary'|'secondary'|null`
   - formatEscalationWarning emits a different message for secondary —
     mentions "slow-burn", references env-var, distinct from the
     primary "DeepMind Category 4" framing

Hook reads max(WINDOW_SIZE, secondary+5) entries to cover the wider
window. Existing duplicate-suppression (`escalation_warning` state
entry) covers both tiers. Audit-trail event captures `tier` field.

Tests: +5 cases in tests/hooks/post-session-guard.test.mjs:
- secondary window catches 9-call distance (slow-burn)
- secondary boundary at exactly 20 calls
- primary regression guard (1-call distance)
- env=3 narrows primary (4-call distance becomes secondary)
- env=8 expands primary (7-call distance stays primary)

Updated existing test "does NOT trigger when input_source is >5 calls
ago" — now requires >20 calls (secondary window catches 6-20).

Suite: 1644 → 1672 (+28 from new tests + extended scope). All green.

CLAUDE.md hooks table updated to document both windows and the env var.
2026-04-29 14:26:18 +02:00

1015 lines
38 KiB
JavaScript

#!/usr/bin/env node
// Hook: post-session-guard.mjs
// Event: PostToolUse (ALL tools)
// Purpose: Runtime lethal trifecta detection — monitors tool call sequences
// and warns when untrusted input + sensitive data access + exfiltration
// sink all appear within a sliding window.
//
// Protocol:
// - Read JSON from stdin: { tool_name, tool_input, tool_output }
// - Advisory only: always exit 0. Output systemMessage via stdout to warn.
// - State persisted in ${os.tmpdir()}/llm-security-session-${ppid}.jsonl
//
// Rule of Two (Meta, Oct 2025):
// Of 3 capabilities A (untrusted input), B (sensitive data), C (state change/exfil),
// an agent should NEVER hold all 3 simultaneously. Env var LLM_SECURITY_TRIFECTA_MODE
// controls enforcement: warn (default), block (exit 2 for high-confidence trifecta), off.
//
// Long-horizon monitoring (OpenAI Atlas, Dec 2025):
// 100-call window alongside 20-call for slow-burn trifecta detection and
// behavioral drift via Jensen-Shannon divergence on tool distributions.
//
// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4):
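// Task/Agent tools classified as 'delegation'. Escalation-after-input advisory
// when delegation occurs within the primary window (default 5 calls, override
// via LLM_SECURITY_ESCALATION_WINDOW) of an input_source; a secondary 20-call
// window raises a slow-burn advisory (untrusted content may be influencing
// sub-agent spawning decisions).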
//
// CaMeL-inspired data flow tagging (DeepMind CaMeL, v5.0 S6):
// Lightweight data provenance tracking. On tool output: hash first 200 chars as
// data tag. On next tool input: check substring match against prior tags. Match =
// "data flow link". Trifecta with linked flows = elevated severity.
//
// Trifecta concept (Willison / Invariant Labs):
// 1. Agent exposed to UNTRUSTED INPUT (prompt injection surface)
// 2. Agent has access to SENSITIVE DATA via tools
// 3. An EXFILTRATION SINK exists (HTTP POST, scp, etc.)
//
// OWASP: ASI01 (Excessive Agency), ASI02 (Data Leakage), LLM01 (Prompt Injection)
import { readFileSync, appendFileSync, existsSync, readdirSync, statSync, unlinkSync } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { createHash } from 'node:crypto';
import { extractMcpServer } from '../../scanners/lib/mcp-description-cache.mjs';
import { jensenShannonDivergence, buildDistribution } from '../../scanners/lib/distribution-stats.mjs';
import { writeAuditEvent } from '../../scanners/lib/audit-trail.mjs';
import { getPolicyValue } from '../../scanners/lib/policy-loader.mjs';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
// Number of trailing state-file entries inspected for the short-window
// trifecta check. NOTE(review): the third getPolicyValue argument looks like
// the fallback used when the policy has no value — confirm in policy-loader.
const WINDOW_SIZE = getPolicyValue('trifecta', 'window_size', 20);
// Session state lives in `${STATE_DIR}/${STATE_PREFIX}<ppid>.jsonl`.
const STATE_PREFIX = 'llm-security-session-';
const STATE_DIR = tmpdir();
// State files whose mtime is older than this are removed by cleanupOldStateFiles().
const CLEANUP_MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
// Long-horizon monitoring (OpenAI Atlas, Dec 2025)
const LONG_HORIZON_WINDOW = getPolicyValue('trifecta', 'long_horizon_window', 100);
// Slow-burn trifecta fires only when the legs span strictly more than this many calls.
const SLOW_BURN_MIN_SPREAD = 50;
// Jensen-Shannon divergence above this value is reported as behavioral drift.
const DRIFT_THRESHOLD = 0.25;
// Drift compares the first and the last DRIFT_SAMPLE_SIZE tool calls of the window.
const DRIFT_SAMPLE_SIZE = 20;
// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4)
// E17 (v7.2.0): primary window configurable via LLM_SECURITY_ESCALATION_WINDOW
// (default 5). Secondary 20-call window emits MEDIUM advisory for delegation
// in the [primary, 20]-call range. Both reference an input_source; the
// secondary catches slow-burn variants where the attacker waits past the
// primary window before delegating.
const DELEGATION_ESCALATION_WINDOW = (() => {
// The env var wins only when it parses to a finite, strictly positive integer;
// anything else (unset, empty, "abc", "0", negative) falls through to policy.
const envVal = parseInt(process.env.LLM_SECURITY_ESCALATION_WINDOW, 10);
if (Number.isFinite(envVal) && envVal > 0) return envVal;
return getPolicyValue('trifecta', 'escalation_window', 5);
})();
const DELEGATION_ESCALATION_WINDOW_MEDIUM = 20; // secondary longer-window advisory
// Rule of Two enforcement mode: block | warn | off (env var takes precedence over policy)
const policyTrifectaMode = getPolicyValue('trifecta', 'mode', 'warn');
// Lower-cased so comparisons against 'off' / 'block' are case-insensitive.
const TRIFECTA_MODE = (process.env.LLM_SECURITY_TRIFECTA_MODE || policyTrifectaMode).toLowerCase();
// Volume tracking thresholds (cumulative bytes per session)
// Listed from the largest threshold to the smallest.
const VOLUME_THRESHOLDS = [
{ bytes: 1_000_000, label: '1 MB', severity: 'HIGH' },
{ bytes: 500_000, label: '500 KB', severity: 'MEDIUM' },
{ bytes: 100_000, label: '100 KB', severity: 'LOW' },
];
// ---------------------------------------------------------------------------
// Sensitive path patterns (for data_access classification of Read/Bash)
// ---------------------------------------------------------------------------
/**
 * Case-insensitive fragments that mark a path as sensitive. The patterns are
 * tested both against a bare file path (Read tool) and against a complete
 * shell command line (Bash tool), so they must tolerate trailing shell syntax.
 */
const SENSITIVE_PATH_PATTERNS = [
  // `.env`, `.env.local`, and `.env` followed by whitespace, a quote, a pipe,
  // a redirect, etc. (e.g. `cat .env | base64`). The lookahead still rejects
  // longer names such as `.envrc` / `.environment` and the `.env/` directory.
  /\.env(?![\w/-])/i,
  /\.ssh\//i,
  /\.aws\//i,
  /\.gnupg\//i,
  /credentials/i,
  /secrets?[./]/i,
  /tokens?[./]/i,
  /password/i,
  /keychain/i,
  /\.npmrc/i,
  /\.pypirc/i,
  /id_rsa/i,
  /id_ed25519/i,
  /authorized_keys/i,
  /\.netrc/i,
  /\.pgpass/i,
];
// ---------------------------------------------------------------------------
// Bash command patterns
// ---------------------------------------------------------------------------
// Commands that can push data off the machine. Matching any one of these
// tags the Bash call as an exfiltration sink.
const BASH_EXFIL_PATTERNS = [
// curl with an upload indicator (-X POST/PUT/PATCH, -d, --data, --data-*, -F,
// --form) appearing before any pipe character.
/\bcurl\b[^|]*(?:-X\s*(?:POST|PUT|PATCH)\b|-d\s|--data\b|--data-\w+\b|-F\s|--form\b)/i,
// wget with a --post* option before any pipe character.
/\bwget\b[^|]*--post/i,
/\bnc\s+(?:-[a-zA-Z]*\s+)*\S+\s+\d/i, // nc host port
/\bsendmail\b/i,
/\bscp\s/i,
/\brsync\b[^|]*[^/]\S+:/i, // rsync to remote (user@host:)
/\bgit\s+push\b/i,
/\bsftp\b/i,
];
// Any curl/wget invocation. These patterns do not themselves exclude uploads;
// classifyBashCommand only consults them when no exfil pattern matched.
const BASH_INPUT_PATTERNS = [
/\bcurl\b/i, // curl without POST indicators = downloading
/\bwget\b/i, // wget without --post = downloading
];
// File-reading commands (must be followed by whitespace). classifyBashCommand
// counts them as data access only when a sensitive path pattern also matches.
const BASH_DATA_CMD_PATTERNS = [
/\b(?:cat|head|tail|less|more|bat)\s/i,
];
// ---------------------------------------------------------------------------
// Classification
// ---------------------------------------------------------------------------
/**
 * Map one tool invocation onto the trifecta leg(s) it represents.
 *
 * Dispatch is by tool name; Bash is delegated to classifyBashCommand and any
 * tool whose name starts with `mcp__` counts as untrusted input. Anything
 * unrecognised is `neutral`.
 *
 * @param {string} toolName
 * @param {object} toolInput
 * @returns {{ classes: string[], detail: string }} leg list plus a short,
 *   truncated description used as evidence in advisories
 */
function classifyToolCall(toolName, toolInput) {
  switch (toolName) {
    // Web tools always pull in external content.
    case 'WebFetch':
    case 'WebSearch': {
      const target = toolInput?.url || toolInput?.query || '';
      return { classes: ['input_source'], detail: target.slice(0, 80) };
    }

    // Sub-agent spawning (DeepMind Agent Traps kat. 4, v5.0 S4).
    case 'Task':
    case 'Agent': {
      const summary = toolInput?.description || toolInput?.prompt || '';
      return { classes: ['delegation'], detail: summary.slice(0, 80) };
    }

    // Every read counts as data access; sensitive paths are only flagged in
    // the detail string. The tail of the path is kept (last 60 chars).
    case 'Read': {
      const filePath = toolInput?.file_path || '';
      const marker = SENSITIVE_PATH_PATTERNS.some((re) => re.test(filePath))
        ? '[SENSITIVE] '
        : '';
      return { classes: ['data_access'], detail: `${marker}${filePath.slice(-60)}` };
    }

    case 'Grep':
    case 'Glob': {
      const needle = toolInput?.pattern || toolInput?.path || '';
      return { classes: ['data_access'], detail: needle.slice(0, 60) };
    }

    // A shell command may belong to several legs at once.
    case 'Bash':
      return classifyBashCommand(toolInput?.command || '');

    default:
      // MCP tools are treated as untrusted external input.
      if (toolName?.startsWith('mcp__')) {
        return { classes: ['input_source'], detail: toolName };
      }
      return { classes: ['neutral'], detail: '' };
  }
}
/**
 * Work out which trifecta legs a shell command touches.
 *
 * Legs are reported in a fixed order: `exfil_sink`, `data_access`,
 * `input_source`. A command that matches none of them is `neutral`.
 *
 * @param {string} command
 * @returns {{ classes: string[], detail: string }} detail is the first 80
 *   characters of the command
 */
function classifyBashCommand(command) {
  const matchesAny = (patterns) => patterns.some((re) => re.test(command));

  // Upload-capable command (highest priority).
  const isSink = matchesAny(BASH_EXFIL_PATTERNS);
  // A file-reading command only counts when it also names a sensitive path.
  const readsSensitive =
    matchesAny(BASH_DATA_CMD_PATTERNS) && matchesAny(SENSITIVE_PATH_PATTERNS);
  // curl/wget count as a download only when they were not already an upload,
  // so a curl POST is not counted twice.
  const isDownload = !isSink && matchesAny(BASH_INPUT_PATTERNS);

  const classes = [];
  if (isSink) classes.push('exfil_sink');
  if (readsSensitive) classes.push('data_access');
  if (isDownload) classes.push('input_source');
  if (classes.length === 0) classes.push('neutral');

  return { classes, detail: command.slice(0, 80) };
}
// ---------------------------------------------------------------------------
// State management
// ---------------------------------------------------------------------------
/**
 * Resolve the JSONL state file for the current session. The parent process id
 * is used as the session key, so every hook run spawned by the same parent
 * resolves to the same file.
 * @returns {string} absolute path inside STATE_DIR
 */
function getStateFilePath() {
  const fileName = STATE_PREFIX + process.ppid + '.jsonl';
  return join(STATE_DIR, fileName);
}
/**
 * Persist one record as a single JSON line at the end of the state file
 * (the file is created if it does not exist yet).
 * @param {string} stateFile
 * @param {object} entry
 */
function appendEntry(stateFile, entry) {
  const line = `${JSON.stringify(entry)}\n`;
  appendFileSync(stateFile, line, 'utf-8');
}
/**
 * Load the trailing `n` records of the state file.
 *
 * Blank lines are dropped before the tail is taken; lines that are not valid
 * JSON are dropped afterwards, so fewer than `n` records may come back. A
 * missing or unreadable file yields an empty list.
 *
 * @param {string} stateFile
 * @param {number} n
 * @returns {object[]}
 */
function readLastEntries(stateFile, n) {
  if (!existsSync(stateFile)) return [];
  try {
    return readFileSync(stateFile, 'utf-8')
      .trim()
      .split('\n')
      .filter(Boolean)
      .slice(-n)
      .flatMap((line) => {
        try {
          return [JSON.parse(line)];
        } catch {
          return []; // skip malformed
        }
      });
  } catch {
    return [];
  }
}
/**
 * Best-effort removal of stale session state files: every `*.jsonl` file in
 * STATE_DIR whose name starts with STATE_PREFIX and whose mtime is more than
 * CLEANUP_MAX_AGE_MS in the past is deleted. Failures are ignored, both for
 * the directory listing and for each individual file.
 */
function cleanupOldStateFiles() {
  const isStateFile = (name) => name.startsWith(STATE_PREFIX) && name.endsWith('.jsonl');
  try {
    const now = Date.now();
    const candidates = readdirSync(STATE_DIR).filter(isStateFile);
    for (const name of candidates) {
      const fullPath = join(STATE_DIR, name);
      try {
        const ageMs = now - statSync(fullPath).mtimeMs;
        if (ageMs > CLEANUP_MAX_AGE_MS) unlinkSync(fullPath);
      } catch {
        // ignore per-file errors
      }
    }
  } catch {
    // ignore cleanup errors entirely
  }
}
// ---------------------------------------------------------------------------
// Trifecta detection
// ---------------------------------------------------------------------------
/**
 * Collect evidence for each trifecta leg from a window of state entries.
 *
 * Entries of type `'warning'` are ignored. For every other entry, each class
 * it carries contributes the entry's detail (or, when that is empty, its tool
 * name) to the matching evidence list. The trifecta is detected when all
 * three lists are non-empty.
 *
 * @param {object[]} entries
 * @returns {{ detected: boolean, evidence: { input: string[], access: string[], exfil: string[] } }}
 */
function checkTrifecta(entries) {
  const evidence = { input: [], access: [], exfil: [] };
  const bucketFor = new Map([
    ['input_source', evidence.input],
    ['data_access', evidence.access],
    ['exfil_sink', evidence.exfil],
  ]);

  for (const entry of entries) {
    if (entry.type === 'warning') continue; // skip warning markers
    const label = entry.detail || entry.tool;
    for (const cls of entry.classes || []) {
      bucketFor.get(cls)?.push(label);
    }
  }

  const detected = [evidence.input, evidence.access, evidence.exfil].every(
    (list) => list.length > 0,
  );
  return { detected, evidence };
}
/**
 * Tell whether the window already contains a trifecta warning marker
 * (an entry of type `'warning'`), so the advisory is not repeated.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasRecentWarning(entries) {
  const isTrifectaMarker = ({ type }) => type === 'warning';
  return entries.some(isTrifectaMarker);
}
/**
 * Detect an MCP-concentrated trifecta: one MCP server that contributes an
 * input, a data-access and an exfil entry within the window. A single
 * compromised server covering all three legs is a stronger signal than a
 * distributed trifecta.
 *
 * Warning markers and entries for which extractMcpServer returns a falsy
 * value are ignored. When several servers qualify, the first one seen on the
 * input leg is reported.
 *
 * @param {object[]} entries
 * @returns {{ concentrated: boolean, server: string|null }}
 */
function checkMcpConcentration(entries) {
  const inputServers = new Set();
  const accessServers = new Set();
  const exfilServers = new Set();

  for (const entry of entries) {
    if (entry.type === 'warning') continue;
    const server = extractMcpServer(entry.tool);
    if (!server) continue;
    for (const cls of entry.classes || []) {
      switch (cls) {
        case 'input_source':
          inputServers.add(server);
          break;
        case 'data_access':
          accessServers.add(server);
          break;
        case 'exfil_sink':
          exfilServers.add(server);
          break;
        default:
          break;
      }
    }
  }

  // Sets iterate in insertion order, so the earliest input server wins.
  const shared = [...inputServers].find(
    (name) => accessServers.has(name) && exfilServers.has(name),
  );
  return shared === undefined
    ? { concentrated: false, server: null }
    : { concentrated: true, server: shared };
}
/**
 * High-confidence signal: the window holds both a data-access entry whose
 * detail starts with the `[SENSITIVE]` marker and an exfiltration sink.
 * The two do not have to be the same entry, and warning markers are ignored.
 * @param {object[]} entries
 * @returns {boolean}
 */
function checkSensitiveExfil(entries) {
  const seen = { sensitiveRead: false, sink: false };
  for (const entry of entries) {
    if (entry.type === 'warning') continue;
    const legs = entry.classes || [];
    const label = entry.detail || '';
    if (legs.includes('data_access') && label.startsWith('[SENSITIVE]')) {
      seen.sensitiveRead = true;
    }
    if (legs.includes('exfil_sink')) {
      seen.sink = true;
    }
  }
  return seen.sensitiveRead && seen.sink;
}
/**
 * Sum the `outputSize` of every entry, skipping `warning` and
 * `volume_warning` markers. Entries without an `outputSize` count as 0.
 * @param {object[]} allEntries - All entries (not just window)
 * @returns {number} Total bytes
 */
function computeCumulativeVolume(allEntries) {
  const skippedTypes = new Set(['warning', 'volume_warning']);
  return allEntries.reduce(
    (sum, entry) => (skippedTypes.has(entry.type) ? sum : sum + (entry.outputSize || 0)),
    0,
  );
}
/**
 * Tell whether a `volume_warning` marker for exactly this threshold has
 * already been recorded.
 * @param {object[]} entries
 * @param {number} thresholdBytes
 * @returns {boolean}
 */
function hasVolumeWarning(entries, thresholdBytes) {
  const isSameThresholdMarker = ({ type, threshold }) =>
    type === 'volume_warning' && threshold === thresholdBytes;
  return entries.some(isSameThresholdMarker);
}
/**
 * Render the cumulative-volume advisory.
 * @param {number} totalBytes - bytes received so far; shown rounded to whole KB (1 KB = 1024 bytes)
 * @param {string} thresholdLabel - human-readable threshold that was crossed
 * @param {string} severity
 * @returns {string}
 */
function formatVolumeWarning(totalBytes, thresholdLabel, severity) {
  const kb = Math.round(totalBytes / 1024);
  const lines = [
    `SECURITY ADVISORY (session-guard): Cumulative MCP data volume exceeded ${thresholdLabel} [${severity}].`,
    '',
    `This session has received ~${kb} KB of tool output data.`,
    'High cumulative volume may indicate bulk data harvesting or exfiltration staging (OWASP ASI02).',
    'Review whether the volume of data being processed is proportional to the task.',
  ];
  return lines.join('\n');
}
/**
 * Format the trifecta warning message.
 * Uses Rule of Two terminology (Meta, Oct 2025): A=untrusted input, B=sensitive data, C=state change/exfil.
 *
 * The window size quoted in the message is taken from WINDOW_SIZE, so the
 * text stays accurate when the policy overrides the default of 20.
 *
 * @param {{ input: string[], access: string[], exfil: string[] }} evidence
 * @param {{ concentrated: boolean, server: string|null }} [mcpInfo] - adds an
 *   MCP-concentration line when `concentrated` is true
 * @param {boolean} [isSensitiveExfil] - adds a sensitive-data line when truthy
 * @returns {string}
 */
function formatWarning(evidence, mcpInfo, isSensitiveExfil) {
  // Only the two most recent examples per leg are shown to keep the advisory short.
  const lastTwo = (items) => items.slice(-2).map((e) => ` - ${e}`).join('\n');
  const inputEx = lastTwo(evidence.input);
  const accessEx = lastTwo(evidence.access);
  const exfilEx = lastTwo(evidence.exfil);
  const mcpLine = mcpInfo?.concentrated
    ? `\nRULE OF TWO VIOLATION: MCP-CONCENTRATED — All 3 legs trace to server "${mcpInfo.server}" (elevated severity).\n`
    : '';
  const sensitiveLine = isSensitiveExfil
    ? '\nRULE OF TWO VIOLATION: SENSITIVE DATA + EXFILTRATION — Sensitive paths accessed and exfil sink present.\n'
    : '';
  return (
    'SECURITY ADVISORY (session-guard): Rule of Two violation — potential lethal trifecta detected.\n\n' +
    `Within the last ${WINDOW_SIZE} tool calls, this session holds all 3 capabilities simultaneously:\n` +
    ' [A] Untrusted external input (prompt injection surface):\n' + inputEx + '\n' +
    ' [B] Sensitive data access:\n' + accessEx + '\n' +
    ' [C] Exfiltration-capable tool (state change):\n' + exfilEx + '\n' +
    mcpLine + sensitiveLine + '\n' +
    'Rule of Two (Meta, Oct 2025): An agent should never hold A+B+C simultaneously.\n' +
    'This combination enables prompt injection -> data theft chains (OWASP ASI01, ASI02, LLM01).\n' +
    'Review recent tool calls for unexpected behavior.'
  );
}
// ---------------------------------------------------------------------------
// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4)
// ---------------------------------------------------------------------------
/**
 * Check for escalation-after-input: a delegation that follows an input_source
 * within the configured look-back. Untrusted content consumed shortly before
 * spawning a sub-agent may indicate the model is being manipulated into
 * delegating dangerous work.
 *
 * E17 (v7.2.0): returns a `tier` indicating which window matched.
 * - `'primary'` — nearest input within DELEGATION_ESCALATION_WINDOW calls.
 * - `'secondary'` — nearest input within DELEGATION_ESCALATION_WINDOW_MEDIUM
 *   calls but outside the primary window (slow-burn variant).
 * - `null` (when detected=false) — no input source within either window, or
 *   the current call is not a delegation.
 *
 * The look-back covers the larger of the two windows. Without that, a primary
 * window configured above the secondary one would be silently capped at the
 * secondary size.
 *
 * Marker entries (anything with a `type`) do not count towards the distance.
 * The last tool entry in `entries` is assumed to be the current call and is
 * excluded from the search.
 *
 * @param {object[]} entries — recent window, ending with the entry just appended
 * @param {{ classes: string[] }} currentEntry — the entry just appended
 * @returns {{ detected: boolean, inputDetail: string, tier: 'primary'|'secondary'|null }}
 */
function checkEscalationAfterInput(entries, currentEntry) {
  if (!currentEntry.classes.includes('delegation')) {
    return { detected: false, inputDetail: '', tier: null };
  }
  const toolEntries = entries.filter((e) => !e.type);
  const limit = Math.max(DELEGATION_ESCALATION_WINDOW, DELEGATION_ESCALATION_WINDOW_MEDIUM);
  const preceding = toolEntries.slice(-(limit + 1), -1); // exclude current

  // Walk newest-to-oldest so the input closest to the delegation decides the tier.
  for (let i = preceding.length - 1; i >= 0; i--) {
    const entry = preceding[i];
    if (!(entry.classes || []).includes('input_source')) continue;
    // distance=1 means the input is directly before the delegation.
    const distance = preceding.length - i;
    return {
      detected: true,
      inputDetail: entry.detail || entry.tool || 'unknown',
      tier: distance <= DELEGATION_ESCALATION_WINDOW ? 'primary' : 'secondary',
    };
  }
  return { detected: false, inputDetail: '', tier: null };
}
/**
 * Tell whether an `escalation_warning` marker is already present, which
 * suppresses a repeat of the escalation-after-input advisory.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasEscalationWarning(entries) {
  const isEscalationMarker = ({ type }) => type === 'escalation_warning';
  return entries.some(isEscalationMarker);
}
/**
 * Format the escalation-after-input warning.
 *
 * Both variants quote the configured primary window
 * (DELEGATION_ESCALATION_WINDOW) everywhere the window size is mentioned, so
 * the text stays correct when the window is overridden. The trailing
 * "(default 5)" refers to the documented default, not the active value.
 *
 * @param {string} delegationDetail — what the delegation was for
 * @param {string} inputDetail — what input source preceded it
 * @param {'primary'|'secondary'} tier — which window matched (E17, v7.2.0);
 *   any value other than 'secondary' produces the primary message
 * @returns {string}
 */
function formatEscalationWarning(delegationDetail, inputDetail, tier = 'primary') {
  if (tier === 'secondary') {
    return (
      'SECURITY ADVISORY (session-guard): Slow-burn escalation-after-input detected [MEDIUM] — ' +
      'sub-agent delegation in the slow-burn window after untrusted input.\n\n' +
      `A Task/Agent delegation occurred within ${DELEGATION_ESCALATION_WINDOW_MEDIUM} calls (` +
      `but outside the ${DELEGATION_ESCALATION_WINDOW}-call primary window) of untrusted input:\n` +
      ` Input source: ${inputDetail}\n` +
      ` Delegation: ${delegationDetail}\n\n` +
      'This is a slower variant of the escalation-after-input pattern. The wider window\n' +
      'catches attackers who deliberately wait past the primary window before delegating,\n' +
      `and surfaces patterns that the primary ${DELEGATION_ESCALATION_WINDOW}-call window cannot. Review whether this\n` +
      'delegation is expected and appropriately scoped.\n' +
      'Configure window via LLM_SECURITY_ESCALATION_WINDOW env var (default 5).'
    );
  }
  return (
    'SECURITY ADVISORY (session-guard): Escalation-after-input detected [MEDIUM] — ' +
    'sub-agent delegation shortly after untrusted input.\n\n' +
    `A Task/Agent delegation occurred within ${DELEGATION_ESCALATION_WINDOW} calls of untrusted input:\n` +
    ` Input source: ${inputDetail}\n` +
    ` Delegation: ${delegationDetail}\n\n` +
    'Untrusted content (web pages, MCP tool output) may be influencing the model\n' +
    'to spawn sub-agents with capabilities beyond the original task scope.\n' +
    'This is a known attack vector (DeepMind AI Agent Traps, Category 4).\n' +
    'Review whether this delegation is expected and appropriately scoped.\n' +
    'Configure window via LLM_SECURITY_ESCALATION_WINDOW env var (default 5).'
  );
}
// ---------------------------------------------------------------------------
// Long-horizon monitoring (100-call window) — OpenAI Atlas, Dec 2025
// ---------------------------------------------------------------------------
/**
 * Keep only real tool-call records. Marker records (warnings and similar)
 * carry a truthy `type` field and are dropped.
 * @param {object[]} entries
 * @returns {object[]}
 */
function filterToolEntries(entries) {
  const isToolCall = ({ type }) => !type;
  return entries.filter(isToolCall);
}
/**
 * Slow-burn trifecta: all three legs occur in the window, and the distance
 * between the earliest and the latest leg occurrence (counted in tool calls,
 * markers excluded) is strictly greater than SLOW_BURN_MIN_SPREAD.
 * Catches multi-step chains that pace their actions to slip past the
 * short-window check.
 *
 * @param {object[]} entries - Long-horizon window entries
 * @returns {{ detected: boolean, spread: number }} spread is 0 when a leg is missing
 */
function checkSlowBurnTrifecta(entries) {
  const calls = filterToolEntries(entries);
  // Per leg: [index of first occurrence, index of last occurrence]; -1 = unseen.
  const spans = new Map([
    ['input_source', [-1, -1]],
    ['data_access', [-1, -1]],
    ['exfil_sink', [-1, -1]],
  ]);

  calls.forEach((call, idx) => {
    for (const cls of call.classes || []) {
      const span = spans.get(cls);
      if (!span) continue;
      if (span[0] === -1) span[0] = idx;
      span[1] = idx;
    }
  });

  const ranges = [...spans.values()];
  if (ranges.some(([first]) => first === -1)) {
    return { detected: false, spread: 0 };
  }
  const earliest = Math.min(...ranges.map(([first]) => first));
  const latest = Math.max(...ranges.map(([, last]) => last));
  const spread = latest - earliest;
  return { detected: spread > SLOW_BURN_MIN_SPREAD, spread };
}
/**
 * Tell whether a `slow_burn_warning` marker has already been recorded.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasSlowBurnWarning(entries) {
  const isSlowBurnMarker = ({ type }) => type === 'slow_burn_warning';
  return entries.some(isSlowBurnMarker);
}
/**
 * Behavioral drift check: compare the tool-name distribution of the first
 * DRIFT_SAMPLE_SIZE tool calls with that of the last DRIFT_SAMPLE_SIZE tool
 * calls, and flag drift when their Jensen-Shannon divergence exceeds
 * DRIFT_THRESHOLD. With fewer than two full samples nothing is compared.
 *
 * @param {object[]} entries
 * @returns {{ drifted: boolean, jsd: number, firstTools: string[], lastTools: string[] }}
 */
function checkBehavioralDrift(entries) {
  const calls = filterToolEntries(entries);
  if (calls.length < 2 * DRIFT_SAMPLE_SIZE) {
    return { drifted: false, jsd: 0, firstTools: [], lastTools: [] };
  }

  const toolOf = (call) => call.tool;
  const firstTools = calls.slice(0, DRIFT_SAMPLE_SIZE).map(toolOf);
  const lastTools = calls.slice(-DRIFT_SAMPLE_SIZE).map(toolOf);

  const jsd = jensenShannonDivergence(
    buildDistribution(firstTools),
    buildDistribution(lastTools),
  );
  return { drifted: jsd > DRIFT_THRESHOLD, jsd, firstTools, lastTools };
}
/**
 * Tell whether a `drift_warning` marker has already been recorded.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasDriftWarning(entries) {
  const isDriftMarker = ({ type }) => type === 'drift_warning';
  return entries.some(isDriftMarker);
}
/**
 * Summarise the `n` most frequent items as "name(count), name(count), …".
 * Items with equal counts keep the order in which they first appeared.
 * @param {string[]} items
 * @param {number} n
 * @returns {string}
 */
function topN(items, n) {
  const tally = items.reduce(
    (acc, item) => acc.set(item, (acc.get(item) || 0) + 1),
    new Map(),
  );
  const ranked = Array.from(tally).sort(([, countA], [, countB]) => countB - countA);
  return ranked
    .slice(0, n)
    .map(([name, count]) => `${name}(${count})`)
    .join(', ');
}
/**
 * Format the slow-burn trifecta warning message.
 *
 * The window size quoted in the text comes from LONG_HORIZON_WINDOW so the
 * message stays accurate when the policy overrides the default of 100.
 * NOTE(review): assumes the caller evaluates slow-burn over
 * LONG_HORIZON_WINDOW entries — confirm at the call site.
 *
 * @param {number} spread - distance in tool calls between the earliest and
 *   latest trifecta leg
 * @returns {string}
 */
function formatSlowBurnWarning(spread) {
  return (
    'SECURITY ADVISORY (session-guard): Slow-burn trifecta detected [MEDIUM] — ' +
    `Rule of Two legs spread over ${spread} calls.\n\n` +
    `Within the last ${LONG_HORIZON_WINDOW} tool calls, all 3 capabilities appeared but spread across a wide range:\n` +
    ' [A] Untrusted external input (prompt injection surface)\n' +
    ' [B] Sensitive data access\n' +
    ' [C] Exfiltration-capable tool (state change)\n\n' +
    'This pattern may indicate a multi-step prompt injection chain (OpenAI Atlas, Dec 2025).\n' +
    'Wide spread across calls makes detection harder with short-window monitoring.'
  );
}
/**
 * Render the behavioral-drift advisory: the measured divergence (three
 * decimals) next to the threshold, followed by the three most used tools in
 * the first and in the last sample.
 * @param {number} jsd
 * @param {string[]} firstTools
 * @param {string[]} lastTools
 * @returns {string}
 */
function formatDriftWarning(jsd, firstTools, lastTools) {
  const earlyTop = topN(firstTools, 3);
  const lateTop = topN(lastTools, 3);
  const lines = [
    'SECURITY ADVISORY (session-guard): Behavioral drift detected [MEDIUM] — tool usage shift.',
    '',
    `Jensen-Shannon divergence: ${jsd.toFixed(3)} (threshold: ${DRIFT_THRESHOLD})`,
    `First ${DRIFT_SAMPLE_SIZE} calls: ${earlyTop}`,
    `Last ${DRIFT_SAMPLE_SIZE} calls: ${lateTop}`,
    '',
    'A significant shift in tool usage patterns may indicate session hijacking or prompt injection',
    "changing the agent's behavior over time (OpenAI Atlas, Dec 2025).",
  ];
  return lines.join('\n');
}
// ---------------------------------------------------------------------------
// Output fingerprint matching (inspired by CaMeL, DeepMind 2025; v5.0 S6)
//
// NOTE: This is opportunistic byte-matching of output prefixes, not semantic
// data-flow tracking. Each entry records a fingerprint of the first 200 chars
// of tool output (SHA-256, truncated to 16 hex chars) plus a short trimmed
// output snippet; a link is reported when a stored snippet appears verbatim
// in the current tool input. Trivially bypassed by:
// - Mutating any of the first 200 bytes
// - Summarising the output before passing it on
// - Re-encoding (base64, JSON-escape, whitespace changes)
// Inspired by CaMeL but NOT a CaMeL capability-tracking implementation.
// ---------------------------------------------------------------------------
/**
 * Fingerprint a tool output: SHA-256 over its first 200 characters, reduced
 * to the leading 16 hex digits. Meant for opportunistic byte-matching only,
 * not semantic provenance.
 * @param {string} text - tool output text
 * @returns {string} 16-char hex hash
 */
function computeDataTag(text) {
  const hasher = createHash('sha256');
  hasher.update(text.slice(0, 200));
  const fullHex = hasher.digest('hex');
  return fullHex.slice(0, 16);
}
/**
 * Flatten a tool input object into one searchable string for data-flow
 * matching.
 *
 * Only top-level values are visited: strings are taken as-is, objects and
 * arrays are JSON-serialised, and everything else (numbers, booleans,
 * undefined, null) is skipped. `null` is excluded explicitly because
 * `typeof null === 'object'` would otherwise add the literal text "null".
 *
 * @param {object} toolInput
 * @returns {string} the collected parts joined by single spaces; '' when the
 *   input is missing or not an object
 */
function extractInputText(toolInput) {
  if (!toolInput || typeof toolInput !== 'object') return '';
  const parts = [];
  for (const val of Object.values(toolInput)) {
    if (typeof val === 'string') {
      parts.push(val);
    } else if (val !== null && typeof val === 'object') {
      parts.push(JSON.stringify(val));
    }
  }
  return parts.join(' ');
}
/**
 * Look for earlier tool outputs that reappear in the current tool input.
 *
 * An earlier entry is a source when it is a tool-call record (no `type`),
 * carries a `dataTag`, and its stored `outputSnippet` occurs verbatim inside
 * the current input text. The `dataTag` only acts as a presence gate here;
 * the hash value itself is not compared. Inputs shorter than 20 characters
 * are never matched.
 *
 * @param {object[]} entries - recent state entries
 * @param {string} currentInputText - stringified current tool input
 * @returns {{ linked: boolean, sourceEntries: object[] }}
 */
function checkDataFlowLink(entries, currentInputText) {
  const tooShort = !currentInputText || currentInputText.length < 20;
  if (tooShort) {
    return { linked: false, sourceEntries: [] };
  }

  const sourceEntries = entries.filter(
    (entry) =>
      !entry.type &&
      entry.dataTag &&
      entry.outputSnippet &&
      currentInputText.includes(entry.outputSnippet),
  );
  return { linked: sourceEntries.length > 0, sourceEntries };
}
/**
 * Tell whether a `data_flow_warning` marker has already been recorded.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasDataFlowWarning(entries) {
  const isDataFlowMarker = ({ type }) => type === 'data_flow_warning';
  return entries.some(isDataFlowMarker);
}
/**
 * Format the data flow linked trifecta warning.
 *
 * At most three source entries are listed, each as "<tool>: <detail>". The
 * ": " separator keeps the tool name and its detail from running together
 * (previously rendered as e.g. "Bashunknown").
 *
 * @param {{ input: string[], access: string[], exfil: string[] }} evidence -
 *   accepted for signature compatibility; not used in the message
 * @param {object[]} sourceEntries - entries whose output reappeared in a later input
 * @returns {string}
 */
function formatDataFlowWarning(evidence, sourceEntries) {
  const sources = sourceEntries
    .slice(0, 3)
    .map((e) => ` - ${e.tool}: ${e.detail || 'unknown'}`)
    .join('\n');
  return (
    'SECURITY ADVISORY (session-guard): Data flow linked trifecta [HIGH] — ' +
    'CaMeL-style provenance tracking detected data flow chain.\n\n' +
    'Tool output from an untrusted source appears to flow into subsequent tool inputs,\n' +
    'creating a traceable data flow chain across the trifecta:\n' +
    ` Data flow sources:\n${sources}\n\n` +
    'This elevates the trifecta severity: data is not just co-located in the session,\n' +
    'but actively flowing between tools in a potential injection chain (DeepMind CaMeL).'
  );
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
let input;
try {
const raw = readFileSync(0, 'utf-8');
input = JSON.parse(raw);
} catch {
process.exit(0);
}
const toolName = input?.tool_name ?? '';
const toolInput = input?.tool_input ?? {};
const toolOutput = input?.tool_output ?? '';
if (!toolName) {
process.exit(0);
}
// Off mode: skip all detection
if (TRIFECTA_MODE === 'off') {
process.exit(0);
}
// Measure the tool output in UTF-8 bytes for volume tracking.
// Non-string output is serialized to JSON before measuring.
const outputText = typeof toolOutput !== 'string'
  ? JSON.stringify(toolOutput)
  : toolOutput;
const outputSize = Buffer.byteLength(outputText, 'utf-8');

// Classify the current tool call.
const { classes, detail } = classifyToolCall(toolName, toolInput);

// Resolve the state file; when it does not exist yet this is treated as the
// first call of the session and old state files are cleaned up.
const stateFile = getStateFilePath();
const isFirstCall = !existsSync(stateFile);
if (isFirstCall) {
  cleanupOldStateFiles();
}

// Data tag for CaMeL-style flow tracking (v5.0 S6); outputs shorter than
// 20 characters are not tagged.
const dataTag = outputText.length < 20 ? null : computeDataTag(outputText);

// Snippet used for data flow matching: the first 50 characters of the trimmed
// output, kept only when the untrimmed output has at least 50 characters.
const outputSnippet = outputText.length < 50
  ? null
  : outputText.trim().slice(0, 50);

// Build the state entry. dataTag / outputSnippet are attached only when they
// are truthy, after the fixed fields.
const entry = {
  ts: Date.now(),
  tool: toolName,
  classes,
  detail,
  outputSize,
};
if (dataTag) {
  entry.dataTag = dataTag;
}
if (outputSnippet) {
  entry.outputSnippet = outputSnippet;
}
appendEntry(stateFile, entry);
// Advisory messages produced by the detectors below.
const messages = [];

// --- Trifecta detection ---
// Skipped when the call's only class is 'neutral' or its only class is
// 'delegation'.
if (classes.length !== 1 || (classes[0] !== 'neutral' && classes[0] !== 'delegation')) {
  const recentEntries = readLastEntries(stateFile, WINDOW_SIZE);
  const trifecta = checkTrifecta(recentEntries);

  // Warn only when no earlier warning is reported for this window.
  if (trifecta.detected && !hasRecentWarning(recentEntries)) {
    const { evidence } = trifecta;
    const mcpInfo = checkMcpConcentration(recentEntries);
    const sensitiveExfil = checkSensitiveExfil(recentEntries);
    const blockMode = TRIFECTA_MODE === 'block';

    messages.push(formatWarning(evidence, mcpInfo, sensitiveExfil));
    appendEntry(stateFile, { type: 'warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'trifecta_warning',
      // MCP-concentrated or sensitive-exfil trifectas are critical; others high.
      severity: mcpInfo.concentrated || sensitiveExfil ? 'critical' : 'high',
      source: 'post-session-guard',
      details: { evidence, mcp_concentrated: mcpInfo.concentrated, sensitive_exfil: sensitiveExfil },
      owasp: ['ASI01', 'ASI02', 'LLM01'],
      action_taken: blockMode ? 'blocked' : 'warned',
    });

    // --- Rule of Two: block mode ---
    // v7.1.0 B2 fix: block mode blocks on ANY detected trifecta, not just the
    // MCP-concentrated or sensitive-path ones. Distributed trifectas
    // (different sources, non-sensitive path, non-sensitive sink) used to be
    // warned only, which did not match the documented block-mode semantics.
    // The critical-vs-high severity distinction above is unchanged; the
    // context line below tells the operator which variant was blocked.
    if (blockMode) {
      const context = mcpInfo.concentrated
        ? ` MCP-concentrated: all 3 legs via server "${mcpInfo.server}"\n`
        : sensitiveExfil
          ? ' Sensitive data access combined with exfiltration sink\n'
          : ' Distributed trifecta: three legs from different sources\n';

      const blockNotice = [
        'BLOCKED: Rule of Two violation — lethal trifecta detected.\n',
        context,
        ' Set LLM_SECURITY_TRIFECTA_MODE=warn to downgrade to advisory.\n',
      ].join('');

      process.stderr.write(blockNotice);
      process.stdout.write(JSON.stringify({ decision: 'block' }));
      process.exit(2);
    }
  }
}
// --- Escalation-after-input detection (E17 v7.2.0: primary + secondary window) ---
// Runs only for calls classified as 'delegation'.
//   Primary window:   DELEGATION_ESCALATION_WINDOW (default 5, env-configurable).
//   Secondary window: DELEGATION_ESCALATION_WINDOW_MEDIUM (20), the slow-burn
//                     variant, reported as a MEDIUM advisory with its own message.
// The lookback is sized to cover the secondary window plus a margin of 5 entries.
if (classes.includes('delegation')) {
  const lookback = Math.max(WINDOW_SIZE, DELEGATION_ESCALATION_WINDOW_MEDIUM + 5);
  const history = readLastEntries(stateFile, lookback);
  const { detected, inputDetail, tier } = checkEscalationAfterInput(history, entry);

  // A previously recorded escalation warning in the lookback suppresses a repeat.
  if (detected && !hasEscalationWarning(history)) {
    messages.push(formatEscalationWarning(detail, inputDetail, tier));
    appendEntry(stateFile, { type: 'escalation_warning', ts: Date.now(), tier });
    writeAuditEvent({
      event_type: 'escalation_after_input',
      severity: 'medium',
      source: 'post-session-guard',
      details: { tool: detail, input_source: inputDetail, tier },
      owasp: ['ASI01'],
      action_taken: 'warned',
    });
  }
}
// --- CaMeL data flow check (v5.0 S6) ---
// Looks for data from an earlier tool output reappearing in the current tool
// input. An advisory is raised only when such a link is found AND a trifecta
// is present in the same window. Neutral-only calls are skipped, as are
// inputs whose extracted text is shorter than 20 characters.
if (classes.length !== 1 || classes[0] !== 'neutral') {
  const inputText = extractInputText(toolInput);

  if (inputText.length >= 20) {
    const recentEntries = readLastEntries(stateFile, WINDOW_SIZE);
    const { linked, sourceEntries } = checkDataFlowLink(recentEntries, inputText);

    // Skip when a data-flow warning is already reported for this window.
    if (linked && !hasDataFlowWarning(recentEntries)) {
      const trifecta = checkTrifecta(recentEntries);

      if (trifecta.detected) {
        const { evidence } = trifecta;
        messages.push(formatDataFlowWarning(evidence, sourceEntries));
        appendEntry(stateFile, { type: 'data_flow_warning', ts: Date.now() });
        writeAuditEvent({
          event_type: 'data_flow_trifecta',
          severity: 'high',
          source: 'post-session-guard',
          details: { evidence, flow_sources: sourceEntries.length },
          owasp: ['ASI01', 'ASI02'],
          action_taken: 'warned',
        });
      }
    }
  }
}
// --- Cumulative volume tracking ---
// Only evaluated when the current call produced output.
if (outputSize > 0) {
  // Reads up to the last 10,000 entries (intended as "the whole session").
  const sessionEntries = readLastEntries(stateFile, 10_000);
  const totalVolume = computeCumulativeVolume(sessionEntries);

  // Walk VOLUME_THRESHOLDS in its declared order and pick the first threshold
  // that has been reached and not yet warned about; at most one volume
  // advisory is emitted per call.
  // NOTE(review): presumably VOLUME_THRESHOLDS is ordered highest-first so the
  // highest unwarned threshold wins — confirm against its definition.
  const crossed = VOLUME_THRESHOLDS.find(
    ({ bytes }) => totalVolume >= bytes && !hasVolumeWarning(sessionEntries, bytes),
  );

  if (crossed) {
    const { bytes, label, severity } = crossed;
    messages.push(formatVolumeWarning(totalVolume, label, severity));
    appendEntry(stateFile, { type: 'volume_warning', ts: Date.now(), threshold: bytes });
    writeAuditEvent({
      event_type: 'volume_threshold',
      severity: severity.toLowerCase(),
      source: 'post-session-guard',
      details: { total_bytes: totalVolume, threshold: label },
      owasp: ['ASI02'],
      action_taken: 'warned',
    });
  }
}
// --- Long-horizon monitoring (100-call window) ---
// Both checks below share one snapshot of the last LONG_HORIZON_WINDOW entries.
{
  const horizon = readLastEntries(stateFile, LONG_HORIZON_WINDOW);

  // Slow-burn trifecta: all 3 legs spread over >50 calls.
  const { detected: slowBurnDetected, spread } = checkSlowBurnTrifecta(horizon);
  if (slowBurnDetected && !hasSlowBurnWarning(horizon)) {
    messages.push(formatSlowBurnWarning(spread));
    appendEntry(stateFile, { type: 'slow_burn_warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'slow_burn_trifecta',
      severity: 'medium',
      source: 'post-session-guard',
      details: { spread },
      owasp: ['ASI06', 'ASI08'],
      action_taken: 'warned',
    });
  }

  // Behavioral drift: JSD on tool distribution (first vs last DRIFT_SAMPLE_SIZE).
  const { drifted, jsd, firstTools, lastTools } = checkBehavioralDrift(horizon);
  if (drifted && !hasDriftWarning(horizon)) {
    messages.push(formatDriftWarning(jsd, firstTools, lastTools));
    appendEntry(stateFile, { type: 'drift_warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'behavioral_drift',
      severity: 'medium',
      source: 'post-session-guard',
      details: { jsd, first_tools: firstTools, last_tools: lastTools },
      owasp: ['ASI06', 'ASI08'],
      action_taken: 'warned',
    });
  }
}
// Emit combined advisory
// All collected advisories are joined with a '---' separator and written to
// stdout as a single JSON object of the form { systemMessage: <text> }.
//
// The exit is deferred to the write callback: process.exit() does not wait for
// pending stdout I/O, and stdout writes to a pipe are asynchronous on Windows,
// so exiting immediately after write() could truncate the advisory there.
// The callback also fires (with an error argument) if the write fails, so the
// process still exits with code 0 in that case.
if (messages.length > 0) {
  const combined = messages.join('\n\n---\n\n');
  process.stdout.write(JSON.stringify({ systemMessage: combined }), () => {
    process.exit(0);
  });
} else {
  // Default: advisory only (warn mode) — nothing to report.
  process.exit(0);
}