Previously, `LLM_SECURITY_TRIFECTA_MODE=block` only exited 2 when the
detected trifecta was MCP-concentrated (all three legs via the same MCP
server) or involved sensitive-path + exfil. Distributed trifectas —
three legs originating from different tools, with a non-sensitive data
path and a non-sensitive exfiltration sink — were detected and warned
but not blocked. This mismatched the documented semantics of block mode
and gave operators a false sense of enforcement.
Change: remove the `(mcpInfo.concentrated || sensitiveExfil)` AND-gate
in the `TRIFECTA_MODE === 'block'` branch so any detected trifecta
blocks in block mode. Audit event `severity` still differentiates
critical (concentrated / sensitive-exfil) from high (distributed); the
blocked stderr message now explicitly names "Distributed trifecta:
three legs from different sources" when the confidence sub-signals
are absent.
Addresses critical review 2026-04-20 §2 B2 (HIGH) and §9 row 1
("enforces the Rule of Two").
Tests: 1 added (distributed trifecta in block mode now exits 2).
All 1495 tests pass.
953 lines
34 KiB
JavaScript
953 lines
34 KiB
JavaScript
#!/usr/bin/env node
|
|
// Hook: post-session-guard.mjs
|
|
// Event: PostToolUse (ALL tools)
|
|
// Purpose: Runtime lethal trifecta detection — monitors tool call sequences
|
|
// and warns when untrusted input + sensitive data access + exfiltration
|
|
// sink all appear within a sliding window.
|
|
//
|
|
// Protocol:
|
|
// - Read JSON from stdin: { tool_name, tool_input, tool_output }
|
|
// - Advisory by default: exit 0 and emit a systemMessage via stdout to warn. In block mode a detected trifecta exits 2.
|
|
// - State persisted in ${os.tmpdir()}/llm-security-session-${ppid}.jsonl
|
|
//
|
|
// Rule of Two (Meta, Oct 2025):
|
|
// Of 3 capabilities A (untrusted input), B (sensitive data), C (state change/exfil),
|
|
// an agent should NEVER hold all 3 simultaneously. Env var LLM_SECURITY_TRIFECTA_MODE
|
|
// controls enforcement: warn (default), block (exit 2 for any detected trifecta), off.
|
|
//
|
|
// Long-horizon monitoring (OpenAI Atlas, Dec 2025):
|
|
// 100-call window alongside 20-call for slow-burn trifecta detection and
|
|
// behavioral drift via Jensen-Shannon divergence on tool distributions.
|
|
//
|
|
// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4):
|
|
// Task/Agent tools classified as 'delegation'. Escalation-after-input advisory
|
|
// when delegation occurs within 5 calls of an input_source (untrusted content
|
|
// may be influencing sub-agent spawning decisions).
|
|
//
|
|
// CaMeL-inspired data flow tagging (DeepMind CaMeL, v5.0 S6):
|
|
// Lightweight data provenance tracking. On tool output: hash first 200 chars as
|
|
// data tag. On next tool input: check substring match against prior tags. Match =
|
|
// "data flow link". Trifecta with linked flows = elevated severity.
|
|
//
|
|
// Trifecta concept (Willison / Invariant Labs):
|
|
// 1. Agent exposed to UNTRUSTED INPUT (prompt injection surface)
|
|
// 2. Agent has access to SENSITIVE DATA via tools
|
|
// 3. An EXFILTRATION SINK exists (HTTP POST, scp, etc.)
|
|
//
|
|
// OWASP: ASI01 (Excessive Agency), ASI02 (Data Leakage), LLM01 (Prompt Injection)
|
|
|
|
import { readFileSync, appendFileSync, existsSync, readdirSync, statSync, unlinkSync } from 'node:fs';
|
|
import { join } from 'node:path';
|
|
import { tmpdir } from 'node:os';
|
|
import { createHash } from 'node:crypto';
|
|
import { extractMcpServer } from '../../scanners/lib/mcp-description-cache.mjs';
|
|
import { jensenShannonDivergence, buildDistribution } from '../../scanners/lib/distribution-stats.mjs';
|
|
import { writeAuditEvent } from '../../scanners/lib/audit-trail.mjs';
|
|
import { getPolicyValue } from '../../scanners/lib/policy-loader.mjs';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Constants
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Short detection window, in tool calls (policy key: trifecta.window_size).
const WINDOW_SIZE = getPolicyValue('trifecta', 'window_size', 20);
// Per-session state lives in `${STATE_DIR}/${STATE_PREFIX}<ppid>.jsonl`.
const STATE_PREFIX = 'llm-security-session-';
const STATE_DIR = tmpdir();
const CLEANUP_MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours

// Long-horizon monitoring (OpenAI Atlas, Dec 2025)
const LONG_HORIZON_WINDOW = getPolicyValue('trifecta', 'long_horizon_window', 100);
const SLOW_BURN_MIN_SPREAD = 50; // a slow-burn trifecta needs a spread strictly above this
const DRIFT_THRESHOLD = 0.25; // Jensen-Shannon divergence above this counts as drift
const DRIFT_SAMPLE_SIZE = 20; // calls compared at each end of the history

// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4)
const DELEGATION_ESCALATION_WINDOW = 5; // calls after input_source

// Rule of Two enforcement mode: block | warn | off (env var takes precedence over policy).
// The value is coerced to a string and trimmed before comparison so that a
// padded env value ("block ") still selects block mode instead of silently
// behaving like warn, and a non-string policy value cannot crash the hook at
// load time. A blank env var falls back to the policy value.
const policyTrifectaMode = getPolicyValue('trifecta', 'mode', 'warn');
const TRIFECTA_MODE = String(
  (process.env.LLM_SECURITY_TRIFECTA_MODE ?? '').trim() || policyTrifectaMode
)
  .trim()
  .toLowerCase();

// Volume tracking thresholds (cumulative bytes per session), listed from
// largest to smallest.
const VOLUME_THRESHOLDS = [
  { bytes: 1_000_000, label: '1 MB', severity: 'HIGH' },
  { bytes: 500_000, label: '500 KB', severity: 'MEDIUM' },
  { bytes: 100_000, label: '100 KB', severity: 'LOW' },
];
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Sensitive path patterns (for data_access classification of Read/Bash)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Case-insensitive path fragments that mark a file or command as touching
// credentials or other secrets.
const SENSITIVE_PATH_PATTERNS = [
  // Environment files and credential directories
  /\.env(?:\.|$)/i,
  /\.ssh\//i,
  /\.aws\//i,
  /\.gnupg\//i,

  // Generic secret-looking names
  /credentials/i,
  /secrets?[./]/i,
  /tokens?[./]/i,
  /password/i,
  /keychain/i,

  // Package-registry credentials
  /\.npmrc/i,
  /\.pypirc/i,

  // SSH key material
  /id_rsa/i,
  /id_ed25519/i,
  /authorized_keys/i,

  // Network and database login files
  /\.netrc/i,
  /\.pgpass/i,
];
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Bash command patterns
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Commands that can send data off the machine (exfil_sink leg).
const BASH_EXFIL_PATTERNS = [
  // curl with an explicit write method or an upload-style long option.
  /\bcurl\b[^|]*(?:-X\s*(?:POST|PUT|PATCH)\b|--data\b|--data-\w+\b|--form\b)/i,
  // curl short options are case-sensitive: -d (data) and -F (form) upload,
  // whereas -D (dump headers) and -f (fail on HTTP errors) do not. This
  // pattern therefore must not carry the `i` flag; otherwise a plain
  // download such as `curl -f URL` is tagged as exfil and is never counted
  // as an input source.
  /\b[cC][uU][rR][lL]\b[^|]*(?:-d\s|-F\s)/,
  /\bwget\b[^|]*--post/i,
  /\bnc\s+(?:-[a-zA-Z]*\s+)*\S+\s+\d/i, // nc host port
  /\bsendmail\b/i,
  /\bscp\s/i,
  /\brsync\b[^|]*[^/]\S+:/i, // rsync to remote (user@host:)
  /\bgit\s+push\b/i,
  /\bsftp\b/i,
];

// Commands that pull external content into the session (input_source leg).
const BASH_INPUT_PATTERNS = [
  /\bcurl\b/i, // curl without POST indicators = downloading
  /\bwget\b/i, // wget without --post = downloading
];

// Commands that print file contents.
const BASH_DATA_CMD_PATTERNS = [
  /\b(?:cat|head|tail|less|more|bat)\s/i,
];
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Classification
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Classify a tool call into trifecta leg(s).
 *
 * The hook input is parsed from JSON on stdin, so any field may hold a
 * non-string value. Such values are ignored instead of being passed to string
 * methods, which would throw and abort the hook.
 *
 * @param {string} toolName
 * @param {object} toolInput
 * @returns {{ classes: string[], detail: string }}
 */
function classifyToolCall(toolName, toolInput) {
  // First candidate that is a non-empty string, or '' when there is none.
  const firstText = (...candidates) =>
    candidates.find((c) => typeof c === 'string' && c !== '') ?? '';

  // --- WebFetch / WebSearch: always input_source ---
  if (toolName === 'WebFetch' || toolName === 'WebSearch') {
    const target = firstText(toolInput?.url, toolInput?.query);
    return { classes: ['input_source'], detail: target.slice(0, 80) };
  }

  // --- MCP tools: untrusted external input ---
  if (typeof toolName === 'string' && toolName.startsWith('mcp__')) {
    return { classes: ['input_source'], detail: toolName };
  }

  // --- Task / Agent: delegation (DeepMind Agent Traps kat. 4, v5.0 S4) ---
  if (toolName === 'Task' || toolName === 'Agent') {
    const desc = firstText(toolInput?.description, toolInput?.prompt);
    return { classes: ['delegation'], detail: desc.slice(0, 80) };
  }

  // --- Read: data_access (sensitive path = stronger signal, but all reads count) ---
  if (toolName === 'Read') {
    const filePath = firstText(toolInput?.file_path);
    const isSensitive = SENSITIVE_PATH_PATTERNS.some((p) => p.test(filePath));
    return {
      classes: ['data_access'],
      // Only the tail of the path is kept, so the marker records sensitivity.
      detail: `${isSensitive ? '[SENSITIVE] ' : ''}${filePath.slice(-60)}`,
    };
  }

  // --- Grep / Glob: data_access ---
  if (toolName === 'Grep' || toolName === 'Glob') {
    const target = firstText(toolInput?.pattern, toolInput?.path);
    return { classes: ['data_access'], detail: target.slice(0, 60) };
  }

  // --- Bash: can be multiple classes depending on command ---
  if (toolName === 'Bash') {
    return classifyBashCommand(firstText(toolInput?.command));
  }

  // --- Everything else: neutral ---
  return { classes: ['neutral'], detail: '' };
}
|
|
|
|
/**
 * Classify a Bash command. Can return multiple classes.
 *
 * A command is tagged `data_access` only when it both reads files and mentions
 * a sensitive path. In that case the detail carries the same `[SENSITIVE] `
 * prefix that Read entries use, because `checkSensitiveExfil` recognises
 * sensitive access solely by that prefix. Without it, a shell read of a
 * credential file followed by an exfil command was never rated sensitive-exfil.
 *
 * @param {string} command
 * @returns {{ classes: string[], detail: string }}
 */
function classifyBashCommand(command) {
  // Tolerate a non-string command (hook input is untrusted JSON).
  const text = typeof command === 'string' ? command : '';
  const classes = [];

  // Check exfil first (highest priority)
  if (BASH_EXFIL_PATTERNS.some((p) => p.test(text))) {
    classes.push('exfil_sink');
  }

  // Check data access: command reads files AND path looks sensitive
  const readsSensitive =
    BASH_DATA_CMD_PATTERNS.some((p) => p.test(text)) &&
    SENSITIVE_PATH_PATTERNS.some((p) => p.test(text));
  if (readsSensitive) {
    classes.push('data_access');
  }

  // Check input source: curl/wget without POST = downloading content.
  // Only add if not already classified as exfil (avoid double-counting curl POST)
  if (!classes.includes('exfil_sink') && BASH_INPUT_PATTERNS.some((p) => p.test(text))) {
    classes.push('input_source');
  }

  if (classes.length === 0) {
    classes.push('neutral');
  }

  const detail = `${readsSensitive ? '[SENSITIVE] ' : ''}${text.slice(0, 80)}`;
  return { classes, detail };
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// State management
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Build the path of this session's state file.
 * The file name embeds the parent process id (`process.ppid`).
 * @returns {string}
 */
function getStateFilePath() {
  const fileName = `${STATE_PREFIX}${process.ppid}.jsonl`;
  return join(STATE_DIR, fileName);
}
|
|
|
|
/**
 * Append one record to the state file as a single JSON line.
 * @param {string} stateFile - path of the JSONL state file
 * @param {object} entry - record to serialise
 */
function appendEntry(stateFile, entry) {
  const line = `${JSON.stringify(entry)}\n`;
  appendFileSync(stateFile, line, 'utf-8');
}
|
|
|
|
/**
 * Read the last N entries from the state file.
 *
 * The last `n` non-empty lines are taken first and then parsed, so the result
 * can hold fewer than `n` entries when some of those lines are unusable.
 * A line is dropped when it is not valid JSON or when it parses to something
 * other than a plain object (`null`, a number, an array, ...); callers read
 * properties such as `entry.type` and would throw on such values.
 *
 * @param {string} stateFile
 * @param {number} n - maximum number of trailing lines to consider; values
 *   of 0 or below yield an empty result (`slice(-0)` would otherwise return
 *   every line)
 * @returns {object[]} parsed entries, oldest first; empty if the file is
 *   missing or unreadable
 */
function readLastEntries(stateFile, n) {
  if (n <= 0 || !existsSync(stateFile)) return [];

  let content;
  try {
    content = readFileSync(stateFile, 'utf-8');
  } catch {
    // Best effort: an unreadable state file is treated as an empty history.
    return [];
  }

  const tail = content.trim().split('\n').filter(Boolean).slice(-n);
  const entries = [];
  for (const line of tail) {
    try {
      const parsed = JSON.parse(line);
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        entries.push(parsed);
      }
    } catch { /* skip malformed */ }
  }
  return entries;
}
|
|
|
|
/**
 * Delete session state files in STATE_DIR whose modification time is more
 * than CLEANUP_MAX_AGE_MS in the past.
 * Strictly best effort: a failure to list the directory ends the sweep, and a
 * failure on a single file is skipped. Nothing is reported either way.
 */
function cleanupOldStateFiles() {
  const now = Date.now();

  let names;
  try {
    names = readdirSync(STATE_DIR);
  } catch {
    return; // ignore cleanup errors entirely
  }

  const stateFiles = names.filter(
    (name) => name.startsWith(STATE_PREFIX) && name.endsWith('.jsonl')
  );
  for (const name of stateFiles) {
    const fullPath = join(STATE_DIR, name);
    try {
      const ageMs = now - statSync(fullPath).mtimeMs;
      if (ageMs > CLEANUP_MAX_AGE_MS) unlinkSync(fullPath);
    } catch { /* ignore per-file errors */ }
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Trifecta detection
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Collect evidence for each trifecta leg and report whether all three occur.
 * Entries marked `type: 'warning'` are ignored. Each piece of evidence is the
 * entry's `detail`, or its `tool` name when the detail is empty.
 * @param {object[]} entries
 * @returns {{ detected: boolean, evidence: { input: string[], access: string[], exfil: string[] } }}
 */
function checkTrifecta(entries) {
  const legByClass = new Map([
    ['input_source', 'input'],
    ['data_access', 'access'],
    ['exfil_sink', 'exfil'],
  ]);
  const evidence = { input: [], access: [], exfil: [] };

  for (const entry of entries) {
    if (entry.type === 'warning') continue; // skip warning markers
    const label = entry.detail || entry.tool;
    for (const cls of entry.classes || []) {
      const leg = legByClass.get(cls);
      if (leg !== undefined) evidence[leg].push(label);
    }
  }

  const detected = [evidence.input, evidence.access, evidence.exfil].every(
    (items) => items.length > 0
  );
  return { detected, evidence };
}
|
|
|
|
/**
 * Tell whether the given window already contains a trifecta warning marker.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasRecentWarning(entries) {
  for (const entry of entries) {
    if (entry.type === 'warning') return true;
  }
  return false;
}
|
|
|
|
/**
 * Check whether a single MCP server shows up in all three trifecta legs.
 * Entries for which `extractMcpServer(entry.tool)` yields nothing are ignored,
 * as are warning markers. When several servers qualify, the one seen first in
 * the input leg is returned.
 *
 * NOTE(review): within this file `classifyToolCall` tags `mcp__*` tools only
 * as `input_source`, so the access/exfil sets can only fill if entries gain
 * those classes elsewhere — confirm this check can actually fire.
 *
 * @param {object[]} entries
 * @returns {{ concentrated: boolean, server: string|null }}
 */
function checkMcpConcentration(entries) {
  const inputServers = new Set();
  const accessServers = new Set();
  const exfilServers = new Set();
  const setByClass = new Map([
    ['input_source', inputServers],
    ['data_access', accessServers],
    ['exfil_sink', exfilServers],
  ]);

  // Collect MCP servers per trifecta leg
  for (const entry of entries) {
    if (entry.type === 'warning') continue;
    const server = extractMcpServer(entry.tool);
    if (!server) continue;
    for (const cls of entry.classes || []) {
      setByClass.get(cls)?.add(server);
    }
  }

  // Find a server present in all 3 legs
  const shared = [...inputServers].find(
    (server) => accessServers.has(server) && exfilServers.has(server)
  );
  return shared === undefined
    ? { concentrated: false, server: null }
    : { concentrated: true, server: shared };
}
|
|
|
|
/**
 * Report whether the window holds both a sensitive data access and an
 * exfiltration sink. An access counts as sensitive when the entry is classed
 * `data_access` and its detail starts with `[SENSITIVE]`. Warning markers are
 * ignored.
 * @param {object[]} entries
 * @returns {boolean}
 */
function checkSensitiveExfil(entries) {
  const toolCalls = entries.filter((entry) => entry.type !== 'warning');

  const hasSensitiveAccess = toolCalls.some(
    (entry) =>
      (entry.classes || []).includes('data_access') &&
      (entry.detail || '').startsWith('[SENSITIVE]')
  );
  const hasExfil = toolCalls.some((entry) => (entry.classes || []).includes('exfil_sink'));

  return hasSensitiveAccess && hasExfil;
}
|
|
|
|
/**
 * Sum the `outputSize` of every entry, skipping `warning` and
 * `volume_warning` markers. Entries without an `outputSize` add nothing.
 * @param {object[]} allEntries - All entries (not just window)
 * @returns {number} Total bytes
 */
function computeCumulativeVolume(allEntries) {
  return allEntries.reduce((total, entry) => {
    const isMarker = entry.type === 'warning' || entry.type === 'volume_warning';
    return isMarker ? total : total + (entry.outputSize || 0);
  }, 0);
}
|
|
|
|
/**
 * Tell whether a `volume_warning` marker for exactly this threshold exists.
 * @param {object[]} entries
 * @param {number} thresholdBytes
 * @returns {boolean}
 */
function hasVolumeWarning(entries, thresholdBytes) {
  const index = entries.findIndex(
    (entry) => entry.type === 'volume_warning' && entry.threshold === thresholdBytes
  );
  return index !== -1;
}
|
|
|
|
/**
 * Build the advisory text for a crossed volume threshold.
 * The byte total is reported in KB, rounded to the nearest whole number.
 * @param {number} totalBytes
 * @param {string} thresholdLabel
 * @param {string} severity
 * @returns {string}
 */
function formatVolumeWarning(totalBytes, thresholdLabel, severity) {
  const kb = Math.round(totalBytes / 1024);
  const lines = [
    `SECURITY ADVISORY (session-guard): Cumulative MCP data volume exceeded ${thresholdLabel} [${severity}].`,
    '',
    `This session has received ~${kb} KB of tool output data.`,
    'High cumulative volume may indicate bulk data harvesting or exfiltration staging (OWASP ASI02).',
    'Review whether the volume of data being processed is proportional to the task.',
  ];
  return lines.join('\n');
}
|
|
|
|
/**
 * Format the trifecta warning message.
 * Uses Rule of Two terminology (Meta, Oct 2025): A=untrusted input, B=sensitive data, C=state change/exfil.
 * At most the two most recent pieces of evidence are listed per leg.
 *
 * @param {{ input: string[], access: string[], exfil: string[] }} evidence
 * @param {{ concentrated: boolean, server: string|null }} [mcpInfo]
 * @param {boolean} [isSensitiveExfil]
 * @param {number} [windowSize=WINDOW_SIZE] - window length quoted in the
 *   message. Defaults to the configured window, so the text stays accurate
 *   when the policy overrides the default of 20 calls.
 * @returns {string}
 */
function formatWarning(evidence, mcpInfo, isSensitiveExfil, windowSize = WINDOW_SIZE) {
  const lastTwo = (items) => items.slice(-2).map((e) => ` - ${e}`).join('\n');

  const mcpLine = mcpInfo?.concentrated
    ? `\nRULE OF TWO VIOLATION: MCP-CONCENTRATED — All 3 legs trace to server "${mcpInfo.server}" (elevated severity).\n`
    : '';

  const sensitiveLine = isSensitiveExfil
    ? '\nRULE OF TWO VIOLATION: SENSITIVE DATA + EXFILTRATION — Sensitive paths accessed and exfil sink present.\n'
    : '';

  return (
    'SECURITY ADVISORY (session-guard): Rule of Two violation — potential lethal trifecta detected.\n\n' +
    `Within the last ${windowSize} tool calls, this session holds all 3 capabilities simultaneously:\n` +
    ' [A] Untrusted external input (prompt injection surface):\n' + lastTwo(evidence.input) + '\n' +
    ' [B] Sensitive data access:\n' + lastTwo(evidence.access) + '\n' +
    ' [C] Exfiltration-capable tool (state change):\n' + lastTwo(evidence.exfil) + '\n' +
    mcpLine + sensitiveLine + '\n' +
    'Rule of Two (Meta, Oct 2025): An agent should never hold A+B+C simultaneously.\n' +
    'This combination enables prompt injection -> data theft chains (OWASP ASI01, ASI02, LLM01).\n' +
    'Review recent tool calls for unexpected behavior.'
  );
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Sub-agent delegation tracking (DeepMind Agent Traps kat. 4, v5.0 S4)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Detect a delegation that closely follows untrusted input.
 * Applies only when `currentEntry` is classed `delegation`. It then inspects
 * the DELEGATION_ESCALATION_WINDOW tool calls that precede the last tool
 * entry in `entries` (marker entries are not counted) and reports the
 * earliest of them classed `input_source`.
 * @param {object[]} entries — recent window (20-call)
 * @param {{ classes: string[] }} currentEntry — the entry just appended
 * @returns {{ detected: boolean, inputDetail: string }}
 */
function checkEscalationAfterInput(entries, currentEntry) {
  const isDelegation = currentEntry.classes.includes('delegation');
  if (!isDelegation) return { detected: false, inputDetail: '' };

  // Drop the final tool entry: it is assumed to be the current call itself.
  const priorCalls = entries
    .filter((e) => !e.type)
    .slice(-(DELEGATION_ESCALATION_WINDOW + 1), -1);

  const source = priorCalls.find((e) => (e.classes || []).includes('input_source'));
  if (source === undefined) return { detected: false, inputDetail: '' };

  return { detected: true, inputDetail: source.detail || source.tool || 'unknown' };
}
|
|
|
|
/**
 * Tell whether an `escalation_warning` marker is present.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasEscalationWarning(entries) {
  return entries.find((entry) => entry.type === 'escalation_warning') !== undefined;
}
|
|
|
|
/**
 * Build the advisory text for an escalation-after-input finding.
 * @param {string} delegationDetail — what the delegation was for
 * @param {string} inputDetail — what input source preceded it
 * @returns {string}
 */
function formatEscalationWarning(delegationDetail, inputDetail) {
  const lines = [
    'SECURITY ADVISORY (session-guard): Escalation-after-input detected [MEDIUM] — sub-agent delegation shortly after untrusted input.',
    '',
    `A Task/Agent delegation occurred within ${DELEGATION_ESCALATION_WINDOW} calls of untrusted input:`,
    ` Input source: ${inputDetail}`,
    ` Delegation: ${delegationDetail}`,
    '',
    'Untrusted content (web pages, MCP tool output) may be influencing the model',
    'to spawn sub-agents with capabilities beyond the original task scope.',
    'This is a known attack vector (DeepMind AI Agent Traps, Category 4).',
    'Review whether this delegation is expected and appropriately scoped.',
  ];
  return lines.join('\n');
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Long-horizon monitoring (100-call window) — OpenAI Atlas, Dec 2025
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Keep only real tool-call entries, i.e. those without a truthy `type`.
 * Warning and other marker entries carry a `type` and are dropped.
 * @param {object[]} entries
 * @returns {object[]}
 */
function filterToolEntries(entries) {
  const toolCalls = [];
  for (const entry of entries) {
    if (!entry.type) toolCalls.push(entry);
  }
  return toolCalls;
}
|
|
|
|
/**
 * Check for a slow-burn trifecta in the long-horizon window.
 * Marker entries are removed first. For each leg the first and last tool-call
 * index is tracked. The spread is the distance from the earliest first
 * occurrence to the latest last occurrence across all three legs, and the
 * finding triggers when it exceeds SLOW_BURN_MIN_SPREAD. If any leg is
 * missing, nothing is detected and the spread is reported as 0.
 * @param {object[]} entries - Long-horizon window entries
 * @returns {{ detected: boolean, spread: number }}
 */
function checkSlowBurnTrifecta(entries) {
  const calls = filterToolEntries(entries);
  const rangeByClass = new Map([
    ['input_source', null],
    ['data_access', null],
    ['exfil_sink', null],
  ]);

  calls.forEach((call, index) => {
    for (const cls of call.classes || []) {
      if (!rangeByClass.has(cls)) continue;
      const range = rangeByClass.get(cls);
      if (range === null) {
        rangeByClass.set(cls, { first: index, last: index });
      } else {
        range.last = index;
      }
    }
  });

  const ranges = [...rangeByClass.values()];
  if (ranges.includes(null)) {
    return { detected: false, spread: 0 };
  }

  const earliestFirst = Math.min(...ranges.map((r) => r.first));
  const latestLast = Math.max(...ranges.map((r) => r.last));
  const spread = latestLast - earliestFirst;

  return { detected: spread > SLOW_BURN_MIN_SPREAD, spread };
}
|
|
|
|
/**
 * Tell whether a `slow_burn_warning` marker is present.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasSlowBurnWarning(entries) {
  return entries.filter((entry) => entry.type === 'slow_burn_warning').length > 0;
}
|
|
|
|
/**
 * Detect behavioral drift by comparing the tool mix of the first
 * DRIFT_SAMPLE_SIZE tool calls with that of the last DRIFT_SAMPLE_SIZE.
 * Needs at least twice the sample size in tool calls; with less history it
 * reports no drift. Drift is flagged when the Jensen-Shannon divergence of
 * the two distributions exceeds DRIFT_THRESHOLD.
 * @param {object[]} entries
 * @returns {{ drifted: boolean, jsd: number, firstTools: string[], lastTools: string[] }}
 */
function checkBehavioralDrift(entries) {
  const calls = filterToolEntries(entries);
  const hasEnoughHistory = calls.length >= 2 * DRIFT_SAMPLE_SIZE;
  if (!hasEnoughHistory) {
    return { drifted: false, jsd: 0, firstTools: [], lastTools: [] };
  }

  const firstTools = calls.slice(0, DRIFT_SAMPLE_SIZE).map((call) => call.tool);
  const lastTools = calls.slice(-DRIFT_SAMPLE_SIZE).map((call) => call.tool);
  const jsd = jensenShannonDivergence(
    buildDistribution(firstTools),
    buildDistribution(lastTools)
  );

  return { drifted: jsd > DRIFT_THRESHOLD, jsd, firstTools, lastTools };
}
|
|
|
|
/**
 * Tell whether a `drift_warning` marker is present.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasDriftWarning(entries) {
  for (const { type } of entries) {
    if (type === 'drift_warning') return true;
  }
  return false;
}
|
|
|
|
/**
 * Summarise the N most frequent items as "name(count)", joined by ", ".
 * Items with equal counts keep their order of first appearance.
 * @param {string[]} items
 * @param {number} n
 * @returns {string}
 */
function topN(items, n) {
  const tally = items.reduce(
    (counts, item) => counts.set(item, (counts.get(item) || 0) + 1),
    new Map()
  );

  const ranked = Array.from(tally);
  ranked.sort(([, countA], [, countB]) => countB - countA);

  const labels = [];
  for (const [name, count] of ranked.slice(0, n)) {
    labels.push(`${name}(${count})`);
  }
  return labels.join(', ');
}
|
|
|
|
/**
 * Format the slow-burn trifecta warning message.
 * @param {number} spread - number of calls the three legs are spread over
 * @param {number} [windowSize=LONG_HORIZON_WINDOW] - window length quoted in
 *   the message. Defaults to the configured long-horizon window, so the text
 *   stays accurate when the policy overrides the default of 100 calls.
 * @returns {string}
 */
function formatSlowBurnWarning(spread, windowSize = LONG_HORIZON_WINDOW) {
  return (
    'SECURITY ADVISORY (session-guard): Slow-burn trifecta detected [MEDIUM] — ' +
    `Rule of Two legs spread over ${spread} calls.\n\n` +
    `Within the last ${windowSize} tool calls, all 3 capabilities appeared but spread across a wide range:\n` +
    ' [A] Untrusted external input (prompt injection surface)\n' +
    ' [B] Sensitive data access\n' +
    ' [C] Exfiltration-capable tool (state change)\n\n' +
    'This pattern may indicate a multi-step prompt injection chain (OpenAI Atlas, Dec 2025).\n' +
    'Wide spread across calls makes detection harder with short-window monitoring.'
  );
}
|
|
|
|
/**
 * Build the advisory text for a behavioral-drift finding.
 * Shows the divergence to three decimals and the three most used tools at
 * each end of the history.
 * @param {number} jsd
 * @param {string[]} firstTools
 * @param {string[]} lastTools
 * @returns {string}
 */
function formatDriftWarning(jsd, firstTools, lastTools) {
  const lines = [
    'SECURITY ADVISORY (session-guard): Behavioral drift detected [MEDIUM] — tool usage shift.',
    '',
    `Jensen-Shannon divergence: ${jsd.toFixed(3)} (threshold: ${DRIFT_THRESHOLD})`,
    `First ${DRIFT_SAMPLE_SIZE} calls: ${topN(firstTools, 3)}`,
    `Last ${DRIFT_SAMPLE_SIZE} calls: ${topN(lastTools, 3)}`,
    '',
    'A significant shift in tool usage patterns may indicate session hijacking or prompt injection',
    "changing the agent's behavior over time (OpenAI Atlas, Dec 2025).",
  ];
  return lines.join('\n');
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// CaMeL-inspired data flow tagging (DeepMind CaMeL, v5.0 S6)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Derive a short provenance tag from tool output: the SHA-256 of its first
 * 200 characters, truncated to the first 16 hex digits.
 * @param {string} text - tool output text
 * @returns {string} 16-char hex hash
 */
function computeDataTag(text) {
  const hasher = createHash('sha256');
  hasher.update(text.slice(0, 200));
  const hex = hasher.digest('hex');
  return hex.slice(0, 16);
}
|
|
|
|
/**
 * Flatten a tool input object into one string for data flow matching.
 * String values are used as-is and object-typed values (including arrays and
 * null) are JSON-serialised. Other value types are left out. The parts are
 * joined with single spaces.
 * @param {object} toolInput
 * @returns {string} '' when the input is missing or not an object
 */
function extractInputText(toolInput) {
  const usable = Boolean(toolInput) && typeof toolInput === 'object';
  if (!usable) return '';

  return Object.values(toolInput)
    .flatMap((value) => {
      if (typeof value === 'string') return [value];
      if (typeof value === 'object') return [JSON.stringify(value)];
      return [];
    })
    .join(' ');
}
|
|
|
|
/**
 * Look for earlier tool output that reappears in the current tool input.
 * An earlier entry counts as a source when it is a tool call (no `type`),
 * has a `dataTag`, and has an `outputSnippet` that occurs verbatim inside
 * `currentInputText`. The tag itself is not compared; only its presence is
 * required. Inputs shorter than 20 characters are never linked.
 * @param {object[]} entries - recent state entries
 * @param {string} currentInputText - stringified current tool input
 * @returns {{ linked: boolean, sourceEntries: object[] }}
 */
function checkDataFlowLink(entries, currentInputText) {
  const tooShort = !currentInputText || currentInputText.length < 20;
  if (tooShort) {
    return { linked: false, sourceEntries: [] };
  }

  const sourceEntries = entries.filter(
    (entry) =>
      !entry.type &&
      entry.dataTag &&
      entry.outputSnippet &&
      currentInputText.includes(entry.outputSnippet)
  );

  return { linked: sourceEntries.length > 0, sourceEntries };
}
|
|
|
|
/**
 * Tell whether a `data_flow_warning` marker is present.
 * @param {object[]} entries
 * @returns {boolean}
 */
function hasDataFlowWarning(entries) {
  return entries.map((entry) => entry.type).includes('data_flow_warning');
}
|
|
|
|
/**
 * Build the advisory text for a data-flow-linked trifecta.
 * Lists at most the first three source entries as "tool → detail".
 * @param {{ input: string[], access: string[], exfil: string[] }} evidence - accepted but not used in the message
 * @param {object[]} sourceEntries
 * @returns {string}
 */
function formatDataFlowWarning(evidence, sourceEntries) {
  const sourceLines = [];
  for (const source of sourceEntries.slice(0, 3)) {
    sourceLines.push(` - ${source.tool} → ${source.detail || 'unknown'}`);
  }
  const sources = sourceLines.join('\n');

  return [
    'SECURITY ADVISORY (session-guard): Data flow linked trifecta [HIGH] — CaMeL-style provenance tracking detected data flow chain.',
    '',
    'Tool output from an untrusted source appears to flow into subsequent tool inputs,',
    'creating a traceable data flow chain across the trifecta:',
    ` Data flow sources:\n${sources}`,
    '',
    'This elevates the trifecta severity: data is not just co-located in the session,',
    'but actively flowing between tools in a potential injection chain (DeepMind CaMeL).',
  ].join('\n');
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Main
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Read the hook payload from stdin (fd 0). Unreadable or non-JSON input is
// not an error for this hook: exit 0 and do nothing.
let input;
try {
  input = JSON.parse(readFileSync(0, 'utf-8'));
} catch {
  process.exit(0);
}

const toolName = input?.tool_name ?? '';
const toolInput = input?.tool_input ?? {};
const toolOutput = input?.tool_output ?? '';

// Nothing to do without a tool name, or when detection is switched off.
if (!toolName || TRIFECTA_MODE === 'off') {
  process.exit(0);
}
|
|
|
|
// Compute output size for volume tracking (UTF-8 bytes of the output text;
// non-string output is measured in its JSON form).
const outputText = typeof toolOutput === 'string' ? toolOutput : JSON.stringify(toolOutput);
const outputSize = Buffer.byteLength(outputText, 'utf-8');

// Classify the current tool call
const { classes, detail } = classifyToolCall(toolName, toolInput);

// State file management
const stateFile = getStateFilePath();
const isFirstCall = !existsSync(stateFile);

// Cleanup old state files on first call per session
if (isFirstCall) {
  cleanupOldStateFiles();
}

// Compute data tag for CaMeL-style flow tracking (v5.0 S6)
const dataTag = outputText.length >= 20 ? computeDataTag(outputText) : null;

// Store a short snippet from output for data flow matching: the first 50
// characters after trimming surrounding whitespace. The length check runs on
// the trimmed text. Checking the raw text let mostly-whitespace output store
// a snippet of only a few characters, which then matched almost any later
// tool input as a substring and produced false data flow links.
const outputSnippet = (() => {
  const trimmed = outputText.trim();
  return trimmed.length >= 50 ? trimmed.slice(0, 50) : null;
})();

// Append current entry (with outputSize for volume tracking, dataTag for CaMeL)
const entry = {
  ts: Date.now(),
  tool: toolName,
  classes,
  detail,
  outputSize,
  ...(dataTag ? { dataTag } : {}),
  ...(outputSnippet ? { outputSnippet } : {}),
};
appendEntry(stateFile, entry);
|
|
|
|
// Advisory texts collected during this invocation.
const messages = [];

// --- Trifecta detection ---
// Skipped when the call's only class is 'neutral' or 'delegation'.
if (classes.length !== 1 || (classes[0] !== 'neutral' && classes[0] !== 'delegation')) {
  const recent = readLastEntries(stateFile, WINDOW_SIZE);
  const trifecta = checkTrifecta(recent);

  // One warning per window: a marker already inside the window suppresses
  // a repeat.
  if (trifecta.detected && !hasRecentWarning(recent)) {
    const concentration = checkMcpConcentration(recent);
    const sensitive = checkSensitiveExfil(recent);
    const blocking = TRIFECTA_MODE === 'block';

    messages.push(formatWarning(trifecta.evidence, concentration, sensitive));
    appendEntry(stateFile, { type: 'warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'trifecta_warning',
      // critical for concentrated / sensitive-exfil cases, high otherwise
      severity: concentration.concentrated || sensitive ? 'critical' : 'high',
      source: 'post-session-guard',
      details: {
        evidence: trifecta.evidence,
        mcp_concentrated: concentration.concentrated,
        sensitive_exfil: sensitive,
      },
      owasp: ['ASI01', 'ASI02', 'LLM01'],
      action_taken: blocking ? 'blocked' : 'warned',
    });

    // --- Rule of Two: Block mode ---
    // v7.1.0 B2 fix: block mode blocks on any detected trifecta, not only
    // MCP-concentrated or sensitive-path cases. The stderr context line names
    // which of the three situations applies.
    // NOTE(review): the warning marker written above also suppresses
    // re-detection while it remains inside the window, so block mode fires
    // at most once per window — confirm this is intended.
    if (blocking) {
      const context = concentration.concentrated
        ? ` MCP-concentrated: all 3 legs via server "${concentration.server}"\n`
        : sensitive
          ? ' Sensitive data access combined with exfiltration sink\n'
          : ' Distributed trifecta: three legs from different sources\n';

      process.stderr.write(
        'BLOCKED: Rule of Two violation — lethal trifecta detected.\n' +
        context +
        ' Set LLM_SECURITY_TRIFECTA_MODE=warn to downgrade to advisory.\n'
      );
      process.stdout.write(JSON.stringify({ decision: 'block' }));
      process.exit(2);
    }
  }
}
|
|
|
|
// --- Escalation-after-input detection ---
// Runs only for calls classified as delegation. When
// checkEscalationAfterInput() reports a hit and the window holds no earlier
// escalation marker, an advisory is queued, a marker entry is persisted and a
// medium-severity audit event is written.
if (classes.includes('delegation')) {
  const recentEntries = readLastEntries(stateFile, WINDOW_SIZE);
  const { detected: escalated, inputDetail } = checkEscalationAfterInput(recentEntries, entry);

  if (escalated && !hasEscalationWarning(recentEntries)) {
    messages.push(formatEscalationWarning(detail, inputDetail));
    appendEntry(stateFile, { type: 'escalation_warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'escalation_after_input',
      severity: 'medium',
      source: 'post-session-guard',
      details: { tool: detail, input_source: inputDetail },
      owasp: ['ASI01'],
      action_taken: 'warned',
    });
  }
}
|
|
|
|
// --- CaMeL data flow check (v5.0 S6) ---
// Looks for data from an earlier tool output reappearing in the current tool
// input. A high-severity advisory is raised only when such a link exists, has
// not been warned about in the window, AND a trifecta is present.
// Neutral-only calls and inputs shorter than 20 characters are skipped.
if (classes.length !== 1 || classes[0] !== 'neutral') {
  const inputText = extractInputText(toolInput);

  if (inputText.length >= 20) {
    const recentEntries = readLastEntries(stateFile, WINDOW_SIZE);
    const { linked, sourceEntries } = checkDataFlowLink(recentEntries, inputText);

    // The trifecta check is evaluated only for a fresh, un-warned link.
    const trifecta = linked && !hasDataFlowWarning(recentEntries)
      ? checkTrifecta(recentEntries)
      : null;

    if (trifecta !== null && trifecta.detected) {
      messages.push(formatDataFlowWarning(trifecta.evidence, sourceEntries));
      appendEntry(stateFile, { type: 'data_flow_warning', ts: Date.now() });
      writeAuditEvent({
        event_type: 'data_flow_trifecta',
        severity: 'high',
        source: 'post-session-guard',
        details: { evidence: trifecta.evidence, flow_sources: sourceEntries.length },
        owasp: ['ASI01', 'ASI02'],
        action_taken: 'warned',
      });
    }
  }
}
|
|
|
|
// --- Cumulative volume tracking ---
// Only calls that produced output can move the running total.
if (outputSize > 0) {
  // 10_000 serves as a "read everything" limit.
  // NOTE(review): a session with more entries than that would presumably be
  // undercounted — confirm against readLastEntries().
  const allEntries = readLastEntries(stateFile, 10_000);
  const totalVolume = computeCumulativeVolume(allEntries);

  // Thresholds are scanned in VOLUME_THRESHOLDS order (intended highest first
  // — confirm against its definition). Only the first threshold that is
  // reached and not yet warned about is reported per call.
  const crossed = VOLUME_THRESHOLDS.find(
    ({ bytes }) => totalVolume >= bytes && !hasVolumeWarning(allEntries, bytes)
  );

  if (crossed !== undefined) {
    const { bytes, label, severity } = crossed;
    messages.push(formatVolumeWarning(totalVolume, label, severity));
    appendEntry(stateFile, { type: 'volume_warning', ts: Date.now(), threshold: bytes });
    writeAuditEvent({
      event_type: 'volume_threshold',
      severity: severity.toLowerCase(),
      source: 'post-session-guard',
      details: { total_bytes: totalVolume, threshold: label },
      owasp: ['ASI02'],
      action_taken: 'warned',
    });
  }
}
|
|
|
|
// --- Long-horizon monitoring ---
// Both checks below work on the same snapshot of the last
// LONG_HORIZON_WINDOW state entries. Each queues an advisory, persists a
// marker entry and writes a medium-severity audit event, unless a marker of
// its kind is already in the snapshot.
{
  const history = readLastEntries(stateFile, LONG_HORIZON_WINDOW);

  // Slow-burn trifecta: the legs are spread out across the long window.
  const { detected: slowBurnFound, spread } = checkSlowBurnTrifecta(history);
  if (slowBurnFound && !hasSlowBurnWarning(history)) {
    messages.push(formatSlowBurnWarning(spread));
    appendEntry(stateFile, { type: 'slow_burn_warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'slow_burn_trifecta',
      severity: 'medium',
      source: 'post-session-guard',
      details: { spread },
      owasp: ['ASI06', 'ASI08'],
      action_taken: 'warned',
    });
  }

  // Behavioral drift: checkBehavioralDrift() reports a `jsd` score together
  // with the first/last tool samples it compared.
  const { drifted, jsd, firstTools, lastTools } = checkBehavioralDrift(history);
  if (drifted && !hasDriftWarning(history)) {
    messages.push(formatDriftWarning(jsd, firstTools, lastTools));
    appendEntry(stateFile, { type: 'drift_warning', ts: Date.now() });
    writeAuditEvent({
      event_type: 'behavioral_drift',
      severity: 'medium',
      source: 'post-session-guard',
      details: { jsd, first_tools: firstTools, last_tools: lastTools },
      owasp: ['ASI06', 'ASI08'],
      action_taken: 'warned',
    });
  }
}
|
|
|
|
// Emit combined advisory: every queued message goes out as one systemMessage
// on stdout, separated by a "---" rule. Nothing is written when no detector
// queued a message.
if (messages.length !== 0) {
  const payload = { systemMessage: messages.join('\n\n---\n\n') };
  process.stdout.write(JSON.stringify(payload));
}

// Default: advisory only (warn mode) — finish with a success status.
process.exit(0);
|