diff --git a/plugins/llm-security/CHANGELOG.md b/plugins/llm-security/CHANGELOG.md index b31bea5..09d5642 100644 --- a/plugins/llm-security/CHANGELOG.md +++ b/plugins/llm-security/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### BREAKING CHANGES - **Risk-score formula rewritten** (`scanners/lib/severity.mjs`). The v1 sum-and-cap formula (`critical*25 + high*10 + medium*4 + low*1`, capped at 100) collapsed every non-trivial scan to 100/Extreme regardless of actual risk distribution. v2 is severity-dominated and log-scaled within tier: - - Critical present → 70–95 (1=80, 2=86, 4=90, 10=95) + - Critical present → 70–95 (1=80, 2=86, 4=93, 10=95) - High only → 40–65 (1=48, 5=60, 17=65) - Medium only → 15–35 (1=20, 5=28, 50=33) - Low only → 1–11 (1=4, 10=11) diff --git a/plugins/llm-security/CLAUDE.md b/plugins/llm-security/CLAUDE.md index 78bd0cc..d80691a 100644 --- a/plugins/llm-security/CLAUDE.md +++ b/plugins/llm-security/CLAUDE.md @@ -181,7 +181,7 @@ Prompt injection is **structurally unsolvable** with current architectures (join - **Broader detection** — MEDIUM advisory for obfuscation signals (leetspeak, homoglyphs, zero-width, multi-language), Unicode Tag steganography, bash expansion evasion - **Increased attack cost** — Rule of Two enforcement (configurable block/warn/off for lethal trifecta), bash normalization before gate matching - **Longer monitoring windows** — 100-call long-horizon alongside 20-call sliding window, slow-burn trifecta detection, behavioral drift via Jensen-Shannon divergence -- **Architectural constraints** — CaMeL-inspired data flow tagging, sub-agent delegation tracking, HITL trap detection +- **Architectural constraints** — opportunistic byte-matching of truncated output fingerprints (first 200 bytes, SHA-256/16-hex tag; not semantic lineage; trivially bypassed by mutation or summarisation of tool output), sub-agent delegation tracking, HITL trap detection. 
Inspired by CaMeL (DeepMind, 2025), but this is a lightweight byte-fingerprint, not semantic capability tracking - **Honest documentation** — Known Limitations section acknowledges what deterministic hooks cannot detect **Bash evasion layers (T1-T6):** `bash-normalize.mjs` collapses six known obfuscation techniques before gate matching as a defense-in-depth layer. T1 empty quotes (`rm''-rf`), T2 `${}` parameter expansion, T3 backslash continuation, T4 tab/whitespace splitting, T5 `${IFS}` word-splitting, T6 ANSI-C hex quoting (`$'\x72\x6d'`). These layers complement — not replace — Claude Code 2.1.98+ harness-level protections. Full reference: `docs/security-hardening-guide.md`. diff --git a/plugins/llm-security/hooks/scripts/post-session-guard.mjs b/plugins/llm-security/hooks/scripts/post-session-guard.mjs index 842eb6b..333adf1 100644 --- a/plugins/llm-security/hooks/scripts/post-session-guard.mjs +++ b/plugins/llm-security/hooks/scripts/post-session-guard.mjs @@ -643,12 +643,22 @@ function formatDriftWarning(jsd, firstTools, lastTools) { } // --------------------------------------------------------------------------- -// CaMeL-inspired data flow tagging (DeepMind CaMeL, v5.0 S6) +// Output fingerprint matching (inspired by CaMeL, DeepMind 2025; v5.0 S6) +// +// NOTE: This is opportunistic byte-matching of truncated output fingerprints, +// not semantic data-flow tracking. We hash the first 200 bytes of tool output +// (SHA-256, truncated to 16 hex chars) and check whether that exact tag +// appears verbatim in the next tool input. Trivially bypassed by: +// - Mutating any of the first 200 bytes +// - Summarising the output before passing it on +// - Re-encoding (base64, JSON-escape, whitespace changes) +// Inspired by CaMeL but NOT a CaMeL capability-tracking implementation. // --------------------------------------------------------------------------- /** - * Compute a short data tag from tool output (first 200 chars, SHA-256 truncated to 16 hex). 
- * Used for lightweight data provenance tracking. + * Compute a short output fingerprint from tool output (first 200 chars, + * SHA-256 truncated to 16 hex). Used for opportunistic byte-matching, not + * semantic provenance. * @param {string} text - tool output text * @returns {string} 16-char hex hash */ diff --git a/plugins/llm-security/scanners/lib/severity.mjs b/plugins/llm-security/scanners/lib/severity.mjs index 1ca85bd..0ed2dab 100644 --- a/plugins/llm-security/scanners/lib/severity.mjs +++ b/plugins/llm-security/scanners/lib/severity.mjs @@ -20,7 +20,7 @@ const SEVERITY_WEIGHTS_V1 = { critical: 25, high: 10, medium: 4, low: 1, info: 0 * of actual risk distribution. * * Tiers: - * Critical present → 70-95 (1=80, 2=86, 4=90, 10=95) + * Critical present → 70-95 (1=80, 2=86, 4=93, 10=95) * High only → 40-65 (1=48, 5=60, 17=65) * Medium only → 15-35 (1=20, 5=28, 50=33) * Low only → 1-11 (1=4, 10=11) diff --git a/plugins/llm-security/tests/lib/severity.test.mjs b/plugins/llm-security/tests/lib/severity.test.mjs index a6b7318..573b235 100644 --- a/plugins/llm-security/tests/lib/severity.test.mjs +++ b/plugins/llm-security/tests/lib/severity.test.mjs @@ -235,6 +235,74 @@ describe('riskBand (v7.0.0 cutoffs: 14/39/64/84)', () => { }); }); +// --------------------------------------------------------------------------- +// Verdict / riskBand co-monotonicity sweep (critical-review §5.4) +// +// For every representative count vector, asserts the conditions that the +// returned verdict implies (the ⇒ direction) under the v7.0.0 contract: +// BLOCK ⇔ band ∈ {Critical, Extreme} OR critical ≥ 1 +// WARNING ⇔ band ∈ {Medium, High} OR (high ≥ 1 AND verdict != BLOCK) +// ALLOW ⇔ band == Low AND no high/critical +// +// Catches regressions where a future change to riskScore tiers, verdict +// cutoffs, or riskBand cutoffs would re-introduce contradictions like +// "ALLOW + High band" or "BLOCK + Medium band". 
+// --------------------------------------------------------------------------- + +describe('verdict/riskBand co-monotonicity (v7.0.0 §5.4)', () => { + const cases = [ + { critical: 0, high: 0, medium: 0, low: 0, info: 0 }, + { low: 1 }, + { low: 10 }, + { medium: 1 }, + { medium: 5 }, + { medium: 50 }, + { high: 1 }, + { high: 5 }, + { high: 7 }, + { high: 8 }, + { high: 17 }, + { critical: 1 }, + { critical: 2 }, + { critical: 4 }, + { critical: 10 }, + ]; + + for (const counts of cases) { + const label = JSON.stringify(counts); + it(`(${label}) — verdict and riskBand agree`, () => { + const score = riskScore(counts); + const v = verdict(counts); + const band = riskBand(score); + const hasCritical = (counts.critical || 0) >= 1; + const hasHigh = (counts.high || 0) >= 1; + + if (v === 'BLOCK') { + assert.ok( + band === 'Critical' || band === 'Extreme' || hasCritical, + `BLOCK requires Critical/Extreme band or critical>=1; got band=${band}, score=${score}, counts=${label}`, + ); + } else if (v === 'WARNING') { + assert.ok( + band === 'Medium' || band === 'High' || hasHigh, + `WARNING requires Medium/High band or high>=1; got band=${band}, score=${score}, counts=${label}`, + ); + assert.ok(!hasCritical, `WARNING must not have critical>=1; counts=${label}`); + } else { + assert.equal(v, 'ALLOW'); + assert.equal(band, 'Low', `ALLOW requires Low band; got band=${band}, score=${score}, counts=${label}`); + assert.ok(!hasHigh && !hasCritical, `ALLOW must not have high/critical>=1; counts=${label}`); + } + }); + } + + it('JSDoc arithmetic anchor — 4 critical = 93 (not 90)', () => { + // Pin against doc/code drift documented in critical-review §5 (B4). + // 70 + min(25, log2(5)*10) = 70 + 23.219... = 93.219 → round → 93. + assert.equal(riskScore({ critical: 4 }), 93); + }); +}); + // --------------------------------------------------------------------------- // gradeFromPassRate // ---------------------------------------------------------------------------