From 6073952b972678c955140f82cd74813504b75301 Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Wed, 29 Apr 2026 14:44:41 +0200 Subject: [PATCH] fix(injection): E16 ASCII fast-path + UNI-003 expectation update (v7.2.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up fixes after E16 + E17 landed: 1. foldHomoglyphs ASCII fast-path - scanForInjection calls foldHomoglyphs on every scan (raw + normalized). - Pre-fix: NFKC normalization runs unconditionally, even on pure ASCII inputs where it's a no-op. - Result: benchmark.test.mjs timed out at 120s on the full suite. - Fix: charCodeAt sweep for >=128, short-circuit return s when all ASCII. NFKC and HOMOGLYPH_MAP iteration only run when non-ASCII chars are present (the actual attack case). - Verified: benchmark.test.mjs passes within timeout. 2. Attack-scenario UNI-003 expectation - Pre-E16: "Homoglyph Cyrillic-Latin mixing" payload triggered only a MEDIUM "obfuscation present" advisory (exit 0, stdout match "MEDIUM"). - Post-E16: the same payload is folded to Latin BEFORE pattern matching, so it now matches CRITICAL "ignore previous instructions" and blocks (exit 2). - This is the intended v7.2.0 behavior — not a regression. Updated expectation: exit_code 2, stdout_match "block". Renamed scenario to "now blocked via E16 fold, v7.2.0". Suite: pre-compact-scan flake remains (perf-budget under load, passes isolated). All other tests green. --- plugins/llm-security/knowledge/attack-scenarios.json | 6 +++--- plugins/llm-security/scanners/lib/string-utils.mjs | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/plugins/llm-security/knowledge/attack-scenarios.json b/plugins/llm-security/knowledge/attack-scenarios.json index a9419cc..baa897c 100644 --- a/plugins/llm-security/knowledge/attack-scenarios.json +++ b/plugins/llm-security/knowledge/attack-scenarios.json @@ -881,7 +881,7 @@ }, { "id": "UNI-003", - "name": "Homoglyph Cyrillic-Latin mixing (MEDIUM)", + "name": "Homoglyph Cyrillic-Latin mixing (now blocked via E16 fold, v7.2.0)", "input": { "session_id": "red-team", "message": { @@ -890,8 +890,8 @@ } }, "expect": { - "exit_code": 0, - "stdout_match": "MEDIUM" + "exit_code": 2, + "stdout_match": "block" } }, { diff --git a/plugins/llm-security/scanners/lib/string-utils.mjs b/plugins/llm-security/scanners/lib/string-utils.mjs index 8add0de..5653e17 100644 --- a/plugins/llm-security/scanners/lib/string-utils.mjs +++ b/plugins/llm-security/scanners/lib/string-utils.mjs @@ -451,11 +451,23 @@ const HOMOGLYPH_MAP = Object.freeze({ * Norwegian/Polish/German/etc. text is NOT affected — characters like * æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP. * + * Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is + * a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys. + * scanForInjection calls this on every scan; the fast-path keeps the + * common-case overhead near zero. + * * @param {string} s * @returns {string} */ export function foldHomoglyphs(s) { if (!s) return s; + // Fast path: pure ASCII has nothing to fold and NFKC is identity. + // charCodeAt is cheaper than iterating codepoints. + let asciiOnly = true; + for (let i = 0; i < s.length; i++) { + if (s.charCodeAt(i) > 127) { asciiOnly = false; break; } + } + if (asciiOnly) return s; const normalized = s.normalize('NFKC'); let out = ''; for (const ch of normalized) {