From 6073952b972678c955140f82cd74813504b75301 Mon Sep 17 00:00:00 2001
From: Kjell Tore Guttormsen <ktg@humanize.no>
Date: Wed, 29 Apr 2026 14:44:41 +0200
Subject: [PATCH] fix(injection): E16 ASCII fast-path + UNI-003 expectation
 update (v7.2.0)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-up fixes after E16 + E17 landed:

1. foldHomoglyphs ASCII fast-path
   - scanForInjection calls foldHomoglyphs on every scan (raw + normalized).
   - Pre-fix: NFKC normalization runs unconditionally, even on pure
     ASCII inputs where it's a no-op.
   - Result: benchmark.test.mjs timed out at 120s on the full suite.
   - Fix: sweep the string with charCodeAt looking for any code unit
     >= 128, and return s unchanged when the input is all ASCII. NFKC
     and the HOMOGLYPH_MAP iteration now run only when non-ASCII chars
     are present (the actual attack case).
   - Verified: benchmark.test.mjs passes within timeout.

2. Attack-scenario UNI-003 expectation
   - Pre-E16: "Homoglyph Cyrillic-Latin mixing" payload triggered only
     a MEDIUM "obfuscation present" advisory (exit 0, stdout match
     "MEDIUM").
   - Post-E16: the same payload is folded to Latin BEFORE pattern
     matching, so it now matches CRITICAL "ignore previous instructions"
     and blocks (exit 2).
   - This is the intended v7.2.0 behavior — not a regression. Updated
     expectation: exit_code 2, stdout_match "block". Replaced the
     "(MEDIUM)" suffix in the scenario name with "(now blocked via E16
     fold, v7.2.0)".

Suite: the pre-compact-scan test remains flaky (it exceeds its perf
budget under load but passes when run in isolation). All other tests
are green.
---
 plugins/llm-security/knowledge/attack-scenarios.json |  6 +++---
 plugins/llm-security/scanners/lib/string-utils.mjs   | 12 ++++++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/plugins/llm-security/knowledge/attack-scenarios.json b/plugins/llm-security/knowledge/attack-scenarios.json
index a9419cc..baa897c 100644
--- a/plugins/llm-security/knowledge/attack-scenarios.json
+++ b/plugins/llm-security/knowledge/attack-scenarios.json
@@ -881,7 +881,7 @@
         },
         {
           "id": "UNI-003",
-          "name": "Homoglyph Cyrillic-Latin mixing (MEDIUM)",
+          "name": "Homoglyph Cyrillic-Latin mixing (now blocked via E16 fold, v7.2.0)",
           "input": {
             "session_id": "red-team",
             "message": {
@@ -890,8 +890,8 @@
             }
           },
           "expect": {
-            "exit_code": 0,
-            "stdout_match": "MEDIUM"
+            "exit_code": 2,
+            "stdout_match": "block"
           }
         },
         {
diff --git a/plugins/llm-security/scanners/lib/string-utils.mjs b/plugins/llm-security/scanners/lib/string-utils.mjs
index 8add0de..5653e17 100644
--- a/plugins/llm-security/scanners/lib/string-utils.mjs
+++ b/plugins/llm-security/scanners/lib/string-utils.mjs
@@ -451,11 +451,23 @@ const HOMOGLYPH_MAP = Object.freeze({
  * Norwegian/Polish/German/etc. text is NOT affected — characters like
  * æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP.
  *
+ * Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is
+ * a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys.
+ * scanForInjection calls this on every scan; the fast-path keeps the
+ * common-case overhead near zero.
+ *
  * @param {string} s
  * @returns {string}
  */
 export function foldHomoglyphs(s) {
   if (!s) return s;
+  // Fast path: pure ASCII has nothing to fold and NFKC is identity.
+  // charCodeAt is cheaper than iterating codepoints.
+  let asciiOnly = true;
+  for (let i = 0; i < s.length; i++) {
+    if (s.charCodeAt(i) > 127) { asciiOnly = false; break; }
+  }
+  if (asciiOnly) return s;
   const normalized = s.normalize('NFKC');
   let out = '';
   for (const ch of normalized) {