ktg-plugin-marketplace/plugins/llm-security/scanners/lib/string-utils.mjs
Kjell Tore Guttormsen 6073952b97 fix(injection): E16 ASCII fast-path + UNI-003 expectation update (v7.2.0)
Two follow-up fixes after E16 + E17 landed:

1. foldHomoglyphs ASCII fast-path
   - scanForInjection calls foldHomoglyphs on every scan (raw + normalized).
   - Pre-fix: NFKC normalization runs unconditionally, even on pure
     ASCII inputs where it's a no-op.
   - Result: benchmark.test.mjs timed out at 120s on the full suite.
   - Fix: charCodeAt sweep for >=128, short-circuit return s when
     all ASCII. NFKC and HOMOGLYPH_MAP iteration only run when
     non-ASCII chars are present (the actual attack case).
   - Verified: benchmark.test.mjs passes within timeout.

2. Attack-scenario UNI-003 expectation
   - Pre-E16: "Homoglyph Cyrillic-Latin mixing" payload triggered only
     a MEDIUM "obfuscation present" advisory (exit 0, stdout match
     "MEDIUM").
   - Post-E16: the same payload is folded to Latin BEFORE pattern
     matching, so it now matches CRITICAL "ignore previous instructions"
     and blocks (exit 2).
   - This is the intended v7.2.0 behavior — not a regression. Updated
     expectation: exit_code 2, stdout_match "block". Renamed scenario
     to "now blocked via E16 fold, v7.2.0".

Suite: the pre-compact-scan flake remains (perf-budget failure under load;
passes when run in isolation). All other tests green.
2026-04-29 14:44:41 +02:00

512 lines
16 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.
/**
* Shannon entropy of a string (bits per character).
* @param {string} s
* @returns {number}
*/
export function shannonEntropy(s) {
if (s.length === 0) return 0;
const freq = new Map();
for (const ch of s) {
freq.set(ch, (freq.get(ch) || 0) + 1);
}
let H = 0;
const len = s.length;
for (const count of freq.values()) {
const p = count / len;
H -= p * Math.log2(p);
}
return H;
}
/**
* Levenshtein edit distance between two strings.
* @param {string} a
* @param {string} b
* @returns {number}
*/
export function levenshtein(a, b) {
if (a === b) return 0;
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
const m = a.length;
const n = b.length;
// Single-row optimization
let prev = new Array(n + 1);
let curr = new Array(n + 1);
for (let j = 0; j <= n; j++) prev[j] = j;
for (let i = 1; i <= m; i++) {
curr[0] = i;
for (let j = 1; j <= n; j++) {
const cost = a[i - 1] === b[j - 1] ? 0 : 1;
curr[j] = Math.min(
prev[j] + 1, // deletion
curr[j - 1] + 1, // insertion
prev[j - 1] + cost // substitution
);
}
[prev, curr] = [curr, prev];
}
return prev[n];
}
/**
* Split a package name into lowercase tokens on `-` and `_` boundaries.
* Used by the B7 typosquat token-overlap heuristic. Empty tokens are
* dropped. Single-character tokens are kept (some package names like
* `a-b` are real).
*
* @param {string} name
* @returns {string[]}
*/
export function tokenize(name) {
if (!name) return [];
return name
.toLowerCase()
.split(/[-_]+/)
.filter(t => t.length > 0);
}
/**
* Token-overlap ratio between two package names. Returns the size of the
* intersection divided by the size of the smaller token set. Returns 0 if
* either input is empty.
*
* Example: `tokenOverlap('lodash-utils', 'lodash')` → 1.0
* `tokenOverlap('react-router-dom', 'react')` → 1.0
* `tokenOverlap('react-helper', 'react-router')` → 0.5
* `tokenOverlap('foo', 'bar')` → 0.0
*
* Used by B7 (v7.2.0) as a complementary signal alongside Levenshtein —
* Levenshtein <=2 catches small typos; token-overlap catches
* popular-name-with-suffix typosquats.
*
* @param {string} a
* @param {string} b
* @returns {number} 0..1
*/
export function tokenOverlap(a, b) {
const ta = new Set(tokenize(a));
const tb = new Set(tokenize(b));
if (ta.size === 0 || tb.size === 0) return 0;
let intersection = 0;
for (const t of ta) if (tb.has(t)) intersection++;
return intersection / Math.min(ta.size, tb.size);
}
/**
* Suspicious suffix tokens commonly used by typosquats to dress up a
* popular package name. Module-level for B7 reuse.
*
* Excluded by design (would conflict with the v7.0.0 typosquat allowlist
* or trigger false positives on legitimate packages):
* - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes used by many
* legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
* v7.0.0 allowlist contains `tsx` directly; including the same token
* in the suspicious set would create an internal contradiction.
* - `pro` — too common as a legitimate edition marker (`vue-pro`,
* `tailwindcss-pro`).
*
* Kept tokens are the unambiguous typosquat suffixes: utility/helper
* dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
*/
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
'utils', 'util', 'helper', 'helpers', 'core', 'plus', 'extra', 'extras',
'bin', 'cli', 'tool', 'tools',
'wrapper', 'wrappers', 'lib', 'libs', 'kit', 'sdk', 'shim',
]);
/**
* Check if a string looks like base64-encoded data.
* @param {string} s
* @returns {boolean}
*/
export function isBase64Like(s) {
if (s.length < 20) return false;
// Must be mostly base64 chars and optionally end with =
return /^[A-Za-z0-9+/]{20,}={0,3}$/.test(s);
}
/**
* Check if a string looks like a hex-encoded blob.
* @param {string} s
* @returns {boolean}
*/
export function isHexBlob(s) {
if (s.length < 32) return false;
return /^(0x)?[0-9a-fA-F]{32,}$/.test(s);
}
/**
* Redact a string for safe display — show first 8 and last 4 chars.
* @param {string} s
* @param {number} [showStart=8]
* @param {number} [showEnd=4]
* @returns {string}
*/
export function redact(s, showStart = 8, showEnd = 4) {
if (s.length <= showStart + showEnd + 3) return s;
return `${s.slice(0, showStart)}...${s.slice(-showEnd)}`;
}
/**
* Extract string literals from a line of code.
* Handles single-quoted, double-quoted, and backtick strings.
* @param {string} line
* @returns {string[]}
*/
export function extractStringLiterals(line) {
const results = [];
const regex = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
let match;
while ((match = regex.exec(line)) !== null) {
results.push(match[1] ?? match[2] ?? match[3]);
}
return results;
}
// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------
/**
* Decode JavaScript/Unicode escape sequences: \uXXXX and \u{XXXXX}.
* @param {string} s
* @returns {string}
*/
export function decodeUnicodeEscapes(s) {
return s
.replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, (_, hex) => {
const cp = parseInt(hex, 16);
return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
})
.replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) =>
String.fromCodePoint(parseInt(hex, 16))
);
}
/**
* Decode hex escape sequences: \xXX.
* @param {string} s
* @returns {string}
*/
export function decodeHexEscapes(s) {
return s.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) =>
String.fromCharCode(parseInt(hex, 16))
);
}
/**
* Decode URL percent-encoding: %XX.
* Uses decodeURIComponent with fallback for malformed sequences.
* @param {string} s
* @returns {string}
*/
export function decodeUrlEncoding(s) {
// Fast path: no percent signs means nothing to decode
if (!s.includes('%')) return s;
try {
return decodeURIComponent(s);
} catch {
// Malformed sequences — decode individual %XX pairs
return s.replace(/%([0-9a-fA-F]{2})/g, (_, hex) =>
String.fromCharCode(parseInt(hex, 16))
);
}
}
/**
* Attempt to decode a base64 string to UTF-8 text.
* Returns null if the input is not base64-like or decoded result is not readable text.
* @param {string} s
* @returns {string|null}
*/
export function tryDecodeBase64(s) {
if (!isBase64Like(s)) return null;
try {
const decoded = Buffer.from(s, 'base64').toString('utf-8');
// Check if result is mostly printable text (>= 80% printable ASCII)
const printable = decoded.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
if (decoded.length === 0 || printable / decoded.length < 0.8) return null;
return decoded;
} catch {
return null;
}
}
/**
* Decode HTML entities: named (&lt; &gt; &amp; &quot; &apos;),
* decimal (&#105;), and hex (&#x69;).
* @param {string} s
* @returns {string}
*/
export function decodeHtmlEntities(s) {
if (!s.includes('&')) return s;
const NAMED = {
'&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
'&nbsp;': ' ', '&tab;': '\t', '&newline;': '\n',
'&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
'&lcub;': '{', '&rcub;': '}', '&sol;': '/', '&bsol;': '\\',
'&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
'&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
'&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
'&vert;': '|', '&tilde;': '~', '&grave;': '`', '&Hat;': '^',
'&lowbar;': '_', '&at;': '@', '&dollar;': '$',
};
return s
.replace(/&#x([0-9a-fA-F]{1,6});/g, (_, hex) => {
const cp = parseInt(hex, 16);
return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
})
.replace(/&#(\d{1,7});/g, (_, dec) => {
const cp = parseInt(dec, 10);
return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
})
.replace(/&[a-zA-Z]{2,8};/g, (entity) => NAMED[entity] ?? entity);
}
/**
* Collapse letter-spaced text: "i g n o r e" → "ignore".
* Only collapses runs of single letters separated by spaces/tabs.
* Minimum 4 letters to avoid false positives on normal text.
* @param {string} s
* @returns {string}
*/
export function collapseLetterSpacing(s) {
// Match 4+ single-letter tokens separated by 1+ spaces/tabs
return s.replace(/\b([a-zA-Z]) (?:[a-zA-Z] ){2,}[a-zA-Z]\b/g, (match) =>
match.replace(/ /g, '')
);
}
// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps category 1
// ---------------------------------------------------------------------------
/**
* Decode Unicode Tags steganography: U+E0001-E007F → ASCII.
* Unicode Tags (U+E0000 block) can encode invisible ASCII text inside
* what appears to be empty or normal-looking strings.
* E.g., U+E0069 U+E0067 U+E006E → "ign"
*
* **Note (E1, v7.2.0):** Tag-block characters decode to ASCII via the
* `cp - 0xE0000` mapping. Private Use Areas (PUA-A: U+F0000-FFFFD;
* PUA-B: U+100000-10FFFD) are also detected as hidden Unicode by
* `containsUnicodeTags`, but they have NO standard ASCII mapping —
* they pass through this function unchanged. Detection of PUA presence
* is sufficient (HIGH advisory in scanForInjection), no decode needed.
*
* @param {string} s
* @returns {string}
*/
export function decodeUnicodeTags(s) {
let result = '';
let decoded = '';
let inTagSequence = false;
for (const ch of s) {
const cp = ch.codePointAt(0);
if (cp >= 0xE0001 && cp <= 0xE007F) {
// Tag character — map to ASCII (subtract 0xE0000)
decoded += String.fromCharCode(cp - 0xE0000);
inTagSequence = true;
} else {
if (inTagSequence && decoded.length > 0) {
result += decoded;
decoded = '';
inTagSequence = false;
}
result += ch;
}
}
// Flush remaining tag sequence
if (decoded.length > 0) {
result += decoded;
}
return result;
}
/**
* Check if a string contains hidden-Unicode characters that are commonly
* used for steganography in prompts and tool output.
*
* Covered ranges:
* - U+E0001-E007F Unicode Tag block (DeepMind traps kat. 1)
* - U+F0000-FFFFD Supplementary Private Use Area-A (E1, v7.2.0)
* - U+100000-10FFFD Supplementary Private Use Area-B (E1, v7.2.0)
*
* Presence of any of these characters is suspicious regardless of
* decoded content — they are invisible in most terminals and survive
* normalization. The function name `containsUnicodeTags` is preserved
* for back-compat (existing call sites in injection-patterns.mjs and
* elsewhere); semantically it is now "containsHiddenUnicode".
*
* Tag-block characters decode to ASCII via `decodeUnicodeTags`. PUA
* characters do NOT — they have no standard mapping and remain
* detection-only.
*
* @param {string} s
* @returns {boolean}
*/
export function containsUnicodeTags(s) {
for (const ch of s) {
const cp = ch.codePointAt(0);
if (cp >= 0xE0001 && cp <= 0xE007F) return true; // Tag block
if (cp >= 0xF0000 && cp <= 0xFFFFD) return true; // PUA-A (E1)
if (cp >= 0x100000 && cp <= 0x10FFFD) return true; // PUA-B (E1)
}
return false;
}
// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------
/**
* Strip BIDI override characters that can reorder text visually.
* U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
* U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
* These can hide injection by making text render differently than it parses.
* @param {string} s
* @returns {string}
*/
export function stripBidiOverrides(s) {
return s.replace(/[\u202A-\u202E\u2066-\u2069]/g, '');
}
// ---------------------------------------------------------------------------
// Homoglyph folding (E16, v7.2.0)
// ---------------------------------------------------------------------------
/**
* Confusable mapping — characters that LOOK like Latin letters but are
* different codepoints (most commonly Cyrillic and Greek). Surgical map
* focused on letters that appear in injection vocabulary
* (`ignore`, `system`, `you are`, `assistant`, `tool`, `response`).
*
* Excluded by design:
* - Latin Extended characters (æ, ø, å, é, è, ñ, ü, ö, ä, ç, ß, þ, ð, etc.)
* — these are legitimate letters in Norwegian, German, Danish, Spanish,
* French, Icelandic, etc., and would generate false positives in
* non-English source code or documentation.
* - Greek letters that don't visually overlap with Latin (`β`, `γ`, `δ`, ...)
* - Cyrillic letters that don't visually overlap (`б`, `г`, `д`, `ж`, ...)
* - Mathematical alphanumeric symbols (the U+1D400 block) — covered by
* NFKC normalization in `foldHomoglyphs` itself.
*
* The map is deliberately small (~25 entries). Adding more risks
* false-positive escalation on benign multilingual content.
*/
const HOMOGLYPH_MAP = Object.freeze({
// Cyrillic → Latin (lowercase)
'а': 'a', // U+0430
'е': 'e', // U+0435
'о': 'o', // U+043E
'с': 'c', // U+0441
'р': 'p', // U+0440
'х': 'x', // U+0445
'у': 'y', // U+0443
'і': 'i', // U+0456 (Ukrainian)
'ј': 'j', // U+0458
'ѕ': 's', // U+0455
'ӏ': 'l', // U+04CF (Cyrillic Palochka)
// Cyrillic → Latin (uppercase)
'А': 'A', // U+0410
'Е': 'E', // U+0415
'О': 'O', // U+041E
'С': 'C', // U+0421
'Р': 'P', // U+0420
'Х': 'X', // U+0425
'У': 'Y', // U+0423
// Greek → Latin (only the unambiguous Latin-look-alikes)
'α': 'a', // U+03B1
'ο': 'o', // U+03BF
'ρ': 'p', // U+03C1
'ι': 'i', // U+03B9
'ν': 'v', // U+03BD
'τ': 't', // U+03C4
// Greek uppercase
'Α': 'A', // U+0391
'Ο': 'O', // U+039F
'Ρ': 'P', // U+03A1
'Τ': 'T', // U+03A4
});
/**
* Fold visually-confusable characters to their Latin look-alikes. Used by
* E16 (v7.2.0) to neutralize homoglyph-substitution injection attacks
* before pattern matching.
*
* Pipeline:
* 1. NFKC normalize — collapses Mathematical Alphanumeric (U+1D400),
* width variants, ligatures, and other compatibility decompositions.
* 2. Apply HOMOGLYPH_MAP — Cyrillic/Greek look-alikes → Latin.
*
* Idempotent: `foldHomoglyphs(foldHomoglyphs(s)) === foldHomoglyphs(s)`.
*
* Norwegian/Polish/German/etc. text is NOT affected — characters like
* æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP.
*
* Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is
* a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys.
* scanForInjection calls this on every scan; the fast-path keeps the
* common-case overhead near zero.
*
* @param {string} s
* @returns {string}
*/
export function foldHomoglyphs(s) {
if (!s) return s;
// Fast path: pure ASCII has nothing to fold and NFKC is identity.
// charCodeAt is cheaper than iterating codepoints.
let asciiOnly = true;
for (let i = 0; i < s.length; i++) {
if (s.charCodeAt(i) > 127) { asciiOnly = false; break; }
}
if (asciiOnly) return s;
const normalized = s.normalize('NFKC');
let out = '';
for (const ch of normalized) {
out += HOMOGLYPH_MAP[ch] || ch;
}
return out;
}
/**
* Normalize a string by decoding all known obfuscation layers.
* Runs up to 3 iterations to catch multi-layered encoding (e.g., base64 of URL-encoded).
* Order per iteration: Unicode Tags -> BIDI strip -> HTML entities -> unicode escapes ->
* hex escapes -> URL encoding -> base64.
* After decoding: collapse letter-spaced text.
* @param {string} s
* @returns {string}
*/
export function normalizeForScan(s) {
let result = s;
const MAX_ITERATIONS = 3;
// Pre-decode: Unicode Tags and BIDI overrides (before the main loop)
result = decodeUnicodeTags(result);
result = stripBidiOverrides(result);
for (let i = 0; i < MAX_ITERATIONS; i++) {
const prev = result;
result = decodeHtmlEntities(result);
result = decodeUnicodeEscapes(result);
result = decodeHexEscapes(result);
result = decodeUrlEncoding(result);
const b64decoded = tryDecodeBase64(result);
if (b64decoded) result = b64decoded;
// Stable — no further decoding possible
if (result === prev) break;
}
// Post-decode: collapse letter-spaced evasion
result = collapseLetterSpacing(result);
return result;
}