ktg-plugin-marketplace/plugins/llm-security/scanners/lib/string-utils.mjs
Kjell Tore Guttormsen 950e4e4bce feat(injection): E3 — rot13 layer for comment-block injection
Adds rot13 to the variantSet built in scanForInjection(), so
imperative phrases hidden as rot13 inside code comments still hit
the existing CRITICAL/HIGH/MEDIUM pattern arrays.

normalizeForScan() already covers base64, hex, URL, and HTML decoding
in a 3-iteration loop — those are NOT duplicated here. rot13 is the
only genuinely new variant: it is its own inverse and not part of any
NIST/Unicode normalization spec, so it has to be applied explicitly.

Threshold: only inputs >40 chars enter the rot13 pass, to suppress
false positives on accidental letter-shifts in tokens, ids, and short
identifiers. Variants are deduplicated against the existing set so
matchers do not run twice.

3 new tests in injection-patterns.test.mjs (rot13 detection, sub-40
char suppression, plaintext path still green). Total 168 tests pass.

Closes E3 in critical-review-2026-04-20.md.
2026-04-30 15:21:03 +02:00

536 lines
17 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.
/**
 * Shannon entropy of a string (bits per character).
 * @param {string} s
 * @returns {number} 0 for the empty string; otherwise H = -Σ p·log2(p).
 */
export function shannonEntropy(s) {
  const total = s.length;
  if (total === 0) return 0;
  // One pass to tally how often each character occurs.
  const counts = new Map();
  for (const ch of s) counts.set(ch, (counts.get(ch) ?? 0) + 1);
  // Accumulate -p·log2(p) over the observed distribution.
  let bits = 0;
  for (const n of counts.values()) {
    const p = n / total;
    bits -= p * Math.log2(p);
  }
  return bits;
}
/**
 * Levenshtein edit distance between two strings.
 * Uses a rolling two-row dynamic program: O(min-memory), O(m*n) time.
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
export function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;
  const width = b.length + 1;
  // `row` holds distances for the prefix of `a` processed so far.
  let row = Array.from({ length: width }, (_, j) => j);
  let next = new Array(width);
  for (let i = 1; i <= a.length; i++) {
    next[0] = i;
    for (let j = 1; j <= b.length; j++) {
      const substitution = row[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
      const deletion = row[j] + 1;
      const insertion = next[j - 1] + 1;
      next[j] = Math.min(substitution, deletion, insertion);
    }
    [row, next] = [next, row];
  }
  return row[b.length];
}
/**
 * Split a package name into lowercase tokens on `-` and `_` boundaries.
 * Used by the B7 typosquat token-overlap heuristic. Empty tokens are
 * dropped. Single-character tokens are kept (some package names like
 * `a-b` are real).
 *
 * @param {string} name
 * @returns {string[]}
 */
export function tokenize(name) {
  if (!name) return [];
  const tokens = [];
  for (const piece of name.toLowerCase().split(/[-_]+/)) {
    if (piece.length > 0) tokens.push(piece);
  }
  return tokens;
}
/**
 * Token-overlap ratio between two package names. Returns the size of the
 * intersection divided by the size of the smaller token set. Returns 0 if
 * either input is empty.
 *
 * Example: `tokenOverlap('lodash-utils', 'lodash')` → 1.0
 * `tokenOverlap('react-router-dom', 'react')` → 1.0
 * `tokenOverlap('react-helper', 'react-router')` → 0.5
 * `tokenOverlap('foo', 'bar')` → 0.0
 *
 * Used by B7 (v7.2.0) as a complementary signal alongside Levenshtein —
 * Levenshtein <=2 catches small typos; token-overlap catches
 * popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0..1
 */
export function tokenOverlap(a, b) {
  // Same tokenization as tokenize(): lowercase, split on -/_ runs,
  // drop empty pieces, dedupe via Set.
  const toTokenSet = (name) =>
    new Set(name ? name.toLowerCase().split(/[-_]+/).filter(Boolean) : []);
  const setA = toTokenSet(a);
  const setB = toTokenSet(b);
  if (setA.size === 0 || setB.size === 0) return 0;
  let shared = 0;
  for (const token of setB) {
    if (setA.has(token)) shared++;
  }
  return shared / Math.min(setA.size, setB.size);
}
/**
 * Suspicious suffix tokens commonly used by typosquats to dress up a
 * popular package name. Module-level for B7 reuse.
 *
 * Excluded by design (would conflict with the v7.0.0 typosquat allowlist
 * or trigger false positives on legitimate packages):
 * - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes used by many
 * legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
 * v7.0.0 allowlist contains `tsx` directly; including the same token
 * in the suspicious set would create an internal contradiction.
 * - `pro` — too common as a legitimate edition marker (`vue-pro`,
 * `tailwindcss-pro`).
 *
 * Kept tokens are the unambiguous typosquat suffixes: utility/helper
 * dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
 *
 * Frozen so consumers cannot mutate the shared list at runtime.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
'utils', 'util', 'helper', 'helpers', 'core', 'plus', 'extra', 'extras', // utility/helper dressing
'bin', 'cli', 'tool', 'tools', // tool/executable qualifiers
'wrapper', 'wrappers', 'lib', 'libs', 'kit', 'sdk', 'shim', // wrapper/shim/kit qualifiers
]);
/**
 * Check if a string looks like base64-encoded data.
 * Requires at least 20 base64-alphabet characters, optionally followed
 * by `=` padding.
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  // Cheap length gate before running the anchored regex.
  if (s.length < 20) return false;
  const BASE64_SHAPE = /^[A-Za-z0-9+/]{20,}={0,3}$/;
  return BASE64_SHAPE.test(s);
}
/**
 * Check if a string looks like a hex-encoded blob.
 * Accepts an optional `0x` prefix followed by 32+ hex digits.
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  // Too short to be a meaningful hex blob.
  if (s.length < 32) return false;
  const HEX_SHAPE = /^(0x)?[0-9a-fA-F]{32,}$/;
  return HEX_SHAPE.test(s);
}
/**
 * Redact a string for safe display — show first 8 and last 4 chars.
 * Strings too short to meaningfully redact are returned unchanged.
 * @param {string} s
 * @param {number} [showStart=8]
 * @param {number} [showEnd=4]
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  // Keep short strings as-is: redacting would hide nothing useful.
  const threshold = showStart + showEnd + 3;
  if (s.length <= threshold) return s;
  const head = s.slice(0, showStart);
  const tail = s.slice(-showEnd);
  return `${head}...${tail}`;
}
/**
 * Extract string literals from a line of code.
 * Handles single-quoted, double-quoted, and backtick strings, including
 * backslash-escaped characters inside them.
 * @param {string} line
 * @returns {string[]} literal contents in source order (quotes stripped)
 */
export function extractStringLiterals(line) {
  const regex = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
  const literals = [];
  // Exactly one of the three capture groups is set per match.
  for (const m of line.matchAll(regex)) {
    literals.push(m[1] ?? m[2] ?? m[3]);
  }
  return literals;
}
// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------
/**
 * Decode JavaScript/Unicode escape sequences: \uXXXX and \u{XXXXX}.
 * Braced escapes beyond U+10FFFF are left untouched.
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  // Braced form first: \u{1F600} (1-6 hex digits, must be a valid code point).
  const braced = s.replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, (match, hex) => {
    const cp = parseInt(hex, 16);
    return cp > 0x10FFFF ? match : String.fromCodePoint(cp);
  });
  // Fixed form: \uXXXX (exactly 4 hex digits, always a valid code point).
  return braced.replace(/\\u([0-9a-fA-F]{4})/g, (_m, hex) =>
    String.fromCodePoint(parseInt(hex, 16))
  );
}
/**
 * Decode hex escape sequences: \xXX.
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  const decodePair = (_m, hex) => String.fromCharCode(parseInt(hex, 16));
  return s.replace(/\\x([0-9a-fA-F]{2})/g, decodePair);
}
/**
 * Decode URL percent-encoding: %XX.
 * Uses decodeURIComponent with fallback for malformed sequences.
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  // Fast path: nothing to decode without a percent sign.
  if (s.indexOf('%') === -1) return s;
  try {
    return decodeURIComponent(s);
  } catch {
    // decodeURIComponent rejects malformed input wholesale; fall back to
    // decoding each well-formed %XX pair individually.
    const decodePair = (_m, hex) => String.fromCharCode(parseInt(hex, 16));
    return s.replace(/%([0-9a-fA-F]{2})/g, decodePair);
  }
}
/**
 * Attempt to decode a base64 string to UTF-8 text.
 * Returns null if the input is not base64-like or decoded result is not
 * readable text (>= 80% printable ASCII, counting \n \r \t as printable).
 * @param {string} s
 * @returns {string|null}
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;
  try {
    const text = Buffer.from(s, 'base64').toString('utf-8');
    if (text.length === 0) return null;
    // Count printable ASCII plus common whitespace, per UTF-16 code unit
    // (matches the original regex-based count).
    let printable = 0;
    for (const ch of text) {
      if ((ch >= '\x20' && ch <= '\x7E') || ch === '\n' || ch === '\r' || ch === '\t') {
        printable++;
      }
    }
    return printable / text.length >= 0.8 ? text : null;
  } catch {
    return null;
  }
}
/**
 * Decode HTML entities: named (&lt; &gt; &amp; &quot; &apos;),
 * decimal (&#105;), and hex (&#x69;).
 * Unknown named entities and out-of-range numeric references are left
 * unchanged. Named lookup is case-sensitive.
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  // Fast path: no ampersand means no entities.
  if (!s.includes('&')) return s;
  const NAMED = {
    '&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
    '&nbsp;': ' ', '&tab;': '\t', '&newline;': '\n',
    '&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
    '&lcub;': '{', '&rcub;': '}', '&sol;': '/', '&bsol;': '\\',
    '&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
    '&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
    '&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
    '&vert;': '|', '&tilde;': '~', '&grave;': '`', '&Hat;': '^',
    '&lowbar;': '_', '&at;': '@', '&dollar;': '$',
  };
  // Numeric reference → character, keeping the raw text when the code
  // point exceeds U+10FFFF.
  const fromCodePointSafe = (match, digits, radix) => {
    const cp = parseInt(digits, radix);
    return cp <= 0x10FFFF ? String.fromCodePoint(cp) : match;
  };
  let out = s.replace(/&#x([0-9a-fA-F]{1,6});/g, (m, hex) => fromCodePointSafe(m, hex, 16));
  out = out.replace(/&#(\d{1,7});/g, (m, dec) => fromCodePointSafe(m, dec, 10));
  return out.replace(/&[a-zA-Z]{2,8};/g, (entity) => NAMED[entity] ?? entity);
}
/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 * Only collapses runs of 4+ single ASCII letters, each pair separated by
 * exactly one space or tab. (The previous regex handled spaces only,
 * despite the documented "spaces/tabs" contract — tab-separated evasion
 * like "i\tg\tn\to\tr\te" slipped through.)
 * Minimum 4 letters to avoid false positives on normal text.
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // 4+ single-letter tokens: letter, sep, (letter, sep){2,}, letter.
  return s.replace(/\b[a-zA-Z][ \t](?:[a-zA-Z][ \t]){2,}[a-zA-Z]\b/g, (run) =>
    run.replace(/[ \t]/g, '')
  );
}
// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps kat. 1
// ---------------------------------------------------------------------------
/**
 * Decode Unicode Tags steganography: U+E0001-E007F → ASCII.
 * Unicode Tags (U+E0000 block) can encode invisible ASCII text inside
 * what appears to be empty or normal-looking strings.
 * E.g., U+E0069 U+E0067 U+E006E → "ign"
 *
 * **Note (E1, v7.2.0):** Tag-block characters decode to ASCII via the
 * `cp - 0xE0000` mapping. Private Use Areas (PUA-A: U+F0000-FFFFD;
 * PUA-B: U+100000-10FFFD) are also detected as hidden Unicode by
 * `containsUnicodeTags`, but they have NO standard ASCII mapping —
 * they pass through this function unchanged. Detection of PUA presence
 * is sufficient (HIGH advisory in scanForInjection), no decode needed.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  // Each tag codepoint is plain ASCII shifted up by 0xE0000; mapping it
  // back in place reveals the hidden text while leaving every other
  // character (including PUA, which has no ASCII mapping) untouched.
  return s.replace(/[\u{E0001}-\u{E007F}]/gu, (tag) =>
    String.fromCharCode(tag.codePointAt(0) - 0xE0000)
  );
}
/**
 * Check if a string contains hidden-Unicode characters that are commonly
 * used for steganography in prompts and tool output.
 *
 * Covered ranges:
 * - U+E0001-E007F Unicode Tag block (DeepMind traps kat. 1)
 * - U+F0000-FFFFD Supplementary Private Use Area-A (E1, v7.2.0)
 * - U+100000-10FFFD Supplementary Private Use Area-B (E1, v7.2.0)
 *
 * Presence of any of these characters is suspicious regardless of
 * decoded content — they are invisible in most terminals and survive
 * normalization. The function name `containsUnicodeTags` is preserved
 * for back-compat (existing call sites in injection-patterns.mjs and
 * elsewhere); semantically it is now "containsHiddenUnicode".
 *
 * Tag-block characters decode to ASCII via `decodeUnicodeTags`. PUA
 * characters do NOT — they have no standard mapping and remain
 * detection-only.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  // One pass via a unicode-mode character class covering the Tag block
  // and both supplementary Private Use Areas.
  const HIDDEN = /[\u{E0001}-\u{E007F}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/u;
  return HIDDEN.test(s);
}
// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------
/**
 * Strip BIDI control characters that can reorder text visually.
 * U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
 * U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI), plus the
 * implicit directional marks U+200E (LRM), U+200F (RLM) and
 * U+061C (ALM) — the full bidi-control set from the Trojan Source
 * advisory (CVE-2021-42574); the marks were previously missed.
 * These can hide injection by making text render differently than it parses.
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  // Explicit embeddings/overrides/isolates + implicit marks, one class.
  return s.replace(/[\u061C\u200E\u200F\u202A-\u202E\u2066-\u2069]/g, '');
}
// ---------------------------------------------------------------------------
// Homoglyph folding (E16, v7.2.0)
// ---------------------------------------------------------------------------
/**
 * Confusable mapping — characters that LOOK like Latin letters but are
 * different codepoints (most commonly Cyrillic and Greek). Surgical map
 * focused on letters that appear in injection vocabulary
 * (`ignore`, `system`, `you are`, `assistant`, `tool`, `response`).
 *
 * Excluded by design:
 * - Latin Extended characters (æ, ø, å, é, è, ñ, ü, ö, ä, ç, ß, þ, ð, etc.)
 * — these are legitimate letters in Norwegian, German, Danish, Spanish,
 * French, Icelandic, etc., and would generate false positives in
 * non-English source code or documentation.
 * - Greek letters that don't visually overlap with Latin (`β`, `γ`, `δ`, ...)
 * - Cyrillic letters that don't visually overlap (`б`, `г`, `д`, `ж`, ...)
 * - Mathematical alphanumeric symbols (the U+1D400 block) — covered by
 * NFKC normalization in `foldHomoglyphs` itself.
 *
 * The map is deliberately small (~25 entries). Adding more risks
 * false-positive escalation on benign multilingual content.
 *
 * Invariant: every value is a plain ASCII letter and never appears as a
 * key, so applying the map is idempotent (see `foldHomoglyphs`).
 */
const HOMOGLYPH_MAP = Object.freeze({
// Cyrillic → Latin (lowercase)
'а': 'a', // U+0430
'е': 'e', // U+0435
'о': 'o', // U+043E
'с': 'c', // U+0441
'р': 'p', // U+0440
'х': 'x', // U+0445
'у': 'y', // U+0443
'і': 'i', // U+0456 (Ukrainian)
'ј': 'j', // U+0458
'ѕ': 's', // U+0455
'ӏ': 'l', // U+04CF (Cyrillic Palochka)
// Cyrillic → Latin (uppercase)
'А': 'A', // U+0410
'Е': 'E', // U+0415
'О': 'O', // U+041E
'С': 'C', // U+0421
'Р': 'P', // U+0420
'Х': 'X', // U+0425
'У': 'Y', // U+0423
// Greek → Latin (only the unambiguous Latin-look-alikes)
'α': 'a', // U+03B1
'ο': 'o', // U+03BF
'ρ': 'p', // U+03C1
'ι': 'i', // U+03B9
'ν': 'v', // U+03BD
'τ': 't', // U+03C4
// Greek uppercase
'Α': 'A', // U+0391
'Ο': 'O', // U+039F
'Ρ': 'P', // U+03A1
'Τ': 'T', // U+03A4
});
/**
 * Apply rot13 (Caesar shift by 13) to ASCII letters.
 * Non-letters pass through unchanged. The transform is its own inverse.
 *
 * Used by E3 comment-block injection detection: attackers sometimes hide
 * imperative phrases ("ignore previous instructions") in rot13 inside
 * code comments. normalizeForScan() does not apply rot13, so this layer
 * is added explicitly to the variantSet in scanForInjection().
 *
 * NOTE: the JSDoc for foldHomoglyphs below was previously stranded ABOVE
 * this function (the rot13 insertion split it from its declaration, so
 * doc tooling attached it to nothing). Reordered so each doc comment
 * directly precedes its own declaration. No behavior change.
 *
 * @param {string} s
 * @returns {string}
 */
export function rot13(s) {
  if (!s) return s;
  let out = '';
  for (let i = 0; i < s.length; i++) {
    const c = s.charCodeAt(i);
    if (c >= 65 && c <= 90) out += String.fromCharCode(((c - 65 + 13) % 26) + 65); // A-Z
    else if (c >= 97 && c <= 122) out += String.fromCharCode(((c - 97 + 13) % 26) + 97); // a-z
    else out += s[i]; // digits, punctuation, non-ASCII: unchanged
  }
  return out;
}
/**
 * Fold visually-confusable characters to their Latin look-alikes. Used by
 * E16 (v7.2.0) to neutralize homoglyph-substitution injection attacks
 * before pattern matching.
 *
 * Pipeline:
 * 1. NFKC normalize — collapses Mathematical Alphanumeric (U+1D400),
 * width variants, ligatures, and other compatibility decompositions.
 * 2. Apply HOMOGLYPH_MAP — Cyrillic/Greek look-alikes → Latin.
 *
 * Idempotent: `foldHomoglyphs(foldHomoglyphs(s)) === foldHomoglyphs(s)`.
 *
 * Norwegian/Polish/German/etc. text is NOT affected — characters like
 * æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP.
 *
 * Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is
 * a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys.
 * scanForInjection calls this on every scan; the fast-path keeps the
 * common-case overhead near zero.
 *
 * @param {string} s
 * @returns {string}
 */
export function foldHomoglyphs(s) {
  if (!s) return s;
  // Fast path: ASCII-only input needs neither NFKC nor map lookups
  // (HOMOGLYPH_MAP has only non-ASCII keys; NFKC is identity on ASCII).
  let scan = 0;
  while (scan < s.length && s.charCodeAt(scan) <= 127) scan++;
  if (scan === s.length) return s;
  // NFKC first (math alphanumerics, width variants, ligatures), then the
  // surgical confusable map for Cyrillic/Greek look-alikes.
  const folded = [];
  for (const ch of s.normalize('NFKC')) {
    folded.push(HOMOGLYPH_MAP[ch] || ch);
  }
  return folded.join('');
}
/**
 * Normalize a string by decoding all known obfuscation layers.
 * Runs up to 3 iterations to catch multi-layered encoding (e.g., base64 of
 * URL-encoded). Order per iteration: Unicode Tags -> BIDI strip ->
 * HTML entities -> unicode escapes -> hex escapes -> URL encoding -> base64.
 * After decoding: collapse letter-spaced text.
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  const MAX_ITERATIONS = 3;
  // Pre-pass (outside the loop): surface tag-block steganography, then
  // drop BIDI controls.
  let text = stripBidiOverrides(decodeUnicodeTags(s));
  for (let round = 0; round < MAX_ITERATIONS; round++) {
    const before = text;
    // Decoder chain, innermost applied first: HTML entities, unicode
    // escapes, hex escapes, URL encoding.
    text = decodeUrlEncoding(
      decodeHexEscapes(decodeUnicodeEscapes(decodeHtmlEntities(text)))
    );
    // Whole-string base64 layer, when the result decodes to readable text.
    text = tryDecodeBase64(text) ?? text;
    // Fixed point reached — no decoder changed anything this round.
    if (text === before) break;
  }
  // Post-pass: collapse letter-spaced evasion.
  return collapseLetterSpacing(text);
}