// E3 (critical-review-2026-04-20): rot13 added to the variantSet built in
// scanForInjection(), so imperative phrases hidden as rot13 inside code
// comments still hit the existing CRITICAL/HIGH/MEDIUM pattern arrays.
// normalizeForScan() already covers base64, hex, URL, and HTML decoding in a
// 3-iteration loop — those are NOT duplicated here. rot13 is its own inverse
// and not part of any NIST/Unicode normalization spec, so it must be applied
// explicitly. Threshold: only inputs >40 chars enter the rot13 pass, to
// suppress false positives on accidental letter-shifts in tokens, ids, and
// short identifiers. Variants are deduplicated against the existing set so
// matchers do not run twice. Covered by 3 tests in injection-patterns.test.mjs.
// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.
/**
 * Shannon entropy of a string (bits per character).
 * Empty input yields 0; a string of one repeated character also yields 0.
 * @param {string} s
 * @returns {number}
 */
export function shannonEntropy(s) {
  const total = s.length;
  if (total === 0) return 0;
  // Tally occurrences per character (code point iteration).
  const counts = new Map();
  for (const ch of s) counts.set(ch, (counts.get(ch) ?? 0) + 1);
  // H = -sum(p * log2(p)) over the character distribution.
  return [...counts.values()].reduce(
    (entropy, n) => entropy - (n / total) * Math.log2(n / total),
    0,
  );
}
/**
 * Levenshtein edit distance between two strings.
 * Classic dynamic program reduced to one rolling row plus a tracked
 * diagonal value, so memory is O(|b|) instead of O(|a|*|b|).
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
export function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // row[j] holds the distance between a[0..i) and b[0..j).
  const row = Array.from({ length: b.length + 1 }, (_, j) => j);
  for (let i = 1; i <= a.length; i++) {
    let diagonal = row[0]; // row[j-1] value from the previous i-iteration
    row[0] = i;
    for (let j = 1; j <= b.length; j++) {
      const above = row[j];
      const substitution = diagonal + (a[i - 1] === b[j - 1] ? 0 : 1);
      row[j] = Math.min(above + 1, row[j - 1] + 1, substitution);
      diagonal = above;
    }
  }
  return row[b.length];
}
/**
 * Split a package name into lowercase tokens on `-` and `_` boundaries.
 * Used by the B7 typosquat token-overlap heuristic. Empty tokens are
 * dropped; single-character tokens are kept (names like `a-b` are real).
 * @param {string} name
 * @returns {string[]}
 */
export function tokenize(name) {
  if (!name) return [];
  const pieces = name.toLowerCase().split(/[-_]+/);
  // filter(Boolean) drops the empty strings produced by leading/trailing
  // separators.
  return pieces.filter(Boolean);
}
/**
 * Token-overlap ratio between two package names: |intersection| divided by
 * the size of the smaller token set. Returns 0 if either side has no tokens.
 *
 * Examples:
 *   tokenOverlap('lodash-utils', 'lodash')        → 1.0
 *   tokenOverlap('react-router-dom', 'react')     → 1.0
 *   tokenOverlap('react-helper', 'react-router')  → 0.5
 *   tokenOverlap('foo', 'bar')                    → 0.0
 *
 * Used by B7 (v7.2.0) as a complementary signal alongside Levenshtein —
 * edit distance <=2 catches small typos; token overlap catches
 * popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0..1
 */
export function tokenOverlap(a, b) {
  const setA = new Set(tokenize(a));
  const setB = new Set(tokenize(b));
  if (setA.size === 0 || setB.size === 0) return 0;
  const shared = [...setA].filter((t) => setB.has(t)).length;
  return shared / Math.min(setA.size, setB.size);
}
/**
 * Suspicious suffix tokens commonly used by typosquats to dress up a
 * popular package name. Module-level for B7 reuse.
 *
 * Excluded by design (would conflict with the v7.0.0 typosquat allowlist
 * or trigger false positives on legitimate packages):
 * - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes used by many
 *   legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
 *   v7.0.0 allowlist contains `tsx` directly; including the same token
 *   in the suspicious set would create an internal contradiction.
 * - `pro` — too common as a legitimate edition marker (`vue-pro`,
 *   `tailwindcss-pro`).
 *
 * Kept tokens are the unambiguous typosquat suffixes: utility/helper
 * dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
 *
 * Frozen: consumers must treat this as read-only reference data.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  // utility/helper dressing
  'utils', 'util', 'helper', 'helpers', 'core', 'plus', 'extra', 'extras',
  // executable/tooling qualifiers
  'bin', 'cli', 'tool', 'tools',
  // wrapper/shim and distribution qualifiers
  'wrapper', 'wrappers', 'lib', 'libs', 'kit', 'sdk', 'shim',
]);
/**
 * Check if a string looks like base64-encoded data.
 *
 * Heuristic: at least 20 base64-alphabet characters, optionally followed
 * by `=` padding. Valid base64 never carries more than two padding
 * characters, so the padding quantifier is {0,2} — the previous {0,3}
 * accepted impossible `===` tails that no encoder produces.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  if (s.length < 20) return false;
  // Must be base64 alphabet chars, optionally ending with = padding.
  return /^[A-Za-z0-9+/]{20,}={0,2}$/.test(s);
}
// Optional 0x prefix followed by at least 32 hex digits.
const HEX_BLOB_RE = /^(0x)?[0-9a-fA-F]{32,}$/;

/**
 * Check if a string looks like a hex-encoded blob.
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  return s.length >= 32 && HEX_BLOB_RE.test(s);
}
/**
 * Redact a string for safe display — keep the first `showStart` and last
 * `showEnd` characters, eliding the middle with `...`. Strings too short
 * to benefit (length <= showStart + showEnd + 3) are returned untouched.
 * @param {string} s
 * @param {number} [showStart=8]
 * @param {number} [showEnd=4]
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  const keepWhole = s.length <= showStart + showEnd + 3;
  return keepWhole ? s : `${s.slice(0, showStart)}...${s.slice(-showEnd)}`;
}
/**
 * Extract string literals from a line of code.
 * Handles single-quoted, double-quoted, and backtick strings, including
 * backslash-escaped characters inside them. Returns the literal contents
 * (without the surrounding quotes) in source order.
 * @param {string} line
 * @returns {string[]}
 */
export function extractStringLiterals(line) {
  const literal =
    /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
  const found = [];
  for (const m of line.matchAll(literal)) {
    // Exactly one of the three capture groups is defined per match.
    found.push(m[1] ?? m[2] ?? m[3]);
  }
  return found;
}

// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------
/**
 * Decode JavaScript/Unicode escape sequences: \uXXXX and \u{XXXXX}.
 * The braced form is decoded first so its hex digits are never misread
 * as the four-digit form. Code points above U+10FFFF are left as-is.
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  const braced = s.replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, (match, digits) => {
    const codePoint = parseInt(digits, 16);
    if (codePoint > 0x10FFFF) return match; // out of Unicode range — keep
    return String.fromCodePoint(codePoint);
  });
  return braced.replace(/\\u([0-9a-fA-F]{4})/g, (_, digits) =>
    String.fromCodePoint(parseInt(digits, 16)),
  );
}
/**
 * Decode hex escape sequences: \xXX.
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  const hexEscape = /\\x([0-9a-fA-F]{2})/g;
  return s.replace(hexEscape, (_, pair) =>
    String.fromCharCode(parseInt(pair, 16)),
  );
}
/**
 * Decode URL percent-encoding: %XX.
 * Tries decodeURIComponent first; on malformed input, falls back to
 * decoding each well-formed %XX pair individually and leaving everything
 * else untouched.
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  if (!s.includes('%')) return s; // fast path: nothing percent-encoded
  try {
    return decodeURIComponent(s);
  } catch {
    const pair = /%([0-9a-fA-F]{2})/g;
    return s.replace(pair, (_, hex) =>
      String.fromCharCode(parseInt(hex, 16)),
    );
  }
}
/**
 * Attempt to decode a base64 string to UTF-8 text.
 * Returns null when the input does not look like base64, decodes to an
 * empty string, or the decoded bytes are not mostly readable text
 * (less than 80% printable ASCII incl. \n \r \t).
 * @param {string} s
 * @returns {string|null}
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;
  try {
    const text = Buffer.from(s, 'base64').toString('utf-8');
    if (text.length === 0) return null;
    const printableCount = text.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
    return printableCount / text.length >= 0.8 ? text : null;
  } catch {
    return null;
  }
}
/**
 * Decode HTML entities: named (&lt; &gt; &amp; &quot; &apos; plus the
 * punctuation set below), decimal (&#105;), and hex (&#x69;).
 *
 * Fix: the NAMED table keys must be the *encoded* entity strings
 * (`'&lt;'`, not the already-decoded character), otherwise the named
 * lookup in the final replace never matches anything. Lookups are
 * lowercased so the standard capitalized forms (&Tab; &NewLine; &AMP;)
 * decode as well; the matcher regex itself is unchanged.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  if (!s.includes('&')) return s;
  // Entity → replacement. Keys are lowercase; lookup lowercases the match.
  const NAMED = {
    '&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
    '&nbsp;': ' ', '&tab;': '\t', '&newline;': '\n',
    '&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
    '&lcub;': '{', '&rcub;': '}', '&sol;': '/', '&bsol;': '\\',
    '&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
    '&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
    '&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
    '&verbar;': '|', '&tilde;': '~', '&grave;': '`', '&hat;': '^',
    '&lowbar;': '_', '&at;': '@', '&dollar;': '$',
  };
  return s
    .replace(/&#x([0-9a-fA-F]{1,6});/g, (_, hex) => {
      const cp = parseInt(hex, 16);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/&#(\d{1,7});/g, (_, dec) => {
      const cp = parseInt(dec, 10);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/&[a-zA-Z]{2,8};/g, (entity) => NAMED[entity.toLowerCase()] ?? entity);
}
/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 * Only collapses runs of 4+ single letters separated by spaces/tabs;
 * minimum 4 letters to avoid false positives on normal text.
 *
 * Fix: the separator is now `[ \t]+` — the previous pattern only matched
 * a single literal space, so tab-separated ("i\tg\tn\to\tr\te") and
 * double-spaced evasion slipped through even though the doc promised
 * "spaces/tabs".
 *
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // 4+ single-letter tokens, each separated by one or more spaces/tabs.
  return s.replace(/\b[a-zA-Z](?:[ \t]+[a-zA-Z]){3,}\b/g, (run) =>
    run.replace(/[ \t]+/g, ''),
  );
}

// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps kat. 1
// ---------------------------------------------------------------------------
/**
 * Decode Unicode Tags steganography: U+E0001-E007F → ASCII.
 * The Tag block (U+E0000) can smuggle invisible ASCII inside apparently
 * empty or normal-looking strings, e.g. U+E0069 U+E0067 U+E006E → "ign".
 * Each tag character maps to ASCII by subtracting 0xE0000; every other
 * character passes through unchanged, in order.
 *
 * **Note (E1, v7.2.0):** Private Use Area characters (PUA-A:
 * U+F0000-FFFFD; PUA-B: U+100000-10FFFD) are flagged as hidden Unicode by
 * `containsUnicodeTags`, but they have NO standard ASCII mapping — they
 * pass through this function unchanged. Presence detection (HIGH advisory
 * in scanForInjection) is sufficient; no decode is needed.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  let out = '';
  for (const ch of s) {
    const cp = ch.codePointAt(0);
    const isTag = cp >= 0xE0001 && cp <= 0xE007F;
    out += isTag ? String.fromCharCode(cp - 0xE0000) : ch;
  }
  return out;
}
/**
 * Check if a string contains hidden-Unicode characters that are commonly
 * used for steganography in prompts and tool output.
 *
 * Covered ranges:
 *   - U+E0001-E007F   Unicode Tag block (DeepMind traps kat. 1)
 *   - U+F0000-FFFFD   Supplementary Private Use Area-A (E1, v7.2.0)
 *   - U+100000-10FFFD Supplementary Private Use Area-B (E1, v7.2.0)
 *
 * Any such character is suspicious regardless of decoded content — they
 * are invisible in most terminals and survive normalization. The name
 * `containsUnicodeTags` is preserved for back-compat (existing call sites
 * in injection-patterns.mjs and elsewhere); semantically it is now
 * "containsHiddenUnicode". Tag-block characters decode to ASCII via
 * `decodeUnicodeTags`; PUA characters are detection-only.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  // Single pass via a u-flag character class covering all three ranges.
  return /[\u{E0001}-\u{E007F}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/u.test(s);
}

// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------
/**
 * Strip BIDI override/isolate control characters that can reorder text
 * visually: U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO),
 * U+202E (RLO), U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
 * These can hide injection by making text render differently than it
 * parses.
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  const bidiControls = /[\u202A-\u202E\u2066-\u2069]/g;
  return s.replace(bidiControls, '');
}

// ---------------------------------------------------------------------------
// Homoglyph folding (E16, v7.2.0)
// ---------------------------------------------------------------------------
/**
 * Confusable mapping — characters that LOOK like Latin letters but are
 * different codepoints (most commonly Cyrillic and Greek). Surgical map
 * focused on letters that appear in injection vocabulary
 * (`ignore`, `system`, `you are`, `assistant`, `tool`, `response`).
 *
 * Excluded by design:
 * - Latin Extended characters (æ, ø, å, é, è, ñ, ü, ö, ä, ç, ß, þ, ð, etc.)
 *   — these are legitimate letters in Norwegian, German, Danish, Spanish,
 *   French, Icelandic, etc., and would generate false positives in
 *   non-English source code or documentation.
 * - Greek letters that don't visually overlap with Latin (`β`, `γ`, `δ`, ...)
 * - Cyrillic letters that don't visually overlap (`б`, `г`, `д`, `ж`, ...)
 * - Mathematical alphanumeric symbols (the U+1D400 block) — covered by
 *   NFKC normalization in `foldHomoglyphs` itself.
 *
 * The map is deliberately small (~25 entries). Adding more risks
 * false-positive escalation on benign multilingual content.
 *
 * NOTE: keys are raw non-ASCII literals; this file must stay UTF-8.
 */
const HOMOGLYPH_MAP = Object.freeze({
  // Cyrillic → Latin (lowercase)
  'а': 'a', // U+0430
  'е': 'e', // U+0435
  'о': 'o', // U+043E
  'с': 'c', // U+0441
  'р': 'p', // U+0440
  'х': 'x', // U+0445
  'у': 'y', // U+0443
  'і': 'i', // U+0456 (Ukrainian)
  'ј': 'j', // U+0458
  'ѕ': 's', // U+0455
  'ӏ': 'l', // U+04CF (Cyrillic Palochka)
  // Cyrillic → Latin (uppercase)
  'А': 'A', // U+0410
  'Е': 'E', // U+0415
  'О': 'O', // U+041E
  'С': 'C', // U+0421
  'Р': 'P', // U+0420
  'Х': 'X', // U+0425
  'У': 'Y', // U+0423
  // Greek → Latin (only the unambiguous Latin-look-alikes)
  'α': 'a', // U+03B1
  'ο': 'o', // U+03BF
  'ρ': 'p', // U+03C1
  'ι': 'i', // U+03B9
  'ν': 'v', // U+03BD
  'τ': 't', // U+03C4
  // Greek uppercase
  'Α': 'A', // U+0391
  'Ο': 'O', // U+039F
  'Ρ': 'P', // U+03A1
  'Τ': 'T', // U+03A4
});
/**
 * NOTE(review): this JSDoc documents `foldHomoglyphs`, which is now defined
 * below `rot13` — the rot13 helper was inserted between this comment and
 * its function. Move this block directly above `foldHomoglyphs`.
 *
 * Fold visually-confusable characters to their Latin look-alikes. Used by
 * E16 (v7.2.0) to neutralize homoglyph-substitution injection attacks
 * before pattern matching.
 *
 * Pipeline:
 *   1. NFKC normalize — collapses Mathematical Alphanumeric (U+1D400),
 *      width variants, ligatures, and other compatibility decompositions.
 *   2. Apply HOMOGLYPH_MAP — Cyrillic/Greek look-alikes → Latin.
 *
 * Idempotent: `foldHomoglyphs(foldHomoglyphs(s)) === foldHomoglyphs(s)`.
 *
 * Norwegian/Polish/German/etc. text is NOT affected — characters like
 * æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP.
 *
 * Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is
 * a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys.
 * scanForInjection calls this on every scan; the fast-path keeps the
 * common-case overhead near zero.
 *
 * @param {string} s
 * @returns {string}
 */
/**
 * Apply rot13 (Caesar shift by 13) to ASCII letters; everything else
 * passes through unchanged. The transform is its own inverse.
 *
 * Used by E3 comment-block injection detection: attackers sometimes hide
 * imperative phrases ("ignore previous instructions") in rot13 inside
 * code comments. normalizeForScan() does not apply rot13, so this layer
 * is added explicitly to the variantSet in scanForInjection().
 *
 * @param {string} s
 * @returns {string}
 */
export function rot13(s) {
  if (!s) return s;
  return s.replace(/[A-Za-z]/g, (letter) => {
    const base = letter <= 'Z' ? 65 : 97; // 'A' or 'a'
    const code = letter.charCodeAt(0);
    return String.fromCharCode(((code - base + 13) % 26) + base);
  });
}
/**
 * Fold visually-confusable characters to their Latin look-alikes (E16,
 * v7.2.0): NFKC-normalize, then map Cyrillic/Greek look-alikes to Latin
 * via HOMOGLYPH_MAP. Idempotent. Pure-ASCII input returns immediately —
 * NFKC is identity on ASCII and the map has no ASCII keys, so the fast
 * path keeps common-case overhead near zero.
 * @param {string} s
 * @returns {string}
 */
export function foldHomoglyphs(s) {
  if (!s) return s;
  // Fast path: any UTF-16 unit above 0x7F means non-ASCII content.
  if (!/[^\x00-\x7F]/.test(s)) return s;
  const folded = [];
  for (const ch of s.normalize('NFKC')) {
    folded.push(HOMOGLYPH_MAP[ch] || ch);
  }
  return folded.join('');
}
/**
 * Normalize a string by decoding all known obfuscation layers.
 * Runs up to 3 iterations to catch multi-layered encoding (e.g. base64 of
 * URL-encoded text). Order per iteration: HTML entities → unicode escapes
 * → hex escapes → URL encoding → base64. Unicode Tags decoding and BIDI
 * stripping run once up front; letter-spacing collapse runs once at the
 * end.
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  const MAX_ITERATIONS = 3;

  // Pre-pass: reveal tag-block steganography, drop BIDI controls.
  let text = stripBidiOverrides(decodeUnicodeTags(s));

  for (let pass = 0; pass < MAX_ITERATIONS; pass++) {
    const before = text;
    text = decodeHtmlEntities(text);
    text = decodeUnicodeEscapes(text);
    text = decodeHexEscapes(text);
    text = decodeUrlEncoding(text);
    // tryDecodeBase64 returns null (never '') when nothing decodes.
    text = tryDecodeBase64(text) ?? text;
    if (text === before) break; // fixed point — nothing left to decode
  }

  // Post-pass: undo "i g n o r e"-style letter-spacing evasion.
  return collapseLetterSpacing(text);
}