// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.

/**
 * Shannon entropy of a string (bits per code point).
 *
 * Frequencies are counted per Unicode code point (`for...of` iteration), so
 * the divisor must also be the code-point count — NOT `s.length`, which
 * counts UTF-16 units. Dividing by `s.length` made the probabilities sum to
 * less than 1 whenever the input contained astral characters (emoji,
 * supplementary PUA — exactly the inputs this scanner cares about), skewing
 * the entropy downward.
 *
 * @param {string} s
 * @returns {number} entropy in bits per code point; 0 for empty input
 */
export function shannonEntropy(s) {
  if (s.length === 0) return 0;
  const freq = new Map();
  let total = 0; // code points, not UTF-16 units
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) || 0) + 1);
    total++;
  }
  let H = 0;
  for (const count of freq.values()) {
    const p = count / total;
    H -= p * Math.log2(p);
  }
  return H;
}

/**
 * Levenshtein edit distance between two strings.
 * Two-row dynamic-programming variant: O(min-memory) instead of the full
 * m×n matrix; `prev` holds row i-1, `curr` is filled as row i, then the
 * two are swapped.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} minimum number of single-character edits
 */
export function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const m = a.length;
  const n = b.length;

  // Single-row optimization
  let prev = new Array(n + 1);
  let curr = new Array(n + 1);
  for (let j = 0; j <= n; j++) prev[j] = j;

  for (let i = 1; i <= m; i++) {
    curr[0] = i;
    for (let j = 1; j <= n; j++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      curr[j] = Math.min(
        prev[j] + 1,      // deletion
        curr[j - 1] + 1,  // insertion
        prev[j - 1] + cost // substitution
      );
    }
    [prev, curr] = [curr, prev];
  }
  // After the final swap, `prev` is the last computed row.
  return prev[n];
}

/**
 * Split a package name into lowercase tokens on `-` and `_` boundaries.
 * Used by the B7 typosquat token-overlap heuristic. Empty tokens are
 * dropped. Single-character tokens are kept (some package names like
 * `a-b` are real).
 *
 * @param {string} name
 * @returns {string[]}
 */
export function tokenize(name) {
  if (!name) return [];
  return name
    .toLowerCase()
    .split(/[-_]+/)
    .filter(t => t.length > 0);
}

/**
 * Token-overlap ratio between two package names. Returns the size of the
 * intersection divided by the size of the smaller token set. Returns 0 if
 * either input is empty.
 *
 * Example: `tokenOverlap('lodash-utils', 'lodash')` → 1.0
 *          `tokenOverlap('react-router-dom', 'react')` → 1.0
 *          `tokenOverlap('react-helper', 'react-router')` → 0.5
 *          `tokenOverlap('foo', 'bar')` → 0.0
 *
 * Used by B7 (v7.2.0) as a complementary signal alongside Levenshtein —
 * Levenshtein <=2 catches small typos; token-overlap catches
 * popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0..1
 */
export function tokenOverlap(a, b) {
  const ta = new Set(tokenize(a));
  const tb = new Set(tokenize(b));
  if (ta.size === 0 || tb.size === 0) return 0;
  let intersection = 0;
  for (const t of ta) if (tb.has(t)) intersection++;
  return intersection / Math.min(ta.size, tb.size);
}

/**
 * Suspicious suffix tokens commonly used by typosquats to dress up a
 * popular package name. Module-level for B7 reuse.
 *
 * Excluded by design (would conflict with the v7.0.0 typosquat allowlist
 * or trigger false positives on legitimate packages):
 * - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes used by many
 *   legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
 *   v7.0.0 allowlist contains `tsx` directly; including the same token
 *   in the suspicious set would create an internal contradiction.
 * - `pro` — too common as a legitimate edition marker (`vue-pro`,
 *   `tailwindcss-pro`).
 *
 * Kept tokens are the unambiguous typosquat suffixes: utility/helper
 * dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  'utils', 'util', 'helper', 'helpers',
  'core', 'plus', 'extra', 'extras',
  'bin', 'cli', 'tool', 'tools',
  'wrapper', 'wrappers', 'lib', 'libs',
  'kit', 'sdk', 'shim',
]);

/**
 * Check if a string looks like base64-encoded data.
 * Heuristic: at least 20 base64-alphabet characters, optionally padded.
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  if (s.length < 20) return false;
  // Valid base64 padding is at most TWO '=' characters; `={0,3}` would
  // accept strings no conforming encoder ever emits.
  return /^[A-Za-z0-9+/]{20,}={0,2}$/.test(s);
}

/**
 * Check if a string looks like a hex-encoded blob.
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  if (s.length < 32) return false;
  return /^(0x)?[0-9a-fA-F]{32,}$/.test(s);
}

/**
 * Redact a string for safe display — show first 8 and last 4 chars.
 * Strings too short to hide anything (<= showStart + showEnd + 3) are
 * returned unchanged.
 * @param {string} s
 * @param {number} [showStart=8]
 * @param {number} [showEnd=4]
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  if (s.length <= showStart + showEnd + 3) return s;
  return `${s.slice(0, showStart)}...${s.slice(-showEnd)}`;
}

/**
 * Extract string literals from a line of code.
 * Handles single-quoted, double-quoted, and backtick strings, including
 * escaped quote characters inside them. Empty literals are kept (the `??`
 * chain only skips undefined groups, not empty-string matches).
 * @param {string} line
 * @returns {string[]} literal contents without their surrounding quotes
 */
export function extractStringLiterals(line) {
  const results = [];
  const regex = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
  let match;
  while ((match = regex.exec(line)) !== null) {
    results.push(match[1] ?? match[2] ?? match[3]);
  }
  return results;
}

// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------

/**
 * Decode JavaScript/Unicode escape sequences: \uXXXX and \u{XXXXX}.
 * The braced form runs first so `\u{1F600}` is not half-eaten by the
 * four-digit rule. Code points above U+10FFFF are left as-is.
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  return s
    .replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, (m, hex) => {
      const cp = parseInt(hex, 16);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : m;
    })
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) =>
      String.fromCodePoint(parseInt(hex, 16))
    );
}

/**
 * Decode hex escape sequences: \xXX.
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  return s.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) =>
    String.fromCharCode(parseInt(hex, 16))
  );
}

/**
 * Decode URL percent-encoding: %XX.
 * Uses decodeURIComponent with fallback for malformed sequences.
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  // Fast path: no percent signs means nothing to decode
  if (!s.includes('%')) return s;
  try {
    return decodeURIComponent(s);
  } catch {
    // Malformed sequences — best-effort: decode each valid %XX pair as a
    // raw byte and leave the malformed ones untouched.
    return s.replace(/%([0-9a-fA-F]{2})/g, (_, hex) =>
      String.fromCharCode(parseInt(hex, 16))
    );
  }
}

/**
 * Attempt to decode a base64 string to UTF-8 text.
 * Returns null if the input is not base64-like or decoded result is not readable text.
 * @param {string} s
 * @returns {string|null}
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;
  try {
    const decoded = Buffer.from(s, 'base64').toString('utf-8');
    // Check if result is mostly printable text (>= 80% printable ASCII)
    const printable = decoded.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
    if (decoded.length === 0 || printable / decoded.length < 0.8) return null;
    return decoded;
  } catch {
    return null;
  }
}

/**
 * Named HTML entities recognized by decodeHtmlEntities. Keys are the
 * lowercase entity spellings; lookups lowercase the match so case variants
 * (`&LT;`, `&Tab;`, `&NewLine;`) decode too. Module-level so the map is
 * built once, not per call (normalizeForScan is a hot path).
 *
 * Includes the punctuation entities attackers use to smuggle code
 * characters (`&lpar;`, `&sol;`, `&equals;`, ...). `&at;` is kept for
 * back-compat with the previous map alongside the standard `&commat;`.
 */
const HTML_NAMED_ENTITIES = Object.freeze({
  '&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
  '&nbsp;': ' ', '&tab;': '\t', '&newline;': '\n',
  '&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
  '&lbrace;': '{', '&rbrace;': '}', '&sol;': '/', '&bsol;': '\\',
  '&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
  '&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
  '&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
  '&vert;': '|', '&tilde;': '~', '&grave;': '`', '&hat;': '^',
  '&lowbar;': '_', '&at;': '@', '&commat;': '@', '&dollar;': '$',
});

/**
 * Decode HTML entities: named (&lt; &gt; &amp; &quot; &apos; ...),
 * decimal (&#105;), and hex (&#x69;).
 *
 * Fix: the previous NAMED map's keys had themselves been entity-decoded
 * (e.g. `'<': '<'`), so the `&name;` lookup could never match and named
 * entities were silently passed through. Keys are now the literal entity
 * spellings (see HTML_NAMED_ENTITIES).
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  if (!s.includes('&')) return s;
  return s
    .replace(/&#x([0-9a-fA-F]{1,6});/g, (m, hex) => {
      const cp = parseInt(hex, 16);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : m;
    })
    .replace(/&#(\d{1,7});/g, (m, dec) => {
      const cp = parseInt(dec, 10);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : m;
    })
    .replace(/&[a-zA-Z]{2,8};/g, (entity) =>
      HTML_NAMED_ENTITIES[entity.toLowerCase()] ?? entity
    );
}

/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 * Only collapses runs of single letters separated by a single space or tab
 * each. Minimum 4 letters to avoid false positives on normal text.
 * (Fix: the previous regex handled spaces only, contradicting the
 * documented "spaces/tabs" contract.)
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // Match 4+ single-letter tokens: (letter sep){3,} letter
  return s.replace(/\b(?:[a-zA-Z][ \t]){3,}[a-zA-Z]\b/g, (match) =>
    match.replace(/[ \t]/g, '')
  );
}

// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps kat. 1
// ---------------------------------------------------------------------------

/**
 * Decode Unicode Tags steganography: U+E0001-E007F → ASCII.
 * Unicode Tags (U+E0000 block) can encode invisible ASCII text inside
 * what appears to be empty or normal-looking strings.
 * E.g., U+E0069 U+E0067 U+E006E → "ign"
 *
 * **Note (E1, v7.2.0):** Tag-block characters decode to ASCII via the
 * `cp - 0xE0000` mapping. Private Use Areas (PUA-A: U+F0000-FFFFD;
 * PUA-B: U+100000-10FFFD) are also detected as hidden Unicode by
 * `containsUnicodeTags`, but they have NO standard ASCII mapping —
 * they pass through this function unchanged. Detection of PUA presence
 * is sufficient (HIGH advisory in scanForInjection), no decode needed.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  // Each tag character maps 1:1 to ASCII; every other character passes
  // through unchanged, so a single per-character pass preserves the
  // position of every decoded run.
  let result = '';
  for (const ch of s) {
    const cp = ch.codePointAt(0);
    result += (cp >= 0xE0001 && cp <= 0xE007F)
      ? String.fromCharCode(cp - 0xE0000) // tag char → ASCII
      : ch;
  }
  return result;
}

/**
 * Check if a string contains hidden-Unicode characters that are commonly
 * used for steganography in prompts and tool output.
 *
 * Covered ranges:
 * - U+E0001-E007F  Unicode Tag block (DeepMind traps kat. 1)
 * - U+F0000-FFFFD  Supplementary Private Use Area-A (E1, v7.2.0)
 * - U+100000-10FFFD Supplementary Private Use Area-B (E1, v7.2.0)
 *
 * Presence of any of these characters is suspicious regardless of
 * decoded content — they are invisible in most terminals and survive
 * normalization. The function name `containsUnicodeTags` is preserved
 * for back-compat (existing call sites in injection-patterns.mjs and
 * elsewhere); semantically it is now "containsHiddenUnicode".
 *
 * Tag-block characters decode to ASCII via `decodeUnicodeTags`. PUA
 * characters do NOT — they have no standard mapping and remain
 * detection-only.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  for (const ch of s) {
    const cp = ch.codePointAt(0);
    if (cp >= 0xE0001 && cp <= 0xE007F) return true;   // Tag block
    if (cp >= 0xF0000 && cp <= 0xFFFFD) return true;   // PUA-A (E1)
    if (cp >= 0x100000 && cp <= 0x10FFFD) return true; // PUA-B (E1)
  }
  return false;
}

// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------

/**
 * Strip BIDI override characters that can reorder text visually.
 * U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
 * U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
 * These can hide injection by making text render differently than it parses.
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  return s.replace(/[\u202A-\u202E\u2066-\u2069]/g, '');
}

// ---------------------------------------------------------------------------
// Homoglyph folding (E16, v7.2.0)
// ---------------------------------------------------------------------------

/**
 * Confusable mapping — characters that LOOK like Latin letters but are
 * different codepoints (most commonly Cyrillic and Greek). Surgical map
 * focused on letters that appear in injection vocabulary
 * (`ignore`, `system`, `you are`, `assistant`, `tool`, `response`).
 *
 * Excluded by design:
 * - Latin Extended characters (æ, ø, å, é, è, ñ, ü, ö, ä, ç, ß, þ, ð, etc.)
 *   — these are legitimate letters in Norwegian, German, Danish, Spanish,
 *   French, Icelandic, etc., and would generate false positives in
 *   non-English source code or documentation.
 * - Greek letters that don't visually overlap with Latin (`β`, `γ`, `δ`, ...)
 * - Cyrillic letters that don't visually overlap (`б`, `г`, `д`, `ж`, ...)
 * - Mathematical alphanumeric symbols (the U+1D400 block) — covered by
 *   NFKC normalization in `foldHomoglyphs` itself.
 *
 * The map is deliberately small (~30 entries). Adding more risks
 * false-positive escalation on benign multilingual content.
 */
const HOMOGLYPH_MAP = Object.freeze({
  // Cyrillic → Latin (lowercase)
  'а': 'a', // U+0430
  'е': 'e', // U+0435
  'о': 'o', // U+043E
  'с': 'c', // U+0441
  'р': 'p', // U+0440
  'х': 'x', // U+0445
  'у': 'y', // U+0443
  'і': 'i', // U+0456 (Ukrainian)
  'ј': 'j', // U+0458
  'ѕ': 's', // U+0455
  'ӏ': 'l', // U+04CF (Cyrillic Palochka)
  // Cyrillic → Latin (uppercase)
  'А': 'A', // U+0410
  'Е': 'E', // U+0415
  'О': 'O', // U+041E
  'С': 'C', // U+0421
  'Р': 'P', // U+0420
  'Х': 'X', // U+0425
  'У': 'Y', // U+0423
  // Greek → Latin (only the unambiguous Latin-look-alikes)
  'α': 'a', // U+03B1
  'ο': 'o', // U+03BF
  'ρ': 'p', // U+03C1
  'ι': 'i', // U+03B9
  'ν': 'v', // U+03BD
  'τ': 't', // U+03C4
  // Greek uppercase
  'Α': 'A', // U+0391
  'Ο': 'O', // U+039F
  'Ρ': 'P', // U+03A1
  'Τ': 'T', // U+03A4
});

/**
 * Fold visually-confusable characters to their Latin look-alikes. Used by
 * E16 (v7.2.0) to neutralize homoglyph-substitution injection attacks
 * before pattern matching.
 *
 * Pipeline:
 * 1. NFKC normalize — collapses Mathematical Alphanumeric (U+1D400),
 *    width variants, ligatures, and other compatibility decompositions.
 * 2. Apply HOMOGLYPH_MAP — Cyrillic/Greek look-alikes → Latin.
 *
 * Idempotent: `foldHomoglyphs(foldHomoglyphs(s)) === foldHomoglyphs(s)`.
 *
 * Norwegian/Polish/German/etc. text is NOT affected — characters like
 * æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP.
 *
 * Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is
 * a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys.
 * scanForInjection calls this on every scan; the fast-path keeps the
 * common-case overhead near zero.
 *
 * (Fix: this doc comment previously sat above `rot13`, orphaned from the
 * function it documents; blocks are reordered so each JSDoc is attached
 * to its own function.)
 *
 * @param {string} s
 * @returns {string}
 */
export function foldHomoglyphs(s) {
  if (!s) return s;

  // Fast path: pure ASCII has nothing to fold and NFKC is identity.
  // charCodeAt is cheaper than iterating codepoints.
  let asciiOnly = true;
  for (let i = 0; i < s.length; i++) {
    if (s.charCodeAt(i) > 127) { asciiOnly = false; break; }
  }
  if (asciiOnly) return s;

  const normalized = s.normalize('NFKC');
  let out = '';
  for (const ch of normalized) {
    out += HOMOGLYPH_MAP[ch] || ch;
  }
  return out;
}

/**
 * Apply rot13 (Caesar shift by 13) to ASCII letters.
 * Non-letters pass through unchanged. The transform is its own inverse.
 *
 * Used by E3 comment-block injection detection: attackers sometimes hide
 * imperative phrases ("ignore previous instructions") in rot13 inside
 * code comments. normalizeForScan() does not apply rot13, so this layer
 * is added explicitly to the variantSet in scanForInjection().
 *
 * @param {string} s
 * @returns {string}
 */
export function rot13(s) {
  if (!s) return s;
  let out = '';
  for (let i = 0; i < s.length; i++) {
    const c = s.charCodeAt(i);
    if (c >= 65 && c <= 90) out += String.fromCharCode(((c - 65 + 13) % 26) + 65);
    else if (c >= 97 && c <= 122) out += String.fromCharCode(((c - 97 + 13) % 26) + 97);
    else out += s[i];
  }
  return out;
}

/**
 * Normalize a string by decoding all known obfuscation layers.
 * Runs up to 3 iterations to catch multi-layered encoding (e.g., base64 of URL-encoded).
 * Pre-decode (once, before the loop): Unicode Tags -> BIDI strip.
 * Per iteration: HTML entities -> unicode escapes -> hex escapes ->
 * URL encoding -> base64.
 * After decoding: collapse letter-spaced text.
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  let result = s;
  const MAX_ITERATIONS = 3;

  // Pre-decode: Unicode Tags and BIDI overrides (before the main loop) —
  // the textual decoders below never re-introduce these characters.
  result = decodeUnicodeTags(result);
  result = stripBidiOverrides(result);

  for (let i = 0; i < MAX_ITERATIONS; i++) {
    const prev = result;
    result = decodeHtmlEntities(result);
    result = decodeUnicodeEscapes(result);
    result = decodeHexEscapes(result);
    result = decodeUrlEncoding(result);
    const b64decoded = tryDecodeBase64(result);
    if (b64decoded) result = b64decoded;
    // Stable — no further decoding possible
    if (result === prev) break;
  }

  // Post-decode: collapse letter-spaced evasion
  result = collapseLetterSpacing(result);
  return result;
}