// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.

/**
 * Shannon entropy of a string (bits per character).
 *
 * Characters are Unicode code points (the unit yielded by string iteration),
 * and probabilities are taken over the number of code points, so astral
 * characters such as emoji count once rather than as two UTF-16 units.
 *
 * @param {string} s
 * @returns {number} 0 for the empty string or a string of one repeated symbol
 */
export function shannonEntropy(s) {
  if (s.length === 0) return 0;

  const freq = new Map();
  let total = 0;
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) ?? 0) + 1);
    total++;
  }

  let H = 0;
  for (const count of freq.values()) {
    // Divide by the code-point count, not s.length, so that sum(p) === 1.
    const p = count / total;
    H -= p * Math.log2(p);
  }
  return H;
}

/**
 * Levenshtein edit distance between two strings (compared by UTF-16 unit).
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} minimum number of insertions, deletions and substitutions
 */
export function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const m = a.length;
  const n = b.length;

  // Two-row dynamic programming: only the previous and the current row of
  // the distance matrix are kept, and they are swapped after each row.
  let prev = new Array(n + 1);
  let curr = new Array(n + 1);

  for (let j = 0; j <= n; j++) prev[j] = j;

  for (let i = 1; i <= m; i++) {
    curr[0] = i;
    for (let j = 1; j <= n; j++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      curr[j] = Math.min(
        prev[j] + 1, // deletion
        curr[j - 1] + 1, // insertion
        prev[j - 1] + cost // substitution
      );
    }
    [prev, curr] = [curr, prev];
  }

  // After the final swap the last computed row lives in `prev`.
  return prev[n];
}

/**
 * Split a package name into lowercase tokens on `-` and `_` boundaries.
 * Used by the B7 typosquat token-overlap heuristic. Empty tokens are
 * dropped. Single-character tokens are kept (some package names like
 * `a-b` are real).
 *
 * @param {string} name
 * @returns {string[]}
 */
export function tokenize(name) {
  if (!name) return [];
  return name
    .toLowerCase()
    .split(/[-_]+/)
    .filter((t) => t.length > 0);
}

/**
 * Token-overlap ratio between two package names. Returns the size of the
 * intersection divided by the size of the smaller token set. Returns 0 if
 * either input is empty.
 *
 * Example: `tokenOverlap('lodash-utils', 'lodash')` → 1.0
 *          `tokenOverlap('react-router-dom', 'react')` → 1.0
 *          `tokenOverlap('react-helper', 'react-router')` → 0.5
 *          `tokenOverlap('foo', 'bar')` → 0.0
 *
 * Used by B7 (v7.2.0) as a complementary signal alongside Levenshtein —
 * Levenshtein <=2 catches small typos; token-overlap catches
 * popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0..1
 */
export function tokenOverlap(a, b) {
  const ta = new Set(tokenize(a));
  const tb = new Set(tokenize(b));
  if (ta.size === 0 || tb.size === 0) return 0;

  let intersection = 0;
  for (const t of ta) {
    if (tb.has(t)) intersection++;
  }
  return intersection / Math.min(ta.size, tb.size);
}

/**
 * Suspicious suffix tokens commonly used by typosquats to dress up a
 * popular package name. Module-level for B7 reuse.
 *
 * Excluded by design (would conflict with the v7.0.0 typosquat allowlist
 * or trigger false positives on legitimate packages):
 *   - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes used by many
 *     legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
 *     v7.0.0 allowlist contains `tsx` directly; including the same token
 *     in the suspicious set would create an internal contradiction.
 *   - `pro` — too common as a legitimate edition marker (`vue-pro`,
 *     `tailwindcss-pro`).
 *
 * Kept tokens are the unambiguous typosquat suffixes: utility/helper
 * dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  'utils', 'util', 'helper', 'helpers',
  'core', 'plus', 'extra', 'extras',
  'bin', 'cli', 'tool', 'tools',
  'wrapper', 'wrappers', 'lib', 'libs',
  'kit', 'sdk', 'shim',
]);

/**
 * Check if a string looks like base64-encoded data: at least 20 characters
 * from the standard base64 alphabet, optionally followed by up to three `=`.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  if (s.length < 20) return false;
  // Must be entirely base64 chars and optionally end with =
  return /^[A-Za-z0-9+/]{20,}={0,3}$/.test(s);
}

/**
 * Check if a string looks like a hex-encoded blob: 32 or more hex digits,
 * with an optional `0x` prefix.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  if (s.length < 32) return false;
  return /^(0x)?[0-9a-fA-F]{32,}$/.test(s);
}

/**
 * Redact a string for safe display — show the first `showStart` and the last
 * `showEnd` characters with `...` in between. Strings that are not longer
 * than `showStart + showEnd + 3` are returned unchanged.
 *
 * @param {string} s
 * @param {number} [showStart=8]
 * @param {number} [showEnd=4]
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  if (s.length <= showStart + showEnd + 3) return s;
  // `s.slice(-0)` is `s.slice(0)` and would append the entire secret, so a
  // zero-length tail has to be handled explicitly.
  const tail = showEnd > 0 ? s.slice(-showEnd) : '';
  return `${s.slice(0, showStart)}...${tail}`;
}

/**
 * Extract string literals from a line of code.
 * Handles single-quoted, double-quoted, and backtick strings, including
 * backslash-escaped characters inside them. The returned values are the raw
 * literal contents without the surrounding quotes (escapes are not decoded).
 *
 * @param {string} line
 * @returns {string[]}
 */
export function extractStringLiterals(line) {
  const results = [];
  const regex = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
  let match;
  while ((match = regex.exec(line)) !== null) {
    // Exactly one of the three capture groups participates per match.
    results.push(match[1] ?? match[2] ?? match[3]);
  }
  return results;
}

// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------

/**
 * Decode JavaScript/Unicode escape sequences: \uXXXX and \u{XXXXX}.
 * Braced escapes above U+10FFFF are left untouched.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  return s
    .replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, (_, hex) => {
      const cp = parseInt(hex, 16);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) =>
      String.fromCodePoint(parseInt(hex, 16))
    );
}

/**
 * Decode hex escape sequences: \xXX.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  return s.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) =>
    String.fromCharCode(parseInt(hex, 16))
  );
}

/**
 * Decode URL percent-encoding: %XX.
 * Uses decodeURIComponent with fallback for malformed sequences.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  // Fast path: no percent signs means nothing to decode
  if (!s.includes('%')) return s;
  try {
    return decodeURIComponent(s);
  } catch {
    // Malformed sequences — best effort: decode each well-formed %XX pair
    // to the character with that code and leave everything else as-is.
    return s.replace(/%([0-9a-fA-F]{2})/g, (_, hex) =>
      String.fromCharCode(parseInt(hex, 16))
    );
  }
}

/**
 * Attempt to decode a base64 string to UTF-8 text.
 * Returns null if the input is not base64-like or the decoded result is not
 * readable text (fewer than 80% printable ASCII / newline / tab characters).
 *
 * @param {string} s
 * @returns {string|null}
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;
  try {
    const decoded = Buffer.from(s, 'base64').toString('utf-8');
    // Check if result is mostly printable text (>= 80% printable ASCII)
    const printable = decoded.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
    if (decoded.length === 0 || printable / decoded.length < 0.8) return null;
    return decoded;
  } catch {
    return null;
  }
}

/**
 * Named HTML entities understood by decodeHtmlEntities, keyed by the full
 * `&name;` token. `&nbsp;`, `&minus;` and `&tilde;` are deliberately folded
 * to their plain-ASCII look-alikes. The lower-case `&tab;`, `&newline;` and
 * `&at;` spellings are accepted next to the standard `&Tab;`, `&NewLine;`
 * and `&commat;`; both standard spellings of the bracket, brace and
 * vertical-bar entities are accepted as well.
 */
const HTML_NAMED_ENTITIES = new Map([
  ['&lt;', '<'], ['&gt;', '>'], ['&amp;', '&'], ['&quot;', '"'], ['&apos;', "'"],
  ['&nbsp;', ' '],
  ['&tab;', '\t'], ['&Tab;', '\t'],
  ['&newline;', '\n'], ['&NewLine;', '\n'],
  ['&lpar;', '('], ['&rpar;', ')'],
  ['&lsqb;', '['], ['&rsqb;', ']'], ['&lbrack;', '['], ['&rbrack;', ']'],
  ['&lcub;', '{'], ['&rcub;', '}'], ['&lbrace;', '{'], ['&rbrace;', '}'],
  ['&sol;', '/'], ['&bsol;', '\\'],
  ['&colon;', ':'], ['&semi;', ';'], ['&comma;', ','], ['&period;', '.'],
  ['&excl;', '!'], ['&quest;', '?'], ['&num;', '#'], ['&percnt;', '%'],
  ['&equals;', '='], ['&plus;', '+'], ['&minus;', '-'], ['&ast;', '*'],
  ['&vert;', '|'], ['&verbar;', '|'],
  ['&tilde;', '~'], ['&grave;', '`'], ['&Hat;', '^'], ['&lowbar;', '_'],
  ['&at;', '@'], ['&commat;', '@'],
  ['&dollar;', '$'],
]);

/**
 * Decode HTML entities: named (&lt; &gt; &amp; &quot; &apos; …),
 * decimal (&#105;), and hex (&#x69;).
 *
 * Numeric references above U+10FFFF and unknown named entities are left
 * unchanged. Only one layer is decoded per call (`&amp;lt;` becomes `&lt;`).
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  if (!s.includes('&')) return s;

  return s
    .replace(/&#x([0-9a-fA-F]{1,6});/g, (_, hex) => {
      const cp = parseInt(hex, 16);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/&#(\d{1,7});/g, (_, dec) => {
      const cp = parseInt(dec, 10);
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
    })
    .replace(/&[a-zA-Z]{2,8};/g, (entity) => HTML_NAMED_ENTITIES.get(entity) ?? entity);
}

/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 * Only collapses runs of single letters separated by one or more
 * spaces/tabs. Minimum 4 letters to avoid false positives on normal text.
 *
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // A single letter at a word boundary followed by 3+ "(gap, single letter)"
  // pairs, where a gap is any run of spaces and/or tabs.
  return s.replace(/\b[a-zA-Z](?:[ \t]+[a-zA-Z]){3,}\b/g, (match) =>
    match.replace(/[ \t]+/g, '')
  );
}

// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps kat. 1
// ---------------------------------------------------------------------------

/**
 * Decode Unicode Tags steganography: U+E0001-E007F → ASCII.
 * Unicode Tags (U+E0000 block) can encode invisible ASCII text inside
 * what appears to be empty or normal-looking strings.
 * E.g., U+E0069 U+E0067 U+E006E → "ign"
 *
 * Every tag character is replaced in place by the ASCII character with the
 * same low bits; all other characters are kept unchanged.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  // The `u` flag makes the class match whole code points, not surrogates.
  return s.replace(/[\u{E0001}-\u{E007F}]/gu, (tag) =>
    String.fromCharCode(tag.codePointAt(0) - 0xE0000)
  );
}

/**
 * Check if a string contains Unicode Tag characters (U+E0001-E007F).
 * Presence of these characters is suspicious regardless of decoded content.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  return /[\u{E0001}-\u{E007F}]/u.test(s);
}

// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------

/**
 * Strip BIDI override characters that can reorder text visually.
 * U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
 * U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
 * These can hide injection by making text render differently than it parses.
 *
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  return s.replace(/[\u202A-\u202E\u2066-\u2069]/g, '');
}

/**
 * Normalize a string by decoding all known obfuscation layers.
 *
 * Steps:
 *   1. Once, before the loop: Unicode Tags -> BIDI strip.
 *   2. Up to 3 iterations (to catch multi-layered encoding, e.g. base64 of
 *      URL-encoded text), each running: HTML entities -> unicode escapes ->
 *      hex escapes -> URL encoding -> base64. The loop stops early as soon as
 *      an iteration changes nothing.
 *   3. Once, after the loop: collapse letter-spaced text.
 *
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  let result = s;
  const MAX_ITERATIONS = 3;

  // Pre-decode: Unicode Tags and BIDI overrides (before the main loop)
  result = decodeUnicodeTags(result);
  result = stripBidiOverrides(result);

  for (let i = 0; i < MAX_ITERATIONS; i++) {
    const prev = result;

    result = decodeHtmlEntities(result);
    result = decodeUnicodeEscapes(result);
    result = decodeHexEscapes(result);
    result = decodeUrlEncoding(result);

    // Base64 only applies when the whole string is base64-like and decodes
    // to mostly printable text; otherwise the string is kept as-is.
    const b64decoded = tryDecodeBase64(result);
    if (b64decoded) result = b64decoded;

    // Stable — no further decoding possible
    if (result === prev) break;
  }

  // Post-decode: collapse letter-spaced evasion
  result = collapseLetterSpacing(result);

  return result;
}