ktg-plugin-marketplace/plugins/llm-security/scanners/lib/string-utils.mjs
Kjell Tore Guttormsen 6073952b97 fix(injection): E16 ASCII fast-path + UNI-003 expectation update (v7.2.0)
Two follow-up fixes after E16 + E17 landed:

1. foldHomoglyphs ASCII fast-path
   - scanForInjection calls foldHomoglyphs on every scan (raw + normalized).
   - Pre-fix: NFKC normalization runs unconditionally, even on pure
     ASCII inputs where it's a no-op.
   - Result: benchmark.test.mjs timed out at 120s on the full suite.
   - Fix: charCodeAt sweep for >=128, short-circuit return s when
     all ASCII. NFKC and HOMOGLYPH_MAP iteration only run when
     non-ASCII chars are present (the actual attack case).
   - Verified: benchmark.test.mjs passes within timeout.

2. Attack-scenario UNI-003 expectation
   - Pre-E16: "Homoglyph Cyrillic-Latin mixing" payload triggered only
     a MEDIUM "obfuscation present" advisory (exit 0, stdout match
     "MEDIUM").
   - Post-E16: the same payload is folded to Latin BEFORE pattern
     matching, so it now matches CRITICAL "ignore previous instructions"
     and blocks (exit 2).
   - This is the intended v7.2.0 behavior — not a regression. Updated
     expectation: exit_code 2, stdout_match "block". Renamed scenario
     to "now blocked via E16 fold, v7.2.0".

Suite: the pre-compact-scan flake remains (perf-budget failure under load;
passes when run in isolation). All other tests green.
2026-04-29 14:44:41 +02:00

512 lines
16 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.
/**
* Shannon entropy of a string (bits per character).
* @param {string} s
* @returns {number}
*/
export function shannonEntropy(s) {
if (s.length === 0) return 0;
const freq = new Map();
for (const ch of s) {
freq.set(ch, (freq.get(ch) || 0) + 1);
}
let H = 0;
const len = s.length;
for (const count of freq.values()) {
const p = count / len;
H -= p * Math.log2(p);
}
return H;
}
/**
* Levenshtein edit distance between two strings.
* @param {string} a
* @param {string} b
* @returns {number}
*/
export function levenshtein(a, b) {
if (a === b) return 0;
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
const m = a.length;
const n = b.length;
// Single-row optimization
let prev = new Array(n + 1);
let curr = new Array(n + 1);
for (let j = 0; j <= n; j++) prev[j] = j;
for (let i = 1; i <= m; i++) {
curr[0] = i;
for (let j = 1; j <= n; j++) {
const cost = a[i - 1] === b[j - 1] ? 0 : 1;
curr[j] = Math.min(
prev[j] + 1, // deletion
curr[j - 1] + 1, // insertion
prev[j - 1] + cost // substitution
);
}
[prev, curr] = [curr, prev];
}
return prev[n];
}
/**
* Split a package name into lowercase tokens on `-` and `_` boundaries.
* Used by the B7 typosquat token-overlap heuristic. Empty tokens are
* dropped. Single-character tokens are kept (some package names like
* `a-b` are real).
*
* @param {string} name
* @returns {string[]}
*/
export function tokenize(name) {
if (!name) return [];
return name
.toLowerCase()
.split(/[-_]+/)
.filter(t => t.length > 0);
}
/**
* Token-overlap ratio between two package names. Returns the size of the
* intersection divided by the size of the smaller token set. Returns 0 if
* either input is empty.
*
* Example: `tokenOverlap('lodash-utils', 'lodash')` → 1.0
* `tokenOverlap('react-router-dom', 'react')` → 1.0
* `tokenOverlap('react-helper', 'react-router')` → 0.5
* `tokenOverlap('foo', 'bar')` → 0.0
*
* Used by B7 (v7.2.0) as a complementary signal alongside Levenshtein —
* Levenshtein <=2 catches small typos; token-overlap catches
* popular-name-with-suffix typosquats.
*
* @param {string} a
* @param {string} b
* @returns {number} 0..1
*/
export function tokenOverlap(a, b) {
const ta = new Set(tokenize(a));
const tb = new Set(tokenize(b));
if (ta.size === 0 || tb.size === 0) return 0;
let intersection = 0;
for (const t of ta) if (tb.has(t)) intersection++;
return intersection / Math.min(ta.size, tb.size);
}
/**
* Suspicious suffix tokens commonly used by typosquats to dress up a
* popular package name. Module-level for B7 reuse.
*
* Excluded by design (would conflict with the v7.0.0 typosquat allowlist
* or trigger false positives on legitimate packages):
* - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes used by many
* legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
* v7.0.0 allowlist contains `tsx` directly; including the same token
* in the suspicious set would create an internal contradiction.
* - `pro` — too common as a legitimate edition marker (`vue-pro`,
* `tailwindcss-pro`).
*
* Kept tokens are the unambiguous typosquat suffixes: utility/helper
* dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
*/
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
'utils', 'util', 'helper', 'helpers', 'core', 'plus', 'extra', 'extras',
'bin', 'cli', 'tool', 'tools',
'wrapper', 'wrappers', 'lib', 'libs', 'kit', 'sdk', 'shim',
]);
/**
* Check if a string looks like base64-encoded data.
* @param {string} s
* @returns {boolean}
*/
export function isBase64Like(s) {
if (s.length < 20) return false;
// Must be mostly base64 chars and optionally end with =
return /^[A-Za-z0-9+/]{20,}={0,3}$/.test(s);
}
/**
* Check if a string looks like a hex-encoded blob.
* @param {string} s
* @returns {boolean}
*/
export function isHexBlob(s) {
if (s.length < 32) return false;
return /^(0x)?[0-9a-fA-F]{32,}$/.test(s);
}
/**
* Redact a string for safe display — show first 8 and last 4 chars.
* @param {string} s
* @param {number} [showStart=8]
* @param {number} [showEnd=4]
* @returns {string}
*/
export function redact(s, showStart = 8, showEnd = 4) {
if (s.length <= showStart + showEnd + 3) return s;
return `${s.slice(0, showStart)}...${s.slice(-showEnd)}`;
}
/**
* Extract string literals from a line of code.
* Handles single-quoted, double-quoted, and backtick strings.
* @param {string} line
* @returns {string[]}
*/
export function extractStringLiterals(line) {
const results = [];
const regex = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
let match;
while ((match = regex.exec(line)) !== null) {
results.push(match[1] ?? match[2] ?? match[3]);
}
return results;
}
// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------
/**
* Decode JavaScript/Unicode escape sequences: \uXXXX and \u{XXXXX}.
* @param {string} s
* @returns {string}
*/
export function decodeUnicodeEscapes(s) {
return s
.replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, (_, hex) => {
const cp = parseInt(hex, 16);
return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
})
.replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) =>
String.fromCodePoint(parseInt(hex, 16))
);
}
/**
* Decode hex escape sequences: \xXX.
* @param {string} s
* @returns {string}
*/
export function decodeHexEscapes(s) {
return s.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) =>
String.fromCharCode(parseInt(hex, 16))
);
}
/**
* Decode URL percent-encoding: %XX.
* Uses decodeURIComponent with fallback for malformed sequences.
* @param {string} s
* @returns {string}
*/
export function decodeUrlEncoding(s) {
// Fast path: no percent signs means nothing to decode
if (!s.includes('%')) return s;
try {
return decodeURIComponent(s);
} catch {
// Malformed sequences — decode individual %XX pairs
return s.replace(/%([0-9a-fA-F]{2})/g, (_, hex) =>
String.fromCharCode(parseInt(hex, 16))
);
}
}
/**
* Attempt to decode a base64 string to UTF-8 text.
* Returns null if the input is not base64-like or decoded result is not readable text.
* @param {string} s
* @returns {string|null}
*/
export function tryDecodeBase64(s) {
if (!isBase64Like(s)) return null;
try {
const decoded = Buffer.from(s, 'base64').toString('utf-8');
// Check if result is mostly printable text (>= 80% printable ASCII)
const printable = decoded.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
if (decoded.length === 0 || printable / decoded.length < 0.8) return null;
return decoded;
} catch {
return null;
}
}
/**
* Decode HTML entities: named (&lt; &gt; &amp; &quot; &apos;),
* decimal (&#105;), and hex (&#x69;).
* @param {string} s
* @returns {string}
*/
export function decodeHtmlEntities(s) {
if (!s.includes('&')) return s;
const NAMED = {
'&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
'&nbsp;': ' ', '&tab;': '\t', '&newline;': '\n',
'&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
'&lcub;': '{', '&rcub;': '}', '&sol;': '/', '&bsol;': '\\',
'&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
'&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
'&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
'&vert;': '|', '&tilde;': '~', '&grave;': '`', '&Hat;': '^',
'&lowbar;': '_', '&at;': '@', '&dollar;': '$',
};
return s
.replace(/&#x([0-9a-fA-F]{1,6});/g, (_, hex) => {
const cp = parseInt(hex, 16);
return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
})
.replace(/&#(\d{1,7});/g, (_, dec) => {
const cp = parseInt(dec, 10);
return cp <= 0x10FFFF ? String.fromCodePoint(cp) : _;
})
.replace(/&[a-zA-Z]{2,8};/g, (entity) => NAMED[entity] ?? entity);
}
/**
* Collapse letter-spaced text: "i g n o r e" → "ignore".
* Only collapses runs of single letters separated by spaces/tabs.
* Minimum 4 letters to avoid false positives on normal text.
* @param {string} s
* @returns {string}
*/
export function collapseLetterSpacing(s) {
// Match 4+ single-letter tokens separated by 1+ spaces/tabs
return s.replace(/\b([a-zA-Z]) (?:[a-zA-Z] ){2,}[a-zA-Z]\b/g, (match) =>
match.replace(/ /g, '')
);
}
// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps category 1
// ---------------------------------------------------------------------------
/**
* Decode Unicode Tags steganography: U+E0001-E007F → ASCII.
* Unicode Tags (U+E0000 block) can encode invisible ASCII text inside
* what appears to be empty or normal-looking strings.
* E.g., U+E0069 U+E0067 U+E006E → "ign"
*
* **Note (E1, v7.2.0):** Tag-block characters decode to ASCII via the
* `cp - 0xE0000` mapping. Private Use Areas (PUA-A: U+F0000-FFFFD;
* PUA-B: U+100000-10FFFD) are also detected as hidden Unicode by
* `containsUnicodeTags`, but they have NO standard ASCII mapping —
* they pass through this function unchanged. Detection of PUA presence
* is sufficient (HIGH advisory in scanForInjection), no decode needed.
*
* @param {string} s
* @returns {string}
*/
export function decodeUnicodeTags(s) {
let result = '';
let decoded = '';
let inTagSequence = false;
for (const ch of s) {
const cp = ch.codePointAt(0);
if (cp >= 0xE0001 && cp <= 0xE007F) {
// Tag character — map to ASCII (subtract 0xE0000)
decoded += String.fromCharCode(cp - 0xE0000);
inTagSequence = true;
} else {
if (inTagSequence && decoded.length > 0) {
result += decoded;
decoded = '';
inTagSequence = false;
}
result += ch;
}
}
// Flush remaining tag sequence
if (decoded.length > 0) {
result += decoded;
}
return result;
}
/**
* Check if a string contains hidden-Unicode characters that are commonly
* used for steganography in prompts and tool output.
*
* Covered ranges:
* - U+E0001-E007F Unicode Tag block (DeepMind traps kat. 1)
* - U+F0000-FFFFD Supplementary Private Use Area-A (E1, v7.2.0)
* - U+100000-10FFFD Supplementary Private Use Area-B (E1, v7.2.0)
*
* Presence of any of these characters is suspicious regardless of
* decoded content — they are invisible in most terminals and survive
* normalization. The function name `containsUnicodeTags` is preserved
* for back-compat (existing call sites in injection-patterns.mjs and
* elsewhere); semantically it is now "containsHiddenUnicode".
*
* Tag-block characters decode to ASCII via `decodeUnicodeTags`. PUA
* characters do NOT — they have no standard mapping and remain
* detection-only.
*
* @param {string} s
* @returns {boolean}
*/
export function containsUnicodeTags(s) {
for (const ch of s) {
const cp = ch.codePointAt(0);
if (cp >= 0xE0001 && cp <= 0xE007F) return true; // Tag block
if (cp >= 0xF0000 && cp <= 0xFFFFD) return true; // PUA-A (E1)
if (cp >= 0x100000 && cp <= 0x10FFFD) return true; // PUA-B (E1)
}
return false;
}
// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------
/**
* Strip BIDI override characters that can reorder text visually.
* U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
* U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
* These can hide injection by making text render differently than it parses.
* @param {string} s
* @returns {string}
*/
export function stripBidiOverrides(s) {
return s.replace(/[\u202A-\u202E\u2066-\u2069]/g, '');
}
// ---------------------------------------------------------------------------
// Homoglyph folding (E16, v7.2.0)
// ---------------------------------------------------------------------------
/**
* Confusable mapping — characters that LOOK like Latin letters but are
* different codepoints (most commonly Cyrillic and Greek). Surgical map
* focused on letters that appear in injection vocabulary
* (`ignore`, `system`, `you are`, `assistant`, `tool`, `response`).
*
* Excluded by design:
* - Latin Extended characters (æ, ø, å, é, è, ñ, ü, ö, ä, ç, ß, þ, ð, etc.)
* — these are legitimate letters in Norwegian, German, Danish, Spanish,
* French, Icelandic, etc., and would generate false positives in
* non-English source code or documentation.
* - Greek letters that don't visually overlap with Latin (`β`, `γ`, `δ`, ...)
* - Cyrillic letters that don't visually overlap (`б`, `г`, `д`, `ж`, ...)
* - Mathematical alphanumeric symbols (the U+1D400 block) — covered by
* NFKC normalization in `foldHomoglyphs` itself.
*
* The map is deliberately small (~25 entries). Adding more risks
* false-positive escalation on benign multilingual content.
*/
const HOMOGLYPH_MAP = Object.freeze({
// Cyrillic → Latin (lowercase)
'а': 'a', // U+0430
'е': 'e', // U+0435
'о': 'o', // U+043E
'с': 'c', // U+0441
'р': 'p', // U+0440
'х': 'x', // U+0445
'у': 'y', // U+0443
'і': 'i', // U+0456 (Ukrainian)
'ј': 'j', // U+0458
'ѕ': 's', // U+0455
'ӏ': 'l', // U+04CF (Cyrillic Palochka)
// Cyrillic → Latin (uppercase)
'А': 'A', // U+0410
'Е': 'E', // U+0415
'О': 'O', // U+041E
'С': 'C', // U+0421
'Р': 'P', // U+0420
'Х': 'X', // U+0425
'У': 'Y', // U+0423
// Greek → Latin (only the unambiguous Latin-look-alikes)
'α': 'a', // U+03B1
'ο': 'o', // U+03BF
'ρ': 'p', // U+03C1
'ι': 'i', // U+03B9
'ν': 'v', // U+03BD
'τ': 't', // U+03C4
// Greek uppercase
'Α': 'A', // U+0391
'Ο': 'O', // U+039F
'Ρ': 'P', // U+03A1
'Τ': 'T', // U+03A4
});
/**
* Fold visually-confusable characters to their Latin look-alikes. Used by
* E16 (v7.2.0) to neutralize homoglyph-substitution injection attacks
* before pattern matching.
*
* Pipeline:
* 1. NFKC normalize — collapses Mathematical Alphanumeric (U+1D400),
* width variants, ligatures, and other compatibility decompositions.
* 2. Apply HOMOGLYPH_MAP — Cyrillic/Greek look-alikes → Latin.
*
* Idempotent: `foldHomoglyphs(foldHomoglyphs(s)) === foldHomoglyphs(s)`.
*
* Norwegian/Polish/German/etc. text is NOT affected — characters like
* æ, ø, å, é, ñ, ü, ö, ä are not in HOMOGLYPH_MAP.
*
* Performance: pure-ASCII inputs short-circuit before NFKC, since NFKC is
* a no-op on ASCII and HOMOGLYPH_MAP only contains non-ASCII keys.
* scanForInjection calls this on every scan; the fast-path keeps the
* common-case overhead near zero.
*
* @param {string} s
* @returns {string}
*/
export function foldHomoglyphs(s) {
if (!s) return s;
// Fast path: pure ASCII has nothing to fold and NFKC is identity.
// charCodeAt is cheaper than iterating codepoints.
let asciiOnly = true;
for (let i = 0; i < s.length; i++) {
if (s.charCodeAt(i) > 127) { asciiOnly = false; break; }
}
if (asciiOnly) return s;
const normalized = s.normalize('NFKC');
let out = '';
for (const ch of normalized) {
out += HOMOGLYPH_MAP[ch] || ch;
}
return out;
}
/**
* Normalize a string by decoding all known obfuscation layers.
* Runs up to 3 iterations to catch multi-layered encoding (e.g., base64 of URL-encoded).
* Order per iteration: Unicode Tags -> BIDI strip -> HTML entities -> unicode escapes ->
* hex escapes -> URL encoding -> base64.
* After decoding: collapse letter-spaced text.
* @param {string} s
* @returns {string}
*/
export function normalizeForScan(s) {
let result = s;
const MAX_ITERATIONS = 3;
// Pre-decode: Unicode Tags and BIDI overrides (before the main loop)
result = decodeUnicodeTags(result);
result = stripBidiOverrides(result);
for (let i = 0; i < MAX_ITERATIONS; i++) {
const prev = result;
result = decodeHtmlEntities(result);
result = decodeUnicodeEscapes(result);
result = decodeHexEscapes(result);
result = decodeUrlEncoding(result);
const b64decoded = tryDecodeBase64(result);
if (b64decoded) result = b64decoded;
// Stable — no further decoding possible
if (result === prev) break;
}
// Post-decode: collapse letter-spaced evasion
result = collapseLetterSpacing(result);
return result;
}