// ktg-plugin-marketplace/plugins/llm-security/scanners/lib/string-utils.mjs

// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
// Zero dependencies.

/**
 * Shannon entropy of a string, in bits per symbol.
 *
 * Symbols are Unicode code points (the string is walked with `for...of`), so
 * the denominator used to turn counts into probabilities is the number of
 * code points rather than `s.length`, which counts UTF-16 code units. With
 * `s.length` the probabilities would not sum to 1 for strings containing
 * astral characters (e.g. emoji) and the result would be wrong.
 *
 * @param {string} s
 * @returns {number} Entropy in bits; 0 for an empty string or a string made
 *   of a single repeated symbol.
 */
export function shannonEntropy(s) {
  if (s.length === 0) return 0;

  // Count occurrences per code point and the total number of code points.
  const freq = new Map();
  let total = 0;
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) ?? 0) + 1);
    total++;
  }

  let entropy = 0;
  for (const count of freq.values()) {
    const p = count / total;
    entropy -= p * Math.log2(p);
  }
  return entropy;
}

/**
 * Levenshtein edit distance: the minimum number of single-element
 * insertions, deletions and substitutions needed to turn `a` into `b`.
 * Elements are compared by index (`a[i] === b[j]`).
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
export function levenshtein(a, b) {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const rows = a.length;
  const cols = b.length;

  // One DP row, updated in place. `diagonal` carries the value the cell to
  // the upper-left held before it was overwritten.
  const row = Array.from({ length: cols + 1 }, (_, j) => j);

  for (let i = 1; i <= rows; i++) {
    let diagonal = row[0];
    row[0] = i;
    for (let j = 1; j <= cols; j++) {
      const above = row[j];
      const substitution = diagonal + (a[i - 1] === b[j - 1] ? 0 : 1);
      const deletion = above + 1;
      const insertion = row[j - 1] + 1;
      row[j] = Math.min(deletion, insertion, substitution);
      diagonal = above;
    }
  }
  return row[cols];
}

/**
 * Break a package name into lowercase tokens, splitting on runs of `-`
 * and `_`. Empty pieces (from leading/trailing separators) are discarded;
 * one-character tokens are retained because names such as `a-b` exist.
 * Used by the B7 typosquat token-overlap heuristic.
 *
 * @param {string} name
 * @returns {string[]} Tokens in order of appearance; `[]` for a falsy name.
 */
export function tokenize(name) {
  if (!name) return [];

  const tokens = [];
  for (const piece of name.toLowerCase().split(/[-_]+/)) {
    if (piece.length > 0) tokens.push(piece);
  }
  return tokens;
}

/**
 * Token-overlap ratio between two package names: the number of distinct
 * tokens the names share, divided by the size of the smaller distinct-token
 * set. Yields 0 when either name produces no tokens.
 *
 * Examples:
 *   `tokenOverlap('lodash-utils', 'lodash')`        → 1.0
 *   `tokenOverlap('react-router-dom', 'react')`     → 1.0
 *   `tokenOverlap('react-helper', 'react-router')`  → 0.5
 *   `tokenOverlap('foo', 'bar')`                    → 0.0
 *
 * Used by B7 (v7.2.0) alongside Levenshtein: Levenshtein <=2 catches small
 * typos, while token overlap catches popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}  0..1
 */
export function tokenOverlap(a, b) {
  const left = new Set(tokenize(a));
  const right = new Set(tokenize(b));

  const smaller = Math.min(left.size, right.size);
  if (smaller === 0) return 0;

  const shared = [...left].filter((token) => right.has(token)).length;
  return shared / smaller;
}

/**
 * Suffix tokens that typosquats commonly append to a popular package name
 * to make it look legitimate. Kept at module level so B7 can reuse it.
 * The array is frozen.
 *
 * Deliberately NOT included (they would clash with the v7.0.0 typosquat
 * allowlist or flag legitimate packages):
 *   - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes carried by many
 *     legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The v7.0.0
 *     allowlist contains `tsx` itself, so listing it here would be an
 *     internal contradiction.
 *   - `pro` — too common as a legitimate edition marker (`vue-pro`,
 *     `tailwindcss-pro`).
 *
 * What remains are the unambiguous typosquat suffixes: utility/helper
 * dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  // Utility / helper dressing
  'utils',
  'util',
  'helper',
  'helpers',
  'core',
  'plus',
  'extra',
  'extras',
  // Executable / tooling qualifiers
  'bin',
  'cli',
  'tool',
  'tools',
  // Wrapper / library / kit packaging
  'wrapper',
  'wrappers',
  'lib',
  'libs',
  'kit',
  'sdk',
  'shim',
]);

/**
 * Heuristic: does the string look like base64-encoded data?
 * Requires at least 20 characters from the standard base64 alphabet
 * (`A-Z a-z 0-9 + /`), optionally followed by up to three `=` characters,
 * and nothing else.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  // Cheap length gate before running the pattern.
  const tooShort = s.length < 20;
  return tooShort ? false : /^[A-Za-z0-9+/]{20,}={0,3}$/.test(s);
}

/**
 * Heuristic: does the string look like a hex-encoded blob?
 * Accepts an optional `0x` prefix followed by at least 32 hexadecimal
 * digits (either case) and nothing else.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  // Cheap length gate before running the pattern.
  const tooShort = s.length < 32;
  return tooShort ? false : /^(0x)?[0-9a-fA-F]{32,}$/.test(s);
}

/**
 * Redact a string for display: keep the first `showStart` and last
 * `showEnd` characters and replace the middle with `...`.
 *
 * Strings no longer than `showStart + showEnd + 3` are returned unchanged,
 * because the redacted form would not be shorter than the input.
 *
 * Negative counts are treated as 0. The tail is taken with an explicit
 * start index rather than `slice(-showEnd)`, because `slice(-0)` is
 * `slice(0)` and would append the entire string when `showEnd` is 0.
 *
 * @param {string} s
 * @param {number} [showStart=8] Leading characters to keep.
 * @param {number} [showEnd=4] Trailing characters to keep.
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  const head = Math.max(0, showStart);
  const tail = Math.max(0, showEnd);
  if (s.length <= head + tail + 3) return s;
  return `${s.slice(0, head)}...${s.slice(s.length - tail)}`;
}

/**
 * Pull the contents of every string literal out of one line of code.
 * Recognises double-quoted, single-quoted and backtick-delimited literals,
 * allowing backslash-escaped characters inside them. The returned strings
 * are the raw text between the delimiters (escapes are not decoded).
 *
 * @param {string} line
 * @returns {string[]} Literal contents in order of appearance.
 */
export function extractStringLiterals(line) {
  // A fresh /g regex per call, so `lastIndex` state never leaks between calls.
  const literalPattern = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;
  const found = [];
  for (let hit = literalPattern.exec(line); hit !== null; hit = literalPattern.exec(line)) {
    // Exactly one of the three capture groups participates in each match.
    const [, doubleQuoted, singleQuoted, backticked] = hit;
    found.push(doubleQuoted ?? singleQuoted ?? backticked);
  }
  return found;
}

// ---------------------------------------------------------------------------
// Encoding/obfuscation decoders
// ---------------------------------------------------------------------------

/**
 * Decode JavaScript/Unicode escape sequences: \uXXXX and \u{X...}.
 *
 * The braced form accepts any number of hex digits, so escapes padded with
 * leading zeros (e.g. `\u{00000069}`) are decoded as well. A braced escape
 * whose value exceeds U+10FFFF is left untouched.
 *
 * The braced form is decoded first and the four-digit form second, so
 * consecutive `\uD83D\uDE00`-style surrogate escapes end up adjacent in the
 * output and form the intended character.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  return s
    .replace(/\\u\{([0-9a-fA-F]+)\}/g, (escape, hex) => {
      const cp = parseInt(hex, 16);
      // Out-of-range values (including Infinity from absurdly long digit
      // runs) fail this check and the original text is kept.
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : escape;
    })
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) =>
      String.fromCodePoint(parseInt(hex, 16))
    );
}

/**
 * Replace every two-digit hex escape (`\xXX`) with the character whose
 * code unit is that value. Anything that is not a backslash, `x` and
 * exactly two hex digits is left as written.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  const toCharacter = (_escape, digits) =>
    String.fromCharCode(Number.parseInt(digits, 16));
  return s.replace(/\\x([0-9a-fA-F]{2})/g, toCharacter);
}

/**
 * Decode URL percent-encoding (`%XX`).
 *
 * Tries `decodeURIComponent` on the whole string first. If that throws
 * (malformed or non-UTF-8 sequences), falls back to replacing each
 * well-formed `%XX` pair individually with the character for that value
 * and leaving everything else as written.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  if (s.includes('%')) {
    let decoded;
    try {
      decoded = decodeURIComponent(s);
    } catch {
      // Best-effort path: decode whatever valid %XX pairs exist.
      decoded = s.replace(/%([0-9a-fA-F]{2})/g, (_pair, digits) =>
        String.fromCharCode(Number.parseInt(digits, 16))
      );
    }
    return decoded;
  }
  // No percent sign: nothing to decode.
  return s;
}

/**
 * Try to decode a base64 string into UTF-8 text.
 *
 * Returns `null` when the input does not pass `isBase64Like`, when the
 * decoded text is empty, when fewer than 80% of the decoded characters are
 * printable ASCII (space through `~`, plus newline, carriage return and
 * tab), or when decoding throws.
 *
 * @param {string} s
 * @returns {string|null}
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;
  try {
    const text = Buffer.from(s, 'base64').toString('utf-8');
    if (text.length === 0) return null;

    // Strip everything non-printable; what is left is the printable count.
    const printableCount = text.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
    return printableCount / text.length < 0.8 ? null : text;
  } catch {
    return null;
  }
}

/**
 * Named character references recognised by `decodeHtmlEntities`, keyed by
 * the full `&name;` text. Built once at module load instead of per call.
 *
 * HTML entity names are case-sensitive: the standard names for tab, line
 * feed and the at sign are `&Tab;`, `&NewLine;` and `&commat;`. The
 * lowercase `&tab;` / `&newline;` and `&at;` spellings are kept so existing
 * behaviour is unchanged.
 *
 * Some values are deliberately normalised to their ASCII look-alike for
 * scanning purposes (e.g. `&nbsp;` → space, `&minus;` → `-`, `&tilde;` → `~`).
 */
const HTML_NAMED_ENTITIES = new Map([
  ['&lt;', '<'], ['&gt;', '>'], ['&amp;', '&'], ['&quot;', '"'], ['&apos;', "'"],
  ['&nbsp;', ' '],
  ['&Tab;', '\t'], ['&tab;', '\t'],
  ['&NewLine;', '\n'], ['&newline;', '\n'],
  ['&lpar;', '('], ['&rpar;', ')'], ['&lsqb;', '['], ['&rsqb;', ']'],
  ['&lcub;', '{'], ['&rcub;', '}'], ['&sol;', '/'], ['&bsol;', '\\'],
  ['&colon;', ':'], ['&semi;', ';'], ['&comma;', ','], ['&period;', '.'],
  ['&excl;', '!'], ['&quest;', '?'], ['&num;', '#'], ['&percnt;', '%'],
  ['&equals;', '='], ['&plus;', '+'], ['&minus;', '-'], ['&ast;', '*'],
  ['&vert;', '|'], ['&tilde;', '~'], ['&grave;', '`'], ['&Hat;', '^'],
  ['&lowbar;', '_'],
  ['&commat;', '@'], ['&at;', '@'],
  ['&dollar;', '$'],
]);

/**
 * Decode HTML entities: a fixed set of named references (&lt; &gt; &amp;
 * &quot; &apos; and common punctuation names), decimal references
 * (&#105;) and hex references (&#x69; or &#X69;).
 *
 * Numeric references may carry any number of leading zeros. A numeric
 * reference above U+10FFFF and an unrecognised named reference are left
 * untouched. Each pass runs once over the string, so `&amp;lt;` becomes
 * `&lt;`, not `<`.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  // Fast path: without an ampersand there is nothing to decode.
  if (!s.includes('&')) return s;

  // Out-of-range values (including Infinity from absurdly long digit runs)
  // fail the range check and the original text is kept.
  const fromCodePoint = (entity, cp) =>
    cp <= 0x10FFFF ? String.fromCodePoint(cp) : entity;

  return s
    .replace(/&#[xX]([0-9a-fA-F]+);/g, (entity, hex) =>
      fromCodePoint(entity, parseInt(hex, 16))
    )
    .replace(/&#(\d+);/g, (entity, dec) =>
      fromCodePoint(entity, parseInt(dec, 10))
    )
    .replace(/&[a-zA-Z]{2,8};/g, (entity) =>
      HTML_NAMED_ENTITIES.get(entity) ?? entity
    );
}

/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 *
 * A run is four or more single ASCII letters, each separated from the next
 * by one or more spaces and/or tabs, and delimited by word boundaries. The
 * four-letter minimum avoids false positives on ordinary text such as
 * "a b c". All spaces and tabs inside a matched run are removed; text
 * outside a run is left untouched.
 *
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // letter, then 3+ repetitions of (whitespace run + letter) = 4+ letters.
  return s.replace(/\b[a-zA-Z](?:[ \t]+[a-zA-Z]){3,}\b/g, (run) =>
    run.replace(/[ \t]+/g, '')
  );
}

// ---------------------------------------------------------------------------
// Unicode Tags steganography (U+E0000 block) — DeepMind traps, category 1
// ---------------------------------------------------------------------------

/**
 * Reveal text hidden with Unicode Tag characters.
 *
 * Every code point in U+E0001–U+E007F is replaced, in place, by the
 * character obtained by subtracting 0xE0000 from it (so U+E0069 U+E0067
 * U+E006E becomes "ign"). All other characters pass through unchanged and
 * the relative order of everything is preserved.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  const TAG_BASE = 0xE0000;
  const FIRST_TAG = 0xE0001;
  const LAST_TAG = 0xE007F;

  let output = '';
  // for...of walks code points, so astral tag characters arrive whole.
  for (const symbol of s) {
    const codePoint = symbol.codePointAt(0);
    const isTag = codePoint >= FIRST_TAG && codePoint <= LAST_TAG;
    output += isTag ? String.fromCharCode(codePoint - TAG_BASE) : symbol;
  }
  return output;
}

/**
 * Report whether a string contains any Unicode Tag character
 * (U+E0001–U+E007F). Their mere presence is treated as suspicious,
 * whatever they decode to.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  // The `u` flag makes the character class operate on code points.
  return /[\u{E0001}-\u{E007F}]/u.test(s);
}

// ---------------------------------------------------------------------------
// BIDI override stripping
// ---------------------------------------------------------------------------

/**
 * Remove bidirectional control characters that can make text render in a
 * different order than it is stored, which can be used to hide an injection.
 *
 * Removed code points:
 *   U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
 *   U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
 *
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  const bidiControls = /[\u202A-\u202E\u2066-\u2069]/g;
  return s.replace(bidiControls, '');
}

/**
 * Normalize a string by decoding all known obfuscation layers.
 *
 * Steps:
 *   1. Decode Unicode Tags and strip BIDI overrides from the raw input.
 *   2. Up to 3 iterations (to catch multi-layered encoding, e.g. base64 of
 *      URL-encoded text), each running in order: HTML entities -> unicode
 *      escapes -> hex escapes -> URL encoding -> base64 -> Unicode Tags ->
 *      BIDI strip. The loop ends early once an iteration changes nothing.
 *   3. Collapse letter-spaced text.
 *
 * The Tags/BIDI step is repeated inside the loop because the decoders can
 * themselves produce those characters (e.g. `&#x202E;` or `\u202E` written
 * as text). Running it only on the raw input would leave such characters
 * in the normalized output.
 *
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  const MAX_ITERATIONS = 3;

  // Pre-decode: Unicode Tags and BIDI overrides present in the raw input.
  let result = stripBidiOverrides(decodeUnicodeTags(s));

  for (let i = 0; i < MAX_ITERATIONS; i++) {
    const before = result;
    result = decodeHtmlEntities(result);
    result = decodeUnicodeEscapes(result);
    result = decodeHexEscapes(result);
    result = decodeUrlEncoding(result);
    const b64decoded = tryDecodeBase64(result);
    if (b64decoded) result = b64decoded;
    // Remove Tag/BIDI characters that this iteration's decoding revealed.
    result = stripBidiOverrides(decodeUnicodeTags(result));
    // Stable — no further decoding possible
    if (result === before) break;
  }

  // Post-decode: collapse letter-spaced evasion
  return collapseLetterSpacing(result);
}