// ktg-plugin-marketplace/plugins/llm-security/scanners/unicode-scanner.mjs

// unicode-scanner.mjs — Detects hidden Unicode characters used for prompt injection
// and code obfuscation: zero-width chars, Unicode tag codepoints (steganography),
// BIDI override characters (Trojan Source), and homoglyph mixing.
//
// Zero external dependencies — Node.js builtins only.
// OWASP coverage: LLM01 (Prompt Injection), LLM03 (Supply Chain)

import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';

// ---------------------------------------------------------------------------
// Character sets
// ---------------------------------------------------------------------------

/**
 * Code points that occupy no visible width and can therefore conceal content:
 * the zero-width space and joiners (U+200B–U+200D), U+FEFF, and the soft hyphen.
 */
const ZERO_WIDTH_CHARS = new Set([
  0x200b, // U+200B ZERO WIDTH SPACE
  0x200c, // U+200C ZERO WIDTH NON-JOINER
  0x200d, // U+200D ZERO WIDTH JOINER
  0xfeff, // U+FEFF ZERO WIDTH NO-BREAK SPACE (also serves as the byte-order mark)
  0x00ad, // U+00AD SOFT HYPHEN
]);

/**
 * Inclusive bounds of the tag characters U+E0001–U+E007F. Each one mirrors an
 * ASCII value shifted up by 0xE0000, so subtracting that base recovers the
 * hidden ASCII code.
 */
const UNICODE_TAG_START = 0xe0000 + 0x01;
const UNICODE_TAG_END = 0xe0000 + 0x7f;

/**
 * Bidirectional formatting controls abused by "Trojan Source" style attacks
 * (CVE-2021-42574 class): the embedding/override controls U+202A–U+202E and
 * the isolate controls U+2066–U+2069.
 */
const BIDI_CHARS = new Set([
  // Embeddings and overrides
  0x202a, // U+202A LEFT-TO-RIGHT EMBEDDING
  0x202b, // U+202B RIGHT-TO-LEFT EMBEDDING
  0x202c, // U+202C POP DIRECTIONAL FORMATTING
  0x202d, // U+202D LEFT-TO-RIGHT OVERRIDE
  0x202e, // U+202E RIGHT-TO-LEFT OVERRIDE
  // Isolates
  0x2066, // U+2066 LEFT-TO-RIGHT ISOLATE
  0x2067, // U+2067 RIGHT-TO-LEFT ISOLATE
  0x2068, // U+2068 FIRST STRONG ISOLATE
  0x2069, // U+2069 POP DIRECTIONAL ISOLATE
]);

/**
 * Cyrillic letters whose glyphs are near-identical to a Latin letter.
 * NOTE(review): no detector in the visible part of this module reads this set;
 * the homoglyph check relies on the whole-block test in isCyrillic — confirm
 * whether this list is still needed.
 */
const CYRILLIC_CONFUSABLES = new Set([
  // Lowercase
  0x0430, // U+0430 CYRILLIC SMALL LETTER A   ~ Latin "a"
  0x0435, // U+0435 CYRILLIC SMALL LETTER IE  ~ Latin "e"
  0x043e, // U+043E CYRILLIC SMALL LETTER O   ~ Latin "o"
  0x0441, // U+0441 CYRILLIC SMALL LETTER ES  ~ Latin "c"
  0x0440, // U+0440 CYRILLIC SMALL LETTER ER  ~ Latin "p"
  0x0443, // U+0443 CYRILLIC SMALL LETTER U   ~ Latin "y"
  0x0445, // U+0445 CYRILLIC SMALL LETTER HA  ~ Latin "x"
  // Uppercase
  0x0410, // U+0410 CYRILLIC CAPITAL LETTER A
  0x0415, // U+0415 CYRILLIC CAPITAL LETTER IE
  0x041e, // U+041E CYRILLIC CAPITAL LETTER O
  0x0421, // U+0421 CYRILLIC CAPITAL LETTER ES
  0x0420, // U+0420 CYRILLIC CAPITAL LETTER ER
  0x0425, // U+0425 CYRILLIC CAPITAL LETTER HA
]);

// ---------------------------------------------------------------------------
// Helper: format hex codepoint list for evidence strings
// ---------------------------------------------------------------------------

/**
 * Render a list of character hits as a human-readable evidence string.
 *
 * Each hit becomes `U+XXXX at col N`, where the code point is upper-case hex
 * padded to at least four digits and N is the zero-based `pos` plus one.
 * Entries are joined with ", "; an empty list yields an empty string.
 *
 * @param {Array<{cp: number, pos: number}>} hits
 * @returns {string}  e.g. "U+200B at col 5, U+200D at col 12"
 */
function formatEvidence(hits) {
  const entries = [];
  for (const { cp, pos } of hits) {
    const hex = cp.toString(16).toUpperCase().padStart(4, '0');
    entries.push(`U+${hex} at col ${pos + 1}`);
  }
  return entries.join(', ');
}

// ---------------------------------------------------------------------------
// Category 1: Zero-Width Character detection
// ---------------------------------------------------------------------------

/**
 * Scan a single line for zero-width characters.
 *
 * Produces at most one finding per line. Severity is HIGH when the line also
 * carries visible content, and CRITICAL when nothing but zero-width characters
 * and whitespace is present (the line looks blank but is not).
 *
 * A U+FEFF that is the very first character of the file (line 1, offset 0) is a
 * byte-order mark rather than hidden content, so it is not counted as a hit.
 * U+FEFF anywhere else is still reported.
 *
 * @param {string} line        - Raw line content (no newline)
 * @param {number} lineNumber  - 1-indexed
 * @param {string} relPath     - Relative file path for finding metadata
 * @returns {object[]}         - Empty array, or an array holding one finding
 */
function scanLineForZeroWidth(line, lineNumber, relPath) {
  const hits = [];

  let pos = 0;
  for (const char of line) {
    const cp = char.codePointAt(0);
    // A leading U+FEFF on the first line is the file's BOM, not smuggled content.
    const isFileBom = cp === 0xFEFF && lineNumber === 1 && pos === 0;
    if (ZERO_WIDTH_CHARS.has(cp) && !isFileBom) {
      hits.push({ cp, pos });
    }
    pos += char.length; // advance in UTF-16 code units (astral characters count as 2)
  }

  if (hits.length === 0) return [];

  // Determine if the line is visually empty (only zero-width chars present).
  // Strip all zero-width chars and common whitespace; if nothing remains → CRITICAL.
  const stripped = [...line]
    .filter(ch => !ZERO_WIDTH_CHARS.has(ch.codePointAt(0)) && !/\s/.test(ch))
    .join('');
  const isVisuallyEmpty = stripped.length === 0;

  const severity = isVisuallyEmpty ? SEVERITY.CRITICAL : SEVERITY.HIGH;
  const title = isVisuallyEmpty
    ? 'Visually empty line with hidden zero-width characters'
    : 'Zero-width characters detected in line';

  const description = isVisuallyEmpty
    ? `Line ${lineNumber} appears blank but contains ${hits.length} zero-width character(s). ` +
      'This is a strong indicator of hidden prompt injection content.'
    : `Line ${lineNumber} contains ${hits.length} zero-width character(s) that are invisible to readers ` +
      'but processed by LLMs. Can be used to smuggle hidden instructions.';

  return [
    finding({
      scanner: 'UNI',
      severity,
      title,
      description,
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all zero-width characters. Use a hex editor or `cat -A` to reveal them. ' +
        'Consider adding a pre-commit hook that rejects files containing U+200B/200C/200D/FEFF/00AD.',
    }),
  ];
}

// ---------------------------------------------------------------------------
// Category 2: Unicode Tag Codepoints (steganography)
// ---------------------------------------------------------------------------

/**
 * Recover the ASCII text carried by a sequence of Unicode Tag hits.
 *
 * Each hit's code point is reduced by 0xE0000. Results inside the printable
 * ASCII range (0x20–0x7E) become that character; anything else is rendered
 * as "?" so the output keeps one character per hit.
 *
 * @param {Array<{cp: number, pos: number}>} tagHits
 * @returns {string}  Decoded string, e.g. "rm -rf /"
 */
function decodeTagMessage(tagHits) {
  let message = '';
  for (const { cp } of tagHits) {
    const asciiCode = cp - 0xE0000;
    const isPrintable = asciiCode >= 0x20 && asciiCode <= 0x7E;
    message += isPrintable ? String.fromCharCode(asciiCode) : '?';
  }
  return message;
}

/**
 * Look for Unicode Tag characters (UNICODE_TAG_START..UNICODE_TAG_END) in one line.
 *
 * When any are present, a single CRITICAL finding is returned whose evidence
 * lists every hit and the ASCII text decoded from them.
 *
 * @param {string} line        - Line content without its newline
 * @param {number} lineNumber  - 1-indexed line number
 * @param {string} relPath     - Relative file path recorded on the finding
 * @returns {object[]}         - Empty array, or an array holding one finding
 */
function scanLineForUnicodeTags(line, lineNumber, relPath) {
  const tagHits = [];

  // Walk by UTF-16 index; a code point above U+FFFF spans two code units.
  for (let index = 0; index < line.length; ) {
    const codePoint = line.codePointAt(index);
    const isTag = codePoint >= UNICODE_TAG_START && codePoint <= UNICODE_TAG_END;
    if (isTag) {
      tagHits.push({ cp: codePoint, pos: index });
    }
    index += codePoint > 0xFFFF ? 2 : 1;
  }

  if (tagHits.length === 0) {
    return [];
  }

  const hiddenText = decodeTagMessage(tagHits);
  const hitList = formatEvidence(tagHits);

  const description =
    `Line ${lineNumber} contains ${tagHits.length} character(s) from the Unicode Tags block ` +
    `(U+E0001–U+E007F). These encode a hidden ASCII message: "${hiddenText}". ` +
    'This is deliberate steganography and a strong indicator of supply chain attack.';

  const recommendation =
    'Remove all Unicode Tag codepoints immediately. This file should not be trusted. ' +
    'Investigate how these characters were introduced — they cannot appear accidentally.';

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.CRITICAL,
    title: 'Unicode Tag block codepoints detected (steganographic hidden message)',
    description,
    file: relPath,
    line: lineNumber,
    evidence: `${hitList} → decoded: "${hiddenText}"`,
    owasp: 'LLM03',
    recommendation,
  });

  return [report];
}

// ---------------------------------------------------------------------------
// Category 3: BIDI Override Characters (Trojan Source)
// ---------------------------------------------------------------------------

/**
 * Look for bidirectional control characters (members of BIDI_CHARS) in one line.
 *
 * All hits on the line are folded into a single HIGH finding whose evidence
 * lists each code point with its column.
 *
 * @param {string} line        - Line content without its newline
 * @param {number} lineNumber  - 1-indexed line number
 * @param {string} relPath     - Relative file path recorded on the finding
 * @returns {object[]}         - Empty array, or an array holding one finding
 */
function scanLineForBidi(line, lineNumber, relPath) {
  const bidiHits = [];

  // Offsets are tracked in UTF-16 code units; astral characters advance by two.
  let offset = 0;
  while (offset < line.length) {
    const codePoint = line.codePointAt(offset);
    if (BIDI_CHARS.has(codePoint)) {
      bidiHits.push({ cp: codePoint, pos: offset });
    }
    offset += codePoint > 0xFFFF ? 2 : 1;
  }

  if (bidiHits.length === 0) {
    return [];
  }

  const description =
    `Line ${lineNumber} contains ${bidiHits.length} bidirectional override character(s). ` +
    'BIDI controls can make code appear different to humans than to interpreters/LLMs. ' +
    'This is the Trojan Source technique (see CVE-2021-42574 class of vulnerabilities).';

  const recommendation =
    'Remove all BIDI override characters. Legitimate multilingual text rarely needs ' +
    'explicit BIDI overrides in source code. Enable editor/IDE BIDI character warnings.';

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.HIGH,
    title: 'BIDI override character detected (Trojan Source attack vector)',
    description,
    file: relPath,
    line: lineNumber,
    evidence: formatEvidence(bidiHits),
    owasp: 'LLM01',
    recommendation,
  });

  return [report];
}

// ---------------------------------------------------------------------------
// Category 4: Homoglyph Detection (Latin/Cyrillic mixing)
// ---------------------------------------------------------------------------

/**
 * Regex to extract word-like tokens: runs of Unicode letters (\p{L}),
 * Unicode numbers (\p{N}) and underscores.
 * The `g` flag makes `exec()`/`test()` stateful through `lastIndex`, so code
 * sharing this instance must not rely on a leftover `lastIndex` value.
 */
const TOKEN_RE = /[\p{L}\p{N}_]+/gu;

/**
 * Report whether a code point is a basic (ASCII) Latin letter, A–Z or a–z.
 * Accented and extended Latin letters are not included.
 *
 * @param {number} cp - Unicode code point
 * @returns {boolean}
 */
function isLatin(cp) {
  const isUppercase = cp >= 0x41 && cp <= 0x5a; // 'A'..'Z'
  const isLowercase = cp >= 0x61 && cp <= 0x7a; // 'a'..'z'
  return isUppercase || isLowercase;
}

/**
 * Report whether a code point lies in the range U+0400–U+04FF (inclusive).
 *
 * @param {number} cp - Unicode code point
 * @returns {boolean}
 */
function isCyrillic(cp) {
  const rangeStart = 0x0400;
  const rangeEnd = 0x04ff;
  return rangeStart <= cp && cp <= rangeEnd;
}

/**
 * Find tokens on one line that contain both Latin and Cyrillic letters.
 *
 * Tokens are taken from TOKEN_RE matches. Every mixed-script token on the line
 * is folded into a single MEDIUM finding; the evidence quotes each token and
 * lists the code points of its Cyrillic characters in order of appearance.
 *
 * @param {string} line        - Line content without its newline
 * @param {number} lineNumber  - 1-indexed line number
 * @param {string} relPath     - Relative file path recorded on the finding
 * @returns {object[]}         - Empty array, or an array holding one finding
 */
function scanLineForHomoglyphs(line, lineNumber, relPath) {
  const toHexLabel = (cp) => `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`;

  // With a global regex, match() starts from index 0 and returns every token (or null).
  const tokens = line.match(TOKEN_RE) || [];

  const mixedTokens = [];
  for (const token of tokens) {
    const codePoints = Array.from(token, (ch) => ch.codePointAt(0));
    const cyrillicChars = codePoints
      .filter((cp) => isCyrillic(cp))
      .map((cp) => toHexLabel(cp));
    const containsLatin = codePoints.some((cp) => isLatin(cp));

    if (containsLatin && cyrillicChars.length > 0) {
      mixedTokens.push({ token, cyrillicChars });
    }
  }

  if (mixedTokens.length === 0) {
    return [];
  }

  const evidence = mixedTokens
    .map(({ token, cyrillicChars }) => `"${token}" (Cyrillic: ${cyrillicChars.join(', ')})`)
    .join('; ');

  const description =
    `Line ${lineNumber} contains ${mixedTokens.length} token(s) that mix Latin and ` +
    'Cyrillic characters. Cyrillic confusables (а, е, о, с, р, у, х) look identical to ' +
    'Latin letters but have different codepoints — enabling invisible identifier spoofing.';

  const recommendation =
    'Normalize all identifiers to a single script. Use a Unicode confusables checker ' +
    '(e.g., Unicode CLDR confusable-mappings.txt) and enforce a single-script policy ' +
    'via linter rules (ESLint `no-misleading-character-class`, Rust `confusable_idents`).';

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.MEDIUM,
    title: 'Homoglyph mixing detected: Latin and Cyrillic in same identifier',
    description,
    file: relPath,
    line: lineNumber,
    evidence,
    owasp: 'LLM01',
    recommendation,
  });

  return [report];
}

// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------

/**
 * Run every Unicode detector over each discovered text file.
 *
 * Files for which readTextFile yields `null` are skipped and not counted.
 * Content is split on "\n", one trailing "\r" is dropped per line, and empty
 * lines are skipped. Per line, the detectors run in a fixed order (zero-width,
 * Unicode tags, BIDI, homoglyphs) and their findings are appended in that order.
 *
 * Any exception ends the scan; the findings gathered so far are returned in
 * an 'error' result that carries the exception message.
 *
 * @param {string} targetPath        - Absolute root path being scanned (not read by this function)
 * @param {{ files: import('./lib/file-discovery.mjs').FileInfo[] }} discovery
 * @returns {Promise<object>}        - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const SCANNER_NAME = 'unicode-scanner';
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;

  const findings = [];
  let filesScanned = 0;

  try {
    // Order matters: it fixes the order of findings reported for a given line.
    const detectors = [
      scanLineForZeroWidth,
      scanLineForUnicodeTags,
      scanLineForBidi,
      scanLineForHomoglyphs,
    ];

    for (const fileInfo of discovery.files) {
      const text = await readTextFile(fileInfo.absPath);
      if (text === null) {
        continue; // binary or unreadable file
      }
      filesScanned += 1;

      let lineNumber = 0;
      for (const rawLine of text.split('\n')) {
        lineNumber += 1;

        // Drop the carriage return left behind by Windows line endings.
        const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;
        if (line === '') {
          continue;
        }

        for (const detect of detectors) {
          findings.push(...detect(line, lineNumber, fileInfo.relPath));
        }
      }
    }

    // 'ok' reports that the scan ran to completion, regardless of what it found.
    return scannerResult(SCANNER_NAME, 'ok', findings, filesScanned, elapsedMs());
  } catch (err) {
    return scannerResult(
      SCANNER_NAME,
      'error',
      findings,
      filesScanned,
      elapsedMs(),
      err.message,
    );
  }
}