// unicode-scanner.mjs — Detects hidden Unicode characters used for prompt injection
// and code obfuscation: zero-width chars, Unicode tag codepoints (steganography),
// BIDI override characters (Trojan Source), and homoglyph mixing.
//
// Zero external dependencies — Node.js builtins only.
// OWASP coverage: LLM01 (Prompt Injection), LLM03 (Supply Chain)

import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';

// ---------------------------------------------------------------------------
// Character sets
// ---------------------------------------------------------------------------

/** ZERO WIDTH NO-BREAK SPACE; doubles as the byte-order mark at the start of a file. */
const BOM = 0xFEFF;

/** U+200B–U+200D, U+FEFF, U+00AD: visually invisible, used to hide content */
const ZERO_WIDTH_CHARS = new Set([
  0x200B, // ZERO WIDTH SPACE
  0x200C, // ZERO WIDTH NON-JOINER
  0x200D, // ZERO WIDTH JOINER
  BOM,    // ZERO WIDTH NO-BREAK SPACE / BOM (when not at position 0)
  0x00AD, // SOFT HYPHEN
]);

/** Unicode Tags block U+E0001–U+E007F: encodes hidden ASCII via codepoint - 0xE0000 */
const UNICODE_TAG_START = 0xE0001;
const UNICODE_TAG_END = 0xE007F;

/** BIDI control characters — Trojan Source attack (CVE-2021-42574 class) */
const BIDI_CHARS = new Set([
  0x202A, // LEFT-TO-RIGHT EMBEDDING
  0x202B, // RIGHT-TO-LEFT EMBEDDING
  0x202C, // POP DIRECTIONAL FORMATTING
  0x202D, // LEFT-TO-RIGHT OVERRIDE
  0x202E, // RIGHT-TO-LEFT OVERRIDE
  0x2066, // LEFT-TO-RIGHT ISOLATE
  0x2067, // RIGHT-TO-LEFT ISOLATE
  0x2068, // FIRST STRONG ISOLATE
  0x2069, // POP DIRECTIONAL ISOLATE
]);

/**
 * Cyrillic lookalike codepoints that visually match Latin letters.
 *
 * NOTE: reference list only — nothing in this module reads it.
 * scanLineForHomoglyphs flags any codepoint in the Cyrillic block (see
 * isCyrillic), not just the ones listed here.
 */
const CYRILLIC_CONFUSABLES = new Set([
  0x0430, // а — Cyrillic small letter a (looks like Latin a)
  0x0435, // е — Cyrillic small letter ie (looks like Latin e)
  0x043E, // о — Cyrillic small letter o (looks like Latin o)
  0x0441, // с — Cyrillic small letter es (looks like Latin c)
  0x0440, // р — Cyrillic small letter er (looks like Latin p)
  0x0443, // у — Cyrillic small letter u (looks like Latin y)
  0x0445, // х — Cyrillic small letter ha (looks like Latin x)
  0x0410, // А — Cyrillic capital letter a
  0x0415, // Е — Cyrillic capital letter ie
  0x041E, // О — Cyrillic capital letter o
  0x0421, // С — Cyrillic capital letter es
  0x0420, // Р — Cyrillic capital letter er
  0x0425, // Х — Cyrillic capital letter ha
]);

/** Matches a single whitespace character (no `g` flag, so `.test` is stateless). */
const WHITESPACE_RE = /\s/;

// ---------------------------------------------------------------------------
// Helpers: codepoint collection and evidence formatting
// ---------------------------------------------------------------------------

/**
 * Render a codepoint in `U+XXXX` notation (upper-case hex, at least 4 digits).
 * @param {number} cp
 * @returns {string} e.g. "U+200B", "U+E0041"
 */
function formatCodepoint(cp) {
  return `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`;
}

/**
 * Format an array of {cp, pos} objects as a readable evidence string.
 * @param {Array<{cp: number, pos: number}>} hits - `pos` is a 0-based offset
 * @returns {string} e.g. "U+200B at col 5, U+200D at col 12" (columns are 1-based)
 */
function formatEvidence(hits) {
  return hits
    .map(({ cp, pos }) => `${formatCodepoint(cp)} at col ${pos + 1}`)
    .join(', ');
}

/**
 * Walk a line code point by code point and collect those accepted by `isMatch`.
 *
 * @param {string} line
 * @param {(cp: number, pos: number) => boolean} isMatch - receives the code
 *   point and its 0-based UTF-16 offset within the line
 * @returns {Array<{cp: number, pos: number}>}
 */
function collectHits(line, isMatch) {
  const hits = [];
  let pos = 0;
  // for...of iterates whole code points, so astral characters arrive as a
  // single two-unit string; advancing by char.length keeps `pos` a UTF-16 offset.
  for (const char of line) {
    const cp = char.codePointAt(0);
    if (isMatch(cp, pos)) {
      hits.push({ cp, pos });
    }
    pos += char.length;
  }
  return hits;
}

// ---------------------------------------------------------------------------
// Category 1: Zero-Width Character detection
// ---------------------------------------------------------------------------

/**
 * Scan a single line for zero-width characters.
 * Returns an array of findings (0 or 1 per line — one finding per line hit,
 * escalated to CRITICAL if the line is visually empty but has content).
 *
 * A U+FEFF at the very first position of line 1 is treated as a byte-order
 * mark (an encoding signature) and is not reported; U+FEFF anywhere else is.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Array of finding objects
 */
function scanLineForZeroWidth(line, lineNumber, relPath) {
  const hits = collectHits(line, (cp, pos) => {
    if (!ZERO_WIDTH_CHARS.has(cp)) return false;
    const isLeadingBom = cp === BOM && lineNumber === 1 && pos === 0;
    return !isLeadingBom;
  });
  if (hits.length === 0) return [];

  // The line is "visually empty" when nothing remains once zero-width
  // characters and whitespace are discarded → CRITICAL.
  const isVisuallyEmpty = [...line].every(
    (ch) => ZERO_WIDTH_CHARS.has(ch.codePointAt(0)) || WHITESPACE_RE.test(ch),
  );

  const severity = isVisuallyEmpty ? SEVERITY.CRITICAL : SEVERITY.HIGH;
  const title = isVisuallyEmpty
    ? 'Visually empty line with hidden zero-width characters'
    : 'Zero-width characters detected in line';
  const description = isVisuallyEmpty
    ? `Line ${lineNumber} appears blank but contains ${hits.length} zero-width character(s). ` +
      'This is a strong indicator of hidden prompt injection content.'
    : `Line ${lineNumber} contains ${hits.length} zero-width character(s) that are invisible to readers ` +
      'but processed by LLMs. Can be used to smuggle hidden instructions.';

  return [
    finding({
      scanner: 'UNI',
      severity,
      title,
      description,
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all zero-width characters. Use a hex editor or `cat -A` to reveal them. ' +
        'Consider adding a pre-commit hook that rejects files containing U+200B/200C/200D/FEFF/00AD.',
    }),
  ];
}

// ---------------------------------------------------------------------------
// Category 2: Unicode Tag Codepoints (steganography)
// ---------------------------------------------------------------------------

/**
 * Decode hidden ASCII message embedded in Unicode Tag codepoints.
 * Tag char encodes ASCII as: codepoint - 0xE0000
 * Values outside printable ASCII (0x20–0x7E) are rendered as "?" in the output.
 *
 * @param {Array<{cp: number, pos: number}>} tagHits
 * @returns {string} Decoded string, e.g. "rm -rf /"
 */
function decodeTagMessage(tagHits) {
  return tagHits
    .map(({ cp }) => {
      const ascii = cp - 0xE0000;
      const isPrintable = ascii >= 0x20 && ascii <= 0x7E;
      return isPrintable ? String.fromCharCode(ascii) : '?';
    })
    .join('');
}

/**
 * Scan a single line for Unicode Tag block codepoints.
 * Reports at most one CRITICAL finding per line, including the decoded message.
 *
 * @param {string} line
 * @param {number} lineNumber
 * @param {string} relPath
 * @returns {object[]}
 */
function scanLineForUnicodeTags(line, lineNumber, relPath) {
  const hits = collectHits(
    line,
    (cp) => cp >= UNICODE_TAG_START && cp <= UNICODE_TAG_END,
  );
  if (hits.length === 0) return [];

  const decoded = decodeTagMessage(hits);
  const cpList = formatEvidence(hits);

  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.CRITICAL,
      title: 'Unicode Tag block codepoints detected (steganographic hidden message)',
      description:
        `Line ${lineNumber} contains ${hits.length} character(s) from the Unicode Tags block ` +
        `(U+E0001–U+E007F). These encode a hidden ASCII message: "${decoded}". ` +
        'This is deliberate steganography and a strong indicator of supply chain attack.',
      file: relPath,
      line: lineNumber,
      evidence: `${cpList} → decoded: "${decoded}"`,
      owasp: 'LLM03',
      recommendation:
        'Remove all Unicode Tag codepoints immediately. This file should not be trusted. ' +
        'Investigate how these characters were introduced — they cannot appear accidentally.',
    }),
  ];
}

// ---------------------------------------------------------------------------
// Category 3: BIDI Override Characters (Trojan Source)
// ---------------------------------------------------------------------------

/**
 * Scan a single line for BIDI override characters.
 * Reports at most one HIGH finding per line listing every hit.
 *
 * @param {string} line
 * @param {number} lineNumber
 * @param {string} relPath
 * @returns {object[]}
 */
function scanLineForBidi(line, lineNumber, relPath) {
  const hits = collectHits(line, (cp) => BIDI_CHARS.has(cp));
  if (hits.length === 0) return [];

  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.HIGH,
      title: 'BIDI override character detected (Trojan Source attack vector)',
      description:
        `Line ${lineNumber} contains ${hits.length} bidirectional override character(s). ` +
        'BIDI controls can make code appear different to humans than to interpreters/LLMs. ' +
        'This is the Trojan Source technique (see CVE-2021-42574 class of vulnerabilities).',
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all BIDI override characters. Legitimate multilingual text rarely needs ' +
        'explicit BIDI overrides in source code. Enable editor/IDE BIDI character warnings.',
    }),
  ];
}

// ---------------------------------------------------------------------------
// Category 4: Homoglyph Detection (Latin/Cyrillic mixing)
// ---------------------------------------------------------------------------

/** Regex to extract word-like tokens including Unicode letters */
const TOKEN_RE = /[\p{L}\p{N}_]+/gu;

/** Latin letter range check (ASCII A-Z / a-z only) */
function isLatin(cp) {
  return (
    (cp >= 0x0041 && cp <= 0x005A) || // A-Z
    (cp >= 0x0061 && cp <= 0x007A)    // a-z
  );
}

/** Cyrillic block check (U+0400–U+04FF) */
function isCyrillic(cp) {
  return cp >= 0x0400 && cp <= 0x04FF;
}

/**
 * Scan a single line for tokens that mix Latin and Cyrillic characters.
 * Reports one finding per line (consolidating all suspicious tokens).
 *
 * @param {string} line
 * @param {number} lineNumber
 * @param {string} relPath
 * @returns {object[]}
 */
function scanLineForHomoglyphs(line, lineNumber, relPath) {
  const suspiciousTokens = [];

  // matchAll iterates over a private copy of the regex, so the shared global
  // TOKEN_RE never carries lastIndex state between calls.
  for (const [token] of line.matchAll(TOKEN_RE)) {
    let hasLatin = false;
    const cyrillicChars = [];

    for (const ch of token) {
      const cp = ch.codePointAt(0);
      if (isLatin(cp)) {
        hasLatin = true;
      }
      if (isCyrillic(cp)) {
        cyrillicChars.push(formatCodepoint(cp));
      }
    }

    if (hasLatin && cyrillicChars.length > 0) {
      suspiciousTokens.push({ token, cyrillicChars });
    }
  }

  if (suspiciousTokens.length === 0) return [];

  const tokenList = suspiciousTokens
    .map((t) => `"${t.token}" (Cyrillic: ${t.cyrillicChars.join(', ')})`)
    .join('; ');

  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.MEDIUM,
      title: 'Homoglyph mixing detected: Latin and Cyrillic in same identifier',
      description:
        `Line ${lineNumber} contains ${suspiciousTokens.length} token(s) that mix Latin and ` +
        'Cyrillic characters. Cyrillic confusables (а, е, о, с, р, у, х) look identical to ' +
        'Latin letters but have different codepoints — enabling invisible identifier spoofing.',
      file: relPath,
      line: lineNumber,
      evidence: tokenList,
      owasp: 'LLM01',
      recommendation:
        'Normalize all identifiers to a single script. Use a Unicode confusables checker ' +
        '(e.g., Unicode CLDR confusable-mappings.txt) and enforce a single-script policy ' +
        'via linter rules (ESLint `no-misleading-character-class`, Rust `confusable_idents`).',
    }),
  ];
}

// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------

/**
 * Scan all discovered text files for hidden Unicode attack characters.
 *
 * Each file is read with readTextFile, split into lines, and every non-empty
 * line is passed through the four detectors above. Files for which
 * readTextFile returns null are skipped and not counted. If anything throws,
 * an 'error' result is returned carrying the findings gathered so far.
 *
 * @param {string} targetPath - Absolute root path being scanned
 * @param {{ files: import('./lib/file-discovery.mjs').FileInfo[] }} discovery
 * @returns {Promise<object>} - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const findings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      const content = await readTextFile(fileInfo.absPath);

      // Skip binary files or unreadable files
      if (content === null) continue;

      filesScanned++;

      // Split preserving empty lines; strip trailing \r for Windows line endings
      const lines = content.split('\n').map((l) => l.replace(/\r$/, ''));

      for (let i = 0; i < lines.length; i++) {
        const lineNumber = i + 1;
        const line = lines[i];

        // Skip entirely empty lines early — nothing to detect
        if (line.length === 0) continue;

        // Run all four detectors per line
        findings.push(
          ...scanLineForZeroWidth(line, lineNumber, fileInfo.relPath),
          ...scanLineForUnicodeTags(line, lineNumber, fileInfo.relPath),
          ...scanLineForBidi(line, lineNumber, fileInfo.relPath),
          ...scanLineForHomoglyphs(line, lineNumber, fileInfo.relPath),
        );
      }
    }

    const durationMs = Date.now() - startMs;

    // Determine status: 'ok' even with findings (status reflects execution, not severity)
    return scannerResult('unicode-scanner', 'ok', findings, filesScanned, durationMs);
  } catch (err) {
    const durationMs = Date.now() - startMs;
    // A thrown non-Error value has no .message; fall back to its string form.
    const message = err instanceof Error ? err.message : String(err);
    return scannerResult(
      'unicode-scanner',
      'error',
      findings,
      filesScanned,
      durationMs,
      message,
    );
  }
}