// unicode-scanner.mjs — Detects hidden Unicode characters used for prompt injection
// and code obfuscation: zero-width chars, Unicode tag codepoints (steganography),
// BIDI override characters (Trojan Source), and homoglyph mixing.
//
// Zero external dependencies — Node.js builtins only.
// OWASP coverage: LLM01 (Prompt Injection), LLM03 (Supply Chain)
|
||
import { readTextFile } from './lib/file-discovery.mjs';
|
||
import { finding, scannerResult } from './lib/output.mjs';
|
||
import { SEVERITY } from './lib/severity.mjs';
|
||
|
||
// ---------------------------------------------------------------------------
// Character sets
// ---------------------------------------------------------------------------

/**
 * Code points that render without a visible glyph and can therefore be used
 * to hide content: U+200B–U+200D, U+FEFF and U+00AD.
 */
const ZERO_WIDTH_CHARS = new Set([
  0x200B, // ZERO WIDTH SPACE
  0x200C, // ZERO WIDTH NON-JOINER
  0x200D, // ZERO WIDTH JOINER
  0xFEFF, // ZERO WIDTH NO-BREAK SPACE (doubles as the BOM when it is the first character of a file)
  0x00AD, // SOFT HYPHEN
]);
|
||
|
||
/**
 * Inclusive bounds of the Unicode Tags range that is scanned (U+E0001–U+E007F).
 * A tag character hides an ASCII character at `codepoint - 0xE0000`.
 */
const UNICODE_TAG_START = 0xE0001;
const UNICODE_TAG_END = 0xE007F;
|
||
|
||
/**
 * Bidirectional embedding, override and isolate control characters — the
 * code points behind the Trojan Source attack (CVE-2021-42574 class).
 */
const BIDI_CHARS = new Set([
  0x202A, // LEFT-TO-RIGHT EMBEDDING
  0x202B, // RIGHT-TO-LEFT EMBEDDING
  0x202C, // POP DIRECTIONAL FORMATTING
  0x202D, // LEFT-TO-RIGHT OVERRIDE
  0x202E, // RIGHT-TO-LEFT OVERRIDE
  0x2066, // LEFT-TO-RIGHT ISOLATE
  0x2067, // RIGHT-TO-LEFT ISOLATE
  0x2068, // FIRST STRONG ISOLATE
  0x2069, // POP DIRECTIONAL ISOLATE
]);
|
||
|
||
/**
 * Cyrillic code points whose glyphs visually match Latin letters.
 *
 * NOTE(review): none of the detectors visible in this file read this set —
 * the homoglyph check tests the whole Cyrillic block instead. Confirm whether
 * it is consumed elsewhere before removing or wiring it in.
 */
const CYRILLIC_CONFUSABLES = new Set([
  0x0430, // а — Cyrillic small letter a (looks like Latin a)
  0x0435, // е — Cyrillic small letter ie (looks like Latin e)
  0x043E, // о — Cyrillic small letter o (looks like Latin o)
  0x0441, // с — Cyrillic small letter es (looks like Latin c)
  0x0440, // р — Cyrillic small letter er (looks like Latin p)
  0x0443, // у — Cyrillic small letter u (looks like Latin y)
  0x0445, // х — Cyrillic small letter ha (looks like Latin x)
  0x0410, // А — Cyrillic capital letter a
  0x0415, // Е — Cyrillic capital letter ie
  0x041E, // О — Cyrillic capital letter o
  0x0421, // С — Cyrillic capital letter es
  0x0420, // Р — Cyrillic capital letter er
  0x0425, // Х — Cyrillic capital letter ha
]);
|
||
|
||
// ---------------------------------------------------------------------------
// Helper: format hex codepoint list for evidence strings
// ---------------------------------------------------------------------------

/**
 * Format an array of {cp, pos} objects as a readable evidence string.
 *
 * Each code point is printed as upper-case hex, zero-padded to at least four
 * digits, and its zero-based `pos` is reported as a one-based column.
 *
 * @param {Array<{cp: number, pos: number}>} hits
 * @returns {string} e.g. "U+200B at col 5, U+200D at col 12"; "" when `hits` is empty
 */
function formatEvidence(hits) {
  return hits
    .map((hit) => {
      const hex = hit.cp.toString(16).toUpperCase().padStart(4, '0');
      return `U+${hex} at col ${hit.pos + 1}`;
    })
    .join(', ');
}
|
||
|
||
// ---------------------------------------------------------------------------
// Category 1: Zero-Width Character detection
// ---------------------------------------------------------------------------

/**
 * Scan a single line for zero-width characters.
 *
 * Produces at most one finding per line. The finding is HIGH by default and
 * escalates to CRITICAL when nothing but zero-width characters and whitespace
 * is on the line (it looks blank yet carries content).
 *
 * A U+FEFF that is the very first character of line 1 is treated as the
 * file's byte order mark and is not reported.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array, or an array holding the single finding
 */
function scanLineForZeroWidth(line, lineNumber, relPath) {
  const hits = [];

  let pos = 0; // UTF-16 offset of the current code point within the line
  for (const char of line) {
    const cp = char.codePointAt(0);
    // A leading U+FEFF is an encoding signature (BOM), not hidden content.
    const isLeadingBom = cp === 0xFEFF && lineNumber === 1 && pos === 0;
    if (ZERO_WIDTH_CHARS.has(cp) && !isLeadingBom) {
      hits.push({ cp, pos });
    }
    pos += char.length; // for...of yields whole code points; astral ones span two units
  }

  if (hits.length === 0) return [];

  // The line is "visually empty" when stripping zero-width characters and
  // whitespace leaves nothing behind.
  const isVisuallyEmpty = [...line].every(
    (ch) => ZERO_WIDTH_CHARS.has(ch.codePointAt(0)) || /\s/.test(ch),
  );

  const severity = isVisuallyEmpty ? SEVERITY.CRITICAL : SEVERITY.HIGH;
  const title = isVisuallyEmpty
    ? 'Visually empty line with hidden zero-width characters'
    : 'Zero-width characters detected in line';

  const description = isVisuallyEmpty
    ? `Line ${lineNumber} appears blank but contains ${hits.length} zero-width character(s). ` +
      'This is a strong indicator of hidden prompt injection content.'
    : `Line ${lineNumber} contains ${hits.length} zero-width character(s) that are invisible to readers ` +
      'but processed by LLMs. Can be used to smuggle hidden instructions.';

  return [
    finding({
      scanner: 'UNI',
      severity,
      title,
      description,
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all zero-width characters. Use a hex editor or `cat -A` to reveal them. ' +
        'Consider adding a pre-commit hook that rejects files containing U+200B/200C/200D/FEFF/00AD.',
    }),
  ];
}
|
||
|
||
// ---------------------------------------------------------------------------
// Category 2: Unicode Tag Codepoints (steganography)
// ---------------------------------------------------------------------------

/**
 * Decode the hidden ASCII message carried by Unicode Tag codepoints.
 *
 * Each hit maps to the character at `codepoint - 0xE0000`. Hits whose mapped
 * value falls outside printable ASCII (0x20–0x7E) — for example U+E0001
 * LANGUAGE TAG or U+E007F CANCEL TAG — are rendered as "?".
 *
 * @param {Array<{cp: number, pos: number}>} tagHits
 * @returns {string} Decoded string, e.g. "rm -rf /"
 */
function decodeTagMessage(tagHits) {
  return tagHits
    .map(({ cp }) => {
      const ascii = cp - 0xE0000;
      const isPrintable = ascii >= 0x20 && ascii <= 0x7E;
      return isPrintable ? String.fromCharCode(ascii) : '?';
    })
    .join('');
}
|
||
|
||
/**
 * Scan a single line for Unicode Tag block codepoints.
 *
 * Produces at most one CRITICAL finding per line; its description and
 * evidence include the ASCII message decoded from the tag characters.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array, or an array holding the single finding
 */
function scanLineForUnicodeTags(line, lineNumber, relPath) {
  const hits = [];

  let pos = 0; // UTF-16 offset of the current code point within the line
  for (const char of line) {
    const cp = char.codePointAt(0);
    if (cp >= UNICODE_TAG_START && cp <= UNICODE_TAG_END) {
      hits.push({ cp, pos });
    }
    pos += char.length; // tag characters are astral, so each advances by two units
  }

  if (hits.length === 0) return [];

  const decoded = decodeTagMessage(hits);
  const cpList = formatEvidence(hits);

  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.CRITICAL,
      title: 'Unicode Tag block codepoints detected (steganographic hidden message)',
      description:
        `Line ${lineNumber} contains ${hits.length} character(s) from the Unicode Tags block ` +
        `(U+E0001–U+E007F). These encode a hidden ASCII message: "${decoded}". ` +
        'This is deliberate steganography and a strong indicator of supply chain attack.',
      file: relPath,
      line: lineNumber,
      evidence: `${cpList} → decoded: "${decoded}"`,
      owasp: 'LLM03',
      recommendation:
        'Remove all Unicode Tag codepoints immediately. This file should not be trusted. ' +
        'Investigate how these characters were introduced — they cannot appear accidentally.',
    }),
  ];
}
|
||
|
||
// ---------------------------------------------------------------------------
// Category 3: BIDI Override Characters (Trojan Source)
// ---------------------------------------------------------------------------

/**
 * Scan a single line for BIDI override characters.
 *
 * Produces at most one HIGH finding per line, listing every bidirectional
 * control character found on it.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array, or an array holding the single finding
 */
function scanLineForBidi(line, lineNumber, relPath) {
  const hits = [];

  let pos = 0; // UTF-16 offset of the current code point within the line
  for (const char of line) {
    const cp = char.codePointAt(0);
    if (BIDI_CHARS.has(cp)) {
      hits.push({ cp, pos });
    }
    pos += char.length;
  }

  if (hits.length === 0) return [];

  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.HIGH,
      title: 'BIDI override character detected (Trojan Source attack vector)',
      description:
        `Line ${lineNumber} contains ${hits.length} bidirectional override character(s). ` +
        'BIDI controls can make code appear different to humans than to interpreters/LLMs. ' +
        'This is the Trojan Source technique (see CVE-2021-42574 class of vulnerabilities).',
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all BIDI override characters. Legitimate multilingual text rarely needs ' +
        'explicit BIDI overrides in source code. Enable editor/IDE BIDI character warnings.',
    }),
  ];
}
|
||
|
||
// ---------------------------------------------------------------------------
// Category 4: Homoglyph Detection (Latin/Cyrillic mixing)
// ---------------------------------------------------------------------------

/**
 * Matches word-like tokens: runs of Unicode letters, Unicode numbers and "_".
 * The regex is global, so `exec`/`test` on it are stateful via `lastIndex`.
 */
const TOKEN_RE = /[\p{L}\p{N}_]+/gu;
|
||
|
||
/**
 * Basic Latin letter check: true only for ASCII A–Z and a–z.
 * Accented or full-width Latin letters are not counted.
 *
 * @param {number} cp - Unicode code point
 * @returns {boolean}
 */
function isLatin(cp) {
  const isUpper = cp >= 0x0041 && cp <= 0x005A; // A-Z
  const isLower = cp >= 0x0061 && cp <= 0x007A; // a-z
  return isUpper || isLower;
}
|
||
|
||
/**
 * Cyrillic block check: true for code points in U+0400–U+04FF (inclusive).
 *
 * @param {number} cp - Unicode code point
 * @returns {boolean}
 */
function isCyrillic(cp) {
  return cp >= 0x0400 && cp <= 0x04FF;
}
|
||
|
||
/**
 * Scan a single line for tokens that mix Latin and Cyrillic characters.
 *
 * A token is suspicious when it contains at least one ASCII Latin letter and
 * at least one code point from the Cyrillic block. All suspicious tokens on
 * the line are consolidated into one MEDIUM finding.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array, or an array holding the single finding
 */
function scanLineForHomoglyphs(line, lineNumber, relPath) {
  const suspiciousTokens = [];

  // matchAll iterates over a clone of the regex, so the shared global
  // TOKEN_RE's lastIndex is never touched and needs no manual reset.
  for (const [token] of line.matchAll(TOKEN_RE)) {
    let hasLatin = false;
    const cyrillicChars = [];

    for (const ch of token) {
      const cp = ch.codePointAt(0);
      if (isLatin(cp)) hasLatin = true;
      if (isCyrillic(cp)) {
        cyrillicChars.push(`U+${cp.toString(16).toUpperCase().padStart(4, '0')}`);
      }
    }

    if (hasLatin && cyrillicChars.length > 0) {
      suspiciousTokens.push({ token, cyrillicChars });
    }
  }

  if (suspiciousTokens.length === 0) return [];

  const tokenList = suspiciousTokens
    .map((t) => `"${t.token}" (Cyrillic: ${t.cyrillicChars.join(', ')})`)
    .join('; ');

  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.MEDIUM,
      title: 'Homoglyph mixing detected: Latin and Cyrillic in same identifier',
      description:
        `Line ${lineNumber} contains ${suspiciousTokens.length} token(s) that mix Latin and ` +
        'Cyrillic characters. Cyrillic confusables (а, е, о, с, р, у, х) look identical to ' +
        'Latin letters but have different codepoints — enabling invisible identifier spoofing.',
      file: relPath,
      line: lineNumber,
      evidence: tokenList,
      owasp: 'LLM01',
      recommendation:
        'Normalize all identifiers to a single script. Use a Unicode confusables checker ' +
        '(e.g., Unicode CLDR confusable-mappings.txt) and enforce a single-script policy ' +
        'via linter rules (ESLint `no-misleading-character-class`, Rust `confusable_idents`).',
    }),
  ];
}
|
||
|
||
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------

/**
 * Scan all discovered text files for hidden Unicode attack characters.
 *
 * Every file in `discovery.files` is read with `readTextFile`; files for
 * which it yields `null` are skipped and not counted. Each non-empty line is
 * passed through the four per-line detectors. If anything throws, the
 * findings gathered so far are returned in an 'error' result together with
 * the failure message.
 *
 * @param {string} targetPath - Absolute root path being scanned
 * @param {{ files: import('./lib/file-discovery.mjs').FileInfo[] }} discovery
 * @returns {Promise<object>} - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const findings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      const content = await readTextFile(fileInfo.absPath);

      // Skip binary files or unreadable files
      if (content === null) continue;

      filesScanned++;

      // Split preserving empty lines; strip trailing \r for Windows line endings
      const lines = content.split('\n').map((l) => l.replace(/\r$/, ''));

      for (let i = 0; i < lines.length; i++) {
        const lineNumber = i + 1;
        const line = lines[i];

        // Skip entirely empty lines early — nothing to detect
        if (line.length === 0) continue;

        // Run all four detectors per line
        findings.push(...scanLineForZeroWidth(line, lineNumber, fileInfo.relPath));
        findings.push(...scanLineForUnicodeTags(line, lineNumber, fileInfo.relPath));
        findings.push(...scanLineForBidi(line, lineNumber, fileInfo.relPath));
        findings.push(...scanLineForHomoglyphs(line, lineNumber, fileInfo.relPath));
      }
    }

    const durationMs = Date.now() - startMs;

    // Status is 'ok' even with findings (status reflects execution, not severity)
    return scannerResult('unicode-scanner', 'ok', findings, filesScanned, durationMs);
  } catch (err) {
    const durationMs = Date.now() - startMs;
    // Anything can be thrown in JavaScript; keep a usable message for non-Error values.
    const message = err instanceof Error ? err.message : String(err);
    return scannerResult(
      'unicode-scanner',
      'error',
      findings,
      filesScanned,
      durationMs,
      message,
    );
  }
}
|