ktg-plugin-marketplace/plugins/llm-security/scanners/unicode-scanner.mjs

385 lines
13 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// unicode-scanner.mjs — Detects hidden Unicode characters used for prompt injection
// and code obfuscation: zero-width chars, Unicode tag codepoints (steganography),
// BIDI override characters (Trojan Source), and homoglyph mixing.
//
// Zero external dependencies — Node.js builtins only.
// OWASP coverage: LLM01 (Prompt Injection), LLM03 (Supply Chain)
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
// ---------------------------------------------------------------------------
// Character sets
// ---------------------------------------------------------------------------
/**
 * Code points that render with no visible glyph (U+200B–U+200D, U+FEFF,
 * U+00AD) and can therefore be used to hide content inside a line.
 */
const ZERO_WIDTH_CHARS = new Set([
  0x200B, // zero width space (ZWSP)
  0x200C, // zero width non-joiner (ZWNJ)
  0x200D, // zero width joiner (ZWJ)
  0xFEFF, // zero width no-break space; doubles as the byte order mark (BOM)
  0x00AD, // soft hyphen (SHY)
]);
/**
 * Inclusive bounds of the Unicode Tags block (U+E0001–U+E007F).
 * A tag character hides one ASCII character, recovered as codepoint - 0xE0000.
 */
const UNICODE_TAG_START = 0xE0000 + 0x01;
const UNICODE_TAG_END = 0xE0000 + 0x7F;
/**
 * Bidirectional formatting controls abused by the Trojan Source attack
 * (CVE-2021-42574 class): embeddings, overrides, isolates and their pops.
 */
const BIDI_CHARS = new Set([
  // Embeddings / overrides and their terminator
  0x202A, // LRE — left-to-right embedding
  0x202B, // RLE — right-to-left embedding
  0x202C, // PDF — pop directional formatting
  0x202D, // LRO — left-to-right override
  0x202E, // RLO — right-to-left override
  // Isolates and their terminator
  0x2066, // LRI — left-to-right isolate
  0x2067, // RLI — right-to-left isolate
  0x2068, // FSI — first strong isolate
  0x2069, // PDI — pop directional isolate
]);
/** Cyrillic code points whose glyphs are visually confusable with Latin letters. */
const CYRILLIC_CONFUSABLES = new Set([
  // Lowercase
  0x0430, // а — CYRILLIC SMALL LETTER A   (resembles Latin a)
  0x0435, // е — CYRILLIC SMALL LETTER IE  (resembles Latin e)
  0x043E, // о — CYRILLIC SMALL LETTER O   (resembles Latin o)
  0x0441, // с — CYRILLIC SMALL LETTER ES  (resembles Latin c)
  0x0440, // р — CYRILLIC SMALL LETTER ER  (resembles Latin p)
  0x0443, // у — CYRILLIC SMALL LETTER U   (resembles Latin y)
  0x0445, // х — CYRILLIC SMALL LETTER HA  (resembles Latin x)
  // Uppercase
  0x0410, // А — CYRILLIC CAPITAL LETTER A
  0x0415, // Е — CYRILLIC CAPITAL LETTER IE
  0x041E, // О — CYRILLIC CAPITAL LETTER O
  0x0421, // С — CYRILLIC CAPITAL LETTER ES
  0x0420, // Р — CYRILLIC CAPITAL LETTER ER
  0x0425, // Х — CYRILLIC CAPITAL LETTER HA
]);
// ---------------------------------------------------------------------------
// Helper: format hex codepoint list for evidence strings
// ---------------------------------------------------------------------------
/**
 * Render code point hits as a comma-separated evidence string.
 * Each code point is printed as upper-case hex padded to at least four digits,
 * and the stored 0-based position is reported as a 1-based column.
 *
 * @param {Array<{cp: number, pos: number}>} hits
 * @returns {string} e.g. "U+200B at col 5, U+200D at col 12"
 */
function formatEvidence(hits) {
  const parts = [];
  for (const hit of hits) {
    const hex = hit.cp.toString(16).toUpperCase().padStart(4, '0');
    parts.push(`U+${hex} at col ${hit.pos + 1}`);
  }
  return parts.join(', ');
}
// ---------------------------------------------------------------------------
// Category 1: Zero-Width Character detection
// ---------------------------------------------------------------------------
/**
 * Scan a single line for zero-width characters.
 *
 * Returns zero or one finding per line. The finding is CRITICAL when nothing
 * but zero-width characters and whitespace is on the line (it looks blank but
 * carries content), and HIGH otherwise.
 *
 * A U+FEFF that is the very first character of line 1 is treated as a byte
 * order mark and is not reported; U+FEFF anywhere else is still a hit.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Array of finding objects (empty when the line is clean)
 */
function scanLineForZeroWidth(line, lineNumber, relPath) {
  const hits = [];
  let pos = 0; // offset in UTF-16 code units, so astral characters advance by 2
  for (const char of line) {
    const cp = char.codePointAt(0);
    // A leading U+FEFF on the first line is an encoding signature, not hidden content.
    const isLeadingBom = cp === 0xFEFF && lineNumber === 1 && pos === 0;
    if (ZERO_WIDTH_CHARS.has(cp) && !isLeadingBom) {
      hits.push({ cp, pos });
    }
    pos += char.length;
  }
  if (hits.length === 0) return [];

  // The line is visually empty when every character is zero-width or whitespace.
  const isVisuallyEmpty = [...line].every(
    ch => ZERO_WIDTH_CHARS.has(ch.codePointAt(0)) || /\s/.test(ch),
  );
  const severity = isVisuallyEmpty ? SEVERITY.CRITICAL : SEVERITY.HIGH;
  const title = isVisuallyEmpty
    ? 'Visually empty line with hidden zero-width characters'
    : 'Zero-width characters detected in line';
  const description = isVisuallyEmpty
    ? `Line ${lineNumber} appears blank but contains ${hits.length} zero-width character(s). ` +
      'This is a strong indicator of hidden prompt injection content.'
    : `Line ${lineNumber} contains ${hits.length} zero-width character(s) that are invisible to readers ` +
      'but processed by LLMs. Can be used to smuggle hidden instructions.';

  return [
    finding({
      scanner: 'UNI',
      severity,
      title,
      description,
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all zero-width characters. Use a hex editor or `cat -A` to reveal them. ' +
        'Consider adding a pre-commit hook that rejects files containing U+200B/200C/200D/FEFF/00AD.',
    }),
  ];
}
// ---------------------------------------------------------------------------
// Category 2: Unicode Tag Codepoints (steganography)
// ---------------------------------------------------------------------------
/**
 * Recover the ASCII text hidden in a run of Unicode Tag code points.
 * Each hit maps to the character with code (codepoint - 0xE0000); any value
 * outside printable ASCII (0x20-0x7E) is shown as "?" instead.
 *
 * @param {Array<{cp: number, pos: number}>} tagHits
 * @returns {string} Decoded string, e.g. "rm -rf /"
 */
function decodeTagMessage(tagHits) {
  let decoded = '';
  for (const hit of tagHits) {
    const ascii = hit.cp - 0xE0000;
    const isPrintable = ascii >= 0x20 && ascii <= 0x7E;
    decoded += isPrintable ? String.fromCharCode(ascii) : '?';
  }
  return decoded;
}
/**
 * Scan a single line for code points from the Unicode Tags block.
 *
 * Returns zero or one CRITICAL finding per line. The finding carries both the
 * raw code point list and the ASCII message decoded from the tag characters.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Array of finding objects (empty when the line is clean)
 */
function scanLineForUnicodeTags(line, lineNumber, relPath) {
  const hits = [];
  let pos = 0; // offset in UTF-16 code units; every tag character occupies two
  for (const char of line) {
    const cp = char.codePointAt(0);
    if (cp >= UNICODE_TAG_START && cp <= UNICODE_TAG_END) {
      hits.push({ cp, pos });
    }
    pos += char.length;
  }
  if (hits.length === 0) return [];

  const decoded = decodeTagMessage(hits);
  const cpList = formatEvidence(hits);
  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.CRITICAL,
      title: 'Unicode Tag block codepoints detected (steganographic hidden message)',
      // The range separator is written as an escape (\u2013, EN DASH) so the
      // message keeps its dash even if tooling strips confusable characters.
      description:
        `Line ${lineNumber} contains ${hits.length} character(s) from the Unicode Tags block ` +
        `(U+E0001\u2013U+E007F). These encode a hidden ASCII message: "${decoded}". ` +
        'This is deliberate steganography and a strong indicator of supply chain attack.',
      file: relPath,
      line: lineNumber,
      evidence: `${cpList} → decoded: "${decoded}"`,
      owasp: 'LLM03',
      recommendation:
        'Remove all Unicode Tag codepoints immediately. This file should not be trusted. ' +
        'Investigate how these characters were introduced — they cannot appear accidentally.',
    }),
  ];
}
// ---------------------------------------------------------------------------
// Category 3: BIDI Override Characters (Trojan Source)
// ---------------------------------------------------------------------------
/**
 * Look for bidirectional control characters (members of BIDI_CHARS) in one line.
 * Produces at most one HIGH finding per line, listing every hit as evidence.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array when the line has no BIDI controls
 */
function scanLineForBidi(line, lineNumber, relPath) {
  const bidiHits = [];
  // Walk by UTF-16 offset; a code point above U+FFFF spans two code units.
  for (let offset = 0; offset < line.length; ) {
    const codePoint = line.codePointAt(offset);
    if (BIDI_CHARS.has(codePoint)) {
      bidiHits.push({ cp: codePoint, pos: offset });
    }
    offset += codePoint > 0xFFFF ? 2 : 1;
  }

  if (bidiHits.length === 0) {
    return [];
  }

  const description = [
    `Line ${lineNumber} contains ${bidiHits.length} bidirectional override character(s). `,
    'BIDI controls can make code appear different to humans than to interpreters/LLMs. ',
    'This is the Trojan Source technique (see CVE-2021-42574 class of vulnerabilities).',
  ].join('');
  const recommendation = [
    'Remove all BIDI override characters. Legitimate multilingual text rarely needs ',
    'explicit BIDI overrides in source code. Enable editor/IDE BIDI character warnings.',
  ].join('');

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.HIGH,
    title: 'BIDI override character detected (Trojan Source attack vector)',
    description,
    file: relPath,
    line: lineNumber,
    evidence: formatEvidence(bidiHits),
    owasp: 'LLM01',
    recommendation,
  });
  return [report];
}
// ---------------------------------------------------------------------------
// Category 4: Homoglyph Detection (Latin/Cyrillic mixing)
// ---------------------------------------------------------------------------
/**
 * Matches identifier-like tokens: runs of underscores, Unicode letters and
 * Unicode digits. Global + unicode flags, so it is stateful via lastIndex.
 */
const TOKEN_RE = /[_\p{L}\p{N}]+/gu;
/** True when the code point is an ASCII Latin letter (A-Z or a-z). */
function isLatin(cp) {
  const isUpper = cp >= 0x41 && cp <= 0x5A;
  const isLower = cp >= 0x61 && cp <= 0x7A;
  return isUpper || isLower;
}
/** True when the code point lies in the Cyrillic block (U+0400–U+04FF, inclusive). */
function isCyrillic(cp) {
  const blockStart = 0x0400;
  const blockEnd = 0x04FF;
  return blockStart <= cp && cp <= blockEnd;
}
/**
 * Find tokens on one line that contain both an ASCII Latin letter and a
 * Cyrillic-block character. All such tokens are folded into a single MEDIUM
 * finding whose evidence lists each token with its Cyrillic code points.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array when no token mixes the two scripts
 */
function scanLineForHomoglyphs(line, lineNumber, relPath) {
  // With the global flag, String#match starts from index 0 and returns every
  // token at once (or null when there are none).
  const tokens = line.match(TOKEN_RE) || [];

  const mixedScriptTokens = [];
  for (const token of tokens) {
    const codePoints = Array.from(token, ch => ch.codePointAt(0));
    const cyrillicPoints = codePoints.filter(cp => isCyrillic(cp));
    const mixesScripts = cyrillicPoints.length > 0 && codePoints.some(cp => isLatin(cp));
    if (!mixesScripts) continue;

    mixedScriptTokens.push({
      token,
      cyrillicChars: cyrillicPoints.map(
        cp => `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`,
      ),
    });
  }

  if (mixedScriptTokens.length === 0) {
    return [];
  }

  const evidence = mixedScriptTokens
    .map(entry => `"${entry.token}" (Cyrillic: ${entry.cyrillicChars.join(', ')})`)
    .join('; ');
  const description = [
    `Line ${lineNumber} contains ${mixedScriptTokens.length} token(s) that mix Latin and `,
    'Cyrillic characters. Cyrillic confusables (а, е, о, с, р, у, х) look identical to ',
    'Latin letters but have different codepoints — enabling invisible identifier spoofing.',
  ].join('');
  const recommendation = [
    'Normalize all identifiers to a single script. Use a Unicode confusables checker ',
    '(e.g., Unicode CLDR confusable-mappings.txt) and enforce a single-script policy ',
    'via linter rules (ESLint `no-misleading-character-class`, Rust `confusable_idents`).',
  ].join('');

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.MEDIUM,
    title: 'Homoglyph mixing detected: Latin and Cyrillic in same identifier',
    description,
    file: relPath,
    line: lineNumber,
    evidence,
    owasp: 'LLM01',
    recommendation,
  });
  return [report];
}
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------
/**
 * Run every per-line Unicode detector over each discovered file and collect
 * the findings into a scannerResult envelope.
 *
 * Files whose content comes back as null are skipped and not counted. Any
 * exception stops the scan and yields an 'error' result that still carries the
 * findings and file count gathered up to that point, plus the error message.
 *
 * @param {string} targetPath - Absolute root path being scanned (not read here)
 * @param {{ files: import('./lib/file-discovery.mjs').FileInfo[] }} discovery
 * @returns {Promise<object>} - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startedAt = Date.now();
  const findings = [];
  let filesScanned = 0;

  // Findings for a line are appended in this detector order.
  const lineDetectors = [
    scanLineForZeroWidth,
    scanLineForUnicodeTags,
    scanLineForBidi,
    scanLineForHomoglyphs,
  ];

  try {
    for (const fileInfo of discovery.files) {
      const text = await readTextFile(fileInfo.absPath);
      if (text === null) {
        // No text content for this file (binary or unreadable): nothing to scan.
        continue;
      }
      filesScanned += 1;

      for (const [index, rawLine] of text.split('\n').entries()) {
        // Drop the trailing \r left behind by Windows (\r\n) line endings.
        const line = rawLine.replace(/\r$/, '');
        if (line.length === 0) {
          continue; // empty lines cannot hold any character to detect
        }
        for (const detect of lineDetectors) {
          findings.push(...detect(line, index + 1, fileInfo.relPath));
        }
      }
    }

    // 'ok' describes the run itself, even when findings were produced.
    return scannerResult('unicode-scanner', 'ok', findings, filesScanned, Date.now() - startedAt);
  } catch (err) {
    return scannerResult(
      'unicode-scanner',
      'error',
      findings,
      filesScanned,
      Date.now() - startedAt,
      err.message,
    );
  }
}