feat: initial open marketplace with llm-security, config-audit, ultraplan-local

This commit is contained in:
Kjell Tore Guttormsen 2026-04-06 18:47:49 +02:00
commit f93d6abdae
380 changed files with 65935 additions and 0 deletions

View file

@ -0,0 +1,385 @@
// unicode-scanner.mjs — Detects hidden Unicode characters used for prompt injection
// and code obfuscation: zero-width chars, Unicode tag codepoints (steganography),
// BIDI override characters (Trojan Source), and homoglyph mixing.
//
// Zero external dependencies — Node.js builtins only.
// OWASP coverage: LLM01 (Prompt Injection), LLM03 (Supply Chain)
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
// ---------------------------------------------------------------------------
// Character sets
// ---------------------------------------------------------------------------
/**
 * Invisible formatting code points that can hide content inside a line
 * (U+200B–U+200D, U+FEFF, U+00AD):
 *   U+200B ZERO WIDTH SPACE
 *   U+200C ZERO WIDTH NON-JOINER
 *   U+200D ZERO WIDTH JOINER
 *   U+FEFF ZERO WIDTH NO-BREAK SPACE (the same code point serves as the BOM)
 *   U+00AD SOFT HYPHEN
 */
const ZERO_WIDTH_CHARS = new Set([0x200B, 0x200C, 0x200D, 0xFEFF, 0x00AD]);

/**
 * Inclusive bounds of the Unicode Tags range checked by the scanner
 * (U+E0001–U+E007F). A tag character carries a hidden ASCII value equal to
 * its code point minus 0xE0000.
 */
const UNICODE_TAG_START = 0xE0001;
const UNICODE_TAG_END = 0xE007F;

/**
 * Bidirectional control characters, the building blocks of the Trojan Source
 * attack (CVE-2021-42574 class):
 *   U+202A LEFT-TO-RIGHT EMBEDDING     U+202B RIGHT-TO-LEFT EMBEDDING
 *   U+202C POP DIRECTIONAL FORMATTING  U+202D LEFT-TO-RIGHT OVERRIDE
 *   U+202E RIGHT-TO-LEFT OVERRIDE      U+2066 LEFT-TO-RIGHT ISOLATE
 *   U+2067 RIGHT-TO-LEFT ISOLATE       U+2068 FIRST STRONG ISOLATE
 *   U+2069 POP DIRECTIONAL ISOLATE
 */
const BIDI_CHARS = new Set([
  0x202A, 0x202B, 0x202C, 0x202D, 0x202E,
  0x2066, 0x2067, 0x2068, 0x2069,
]);

/**
 * Cyrillic letters that are visually confusable with Latin letters.
 * Small letters:   а (a), е (e), о (o), с (c), р (p), у (y), х (x)
 * Capital letters: А, Е, О, С, Р, Х
 *
 * NOTE(review): no detector visible in this file reads this set — the
 * homoglyph check tests the whole Cyrillic block instead. Confirm whether it
 * is still needed.
 */
const CYRILLIC_CONFUSABLES = new Set([
  // small letters
  0x0430, 0x0435, 0x043E, 0x0441, 0x0440, 0x0443, 0x0445,
  // capital letters
  0x0410, 0x0415, 0x041E, 0x0421, 0x0420, 0x0425,
]);
// ---------------------------------------------------------------------------
// Helper: format hex codepoint list for evidence strings
// ---------------------------------------------------------------------------
/**
 * Render detector hits as a comma-separated evidence string.
 *
 * Each hit becomes `U+XXXX at col N`: the code point in upper-case hex,
 * left-padded with zeros to at least four digits, and the 0-based `pos`
 * converted to a 1-based column.
 *
 * @param {Array<{cp: number, pos: number}>} hits
 * @returns {string} e.g. "U+200B at col 5, U+200D at col 12"
 */
function formatEvidence(hits) {
  const parts = [];
  for (const hit of hits) {
    const hex = hit.cp.toString(16).toUpperCase().padStart(4, '0');
    parts.push(`U+${hex} at col ${hit.pos + 1}`);
  }
  return parts.join(', ');
}
// ---------------------------------------------------------------------------
// Category 1: Zero-Width Character detection
// ---------------------------------------------------------------------------
/**
 * Scan a single line for zero-width characters.
 *
 * Returns zero or one finding per line. The finding is HIGH by default and
 * escalated to CRITICAL when the line is visually empty, i.e. it consists
 * solely of zero-width characters and whitespace.
 *
 * A U+FEFF that is the very first character of line 1 is treated as a
 * byte-order mark and is not reported; U+FEFF anywhere else is.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Array of finding objects (empty when nothing is found)
 */
function scanLineForZeroWidth(line, lineNumber, relPath) {
  const hits = [];
  let pos = 0;
  for (const char of line) {
    const cp = char.codePointAt(0);
    // A BOM at the start of the file is a legitimate encoding signature,
    // not hidden content, so it must not raise a finding.
    const isLeadingBom = cp === 0xFEFF && lineNumber === 1 && pos === 0;
    if (ZERO_WIDTH_CHARS.has(cp) && !isLeadingBom) {
      hits.push({ cp, pos });
    }
    // Iteration is per code point; advance by UTF-16 units so `pos` stays a
    // valid string offset even after astral characters.
    pos += char.length;
  }
  if (hits.length === 0) return [];

  // Visually empty: nothing is left once zero-width characters and
  // whitespace are discounted.
  const isVisuallyEmpty = [...line].every(
    ch => ZERO_WIDTH_CHARS.has(ch.codePointAt(0)) || /\s/.test(ch),
  );

  const severity = isVisuallyEmpty ? SEVERITY.CRITICAL : SEVERITY.HIGH;
  const title = isVisuallyEmpty
    ? 'Visually empty line with hidden zero-width characters'
    : 'Zero-width characters detected in line';
  const description = isVisuallyEmpty
    ? `Line ${lineNumber} appears blank but contains ${hits.length} zero-width character(s). ` +
      'This is a strong indicator of hidden prompt injection content.'
    : `Line ${lineNumber} contains ${hits.length} zero-width character(s) that are invisible to readers ` +
      'but processed by LLMs. Can be used to smuggle hidden instructions.';

  return [
    finding({
      scanner: 'UNI',
      severity,
      title,
      description,
      file: relPath,
      line: lineNumber,
      evidence: formatEvidence(hits),
      owasp: 'LLM01',
      recommendation:
        'Remove all zero-width characters. Use a hex editor or `cat -A` to reveal them. ' +
        'Consider adding a pre-commit hook that rejects files containing U+200B/200C/200D/FEFF/00AD.',
    }),
  ];
}
// ---------------------------------------------------------------------------
// Category 2: Unicode Tag Codepoints (steganography)
// ---------------------------------------------------------------------------
/**
 * Recover the ASCII text smuggled in a run of Unicode Tag code points.
 *
 * Each hit is decoded as `cp - 0xE0000`. Values inside the printable ASCII
 * range (0x20–0x7E) become the matching character; anything else is rendered
 * as "?".
 *
 * @param {Array<{cp: number, pos: number}>} tagHits
 * @returns {string} Decoded string, e.g. "rm -rf /"
 */
function decodeTagMessage(tagHits) {
  let message = '';
  for (const hit of tagHits) {
    const code = hit.cp - 0xE0000;
    const isPrintable = code >= 0x20 && code <= 0x7E;
    message += isPrintable ? String.fromCharCode(code) : '?';
  }
  return message;
}
/**
 * Scan a single line for code points in the Unicode Tags range
 * (UNICODE_TAG_START..UNICODE_TAG_END, inclusive).
 *
 * Returns zero or one CRITICAL finding per line. The finding reports every
 * tag code point with its column and the ASCII text the sequence decodes to.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Array of finding objects (empty when nothing is found)
 */
function scanLineForUnicodeTags(line, lineNumber, relPath) {
  const hits = [];
  let pos = 0;
  for (const char of line) {
    const cp = char.codePointAt(0);
    if (cp >= UNICODE_TAG_START && cp <= UNICODE_TAG_END) {
      hits.push({ cp, pos });
    }
    // Iteration is per code point; advance by UTF-16 units so `pos` remains
    // a string offset (tag characters occupy two units each).
    pos += char.length;
  }
  if (hits.length === 0) return [];

  const decoded = decodeTagMessage(hits);
  const cpList = formatEvidence(hits);
  return [
    finding({
      scanner: 'UNI',
      severity: SEVERITY.CRITICAL,
      title: 'Unicode Tag block codepoints detected (steganographic hidden message)',
      description:
        `Line ${lineNumber} contains ${hits.length} character(s) from the Unicode Tags block ` +
        `(U+E0001–U+E007F). These encode a hidden ASCII message: "${decoded}". ` +
        'This is deliberate steganography and a strong indicator of supply chain attack.',
      file: relPath,
      line: lineNumber,
      evidence: `${cpList} → decoded: "${decoded}"`,
      owasp: 'LLM03',
      recommendation:
        'Remove all Unicode Tag codepoints immediately. This file should not be trusted. ' +
        'Investigate how these characters were introduced — they cannot appear accidentally.',
    }),
  ];
}
// ---------------------------------------------------------------------------
// Category 3: BIDI Override Characters (Trojan Source)
// ---------------------------------------------------------------------------
/**
 * Look for bidirectional control characters (members of BIDI_CHARS) in one
 * line and report them as a single HIGH finding.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array, or an array holding exactly one finding
 */
function scanLineForBidi(line, lineNumber, relPath) {
  const hits = [];
  // Walk the string by UTF-16 offset, stepping over a surrogate pair as one
  // code point so every recorded `pos` is a real string index.
  for (let offset = 0; offset < line.length;) {
    const codePoint = line.codePointAt(offset);
    if (BIDI_CHARS.has(codePoint)) {
      hits.push({ cp: codePoint, pos: offset });
    }
    offset += codePoint > 0xFFFF ? 2 : 1;
  }

  if (hits.length === 0) {
    return [];
  }

  const description =
    `Line ${lineNumber} contains ${hits.length} bidirectional override character(s). ` +
    'BIDI controls can make code appear different to humans than to interpreters/LLMs. ' +
    'This is the Trojan Source technique (see CVE-2021-42574 class of vulnerabilities).';
  const recommendation =
    'Remove all BIDI override characters. Legitimate multilingual text rarely needs ' +
    'explicit BIDI overrides in source code. Enable editor/IDE BIDI character warnings.';

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.HIGH,
    title: 'BIDI override character detected (Trojan Source attack vector)',
    description,
    file: relPath,
    line: lineNumber,
    evidence: formatEvidence(hits),
    owasp: 'LLM01',
    recommendation,
  });
  return [report];
}
// ---------------------------------------------------------------------------
// Category 4: Homoglyph Detection (Latin/Cyrillic mixing)
// ---------------------------------------------------------------------------
/**
 * Regex to extract word-like tokens: maximal runs of Unicode letters (\p{L}),
 * Unicode numbers (\p{N}) and underscores.
 *
 * The `g` flag makes this shared regex stateful — `exec` resumes from
 * `lastIndex` — which is why scanLineForHomoglyphs resets `lastIndex` to 0
 * before scanning each line.
 */
const TOKEN_RE = /[\p{L}\p{N}_]+/gu;
/**
 * Report whether a code point is a basic Latin letter.
 * Only the ASCII ranges A–Z (U+0041–U+005A) and a–z (U+0061–U+007A) count;
 * accented and extended Latin letters do not.
 *
 * @param {number} cp - Unicode code point
 * @returns {boolean}
 */
function isLatin(cp) {
  const isUpperCase = cp >= 0x0041 && cp <= 0x005A;
  const isLowerCase = cp >= 0x0061 && cp <= 0x007A;
  return isUpperCase || isLowerCase;
}
/**
 * Report whether a code point lies in the Cyrillic block (U+0400–U+04FF,
 * both ends inclusive).
 *
 * @param {number} cp - Unicode code point
 * @returns {boolean}
 */
function isCyrillic(cp) {
  const blockStart = 0x0400;
  const blockEnd = 0x04FF;
  return blockStart <= cp && cp <= blockEnd;
}
/**
 * Find tokens on one line that contain both a basic Latin letter and a
 * character from the Cyrillic block, and report them together as a single
 * MEDIUM finding.
 *
 * Tokens are the matches of TOKEN_RE. For every offending token the evidence
 * lists the token text and the code points of its Cyrillic characters.
 *
 * @param {string} line - Raw line content (no newline)
 * @param {number} lineNumber - 1-indexed
 * @param {string} relPath - Relative file path for finding metadata
 * @returns {object[]} - Empty array, or an array holding exactly one finding
 */
function scanLineForHomoglyphs(line, lineNumber, relPath) {
  const toHex = cp => `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`;
  const mixedTokens = [];

  // TOKEN_RE is a shared global regex; start every line from offset 0.
  TOKEN_RE.lastIndex = 0;
  for (const [token] of line.matchAll(TOKEN_RE)) {
    const codePoints = Array.from(token, ch => ch.codePointAt(0));
    const cyrillicChars = codePoints.filter(cp => isCyrillic(cp)).map(cp => toHex(cp));
    const hasLatin = codePoints.some(cp => isLatin(cp));
    if (hasLatin && cyrillicChars.length > 0) {
      mixedTokens.push({ token, cyrillicChars });
    }
  }

  if (mixedTokens.length === 0) {
    return [];
  }

  const evidence = mixedTokens
    .map(({ token, cyrillicChars }) => `"${token}" (Cyrillic: ${cyrillicChars.join(', ')})`)
    .join('; ');
  const description =
    `Line ${lineNumber} contains ${mixedTokens.length} token(s) that mix Latin and ` +
    'Cyrillic characters. Cyrillic confusables (а, е, о, с, р, у, х) look identical to ' +
    'Latin letters but have different codepoints — enabling invisible identifier spoofing.';
  const recommendation =
    'Normalize all identifiers to a single script. Use a Unicode confusables checker ' +
    '(e.g., Unicode CLDR confusable-mappings.txt) and enforce a single-script policy ' +
    'via linter rules (ESLint `no-misleading-character-class`, Rust `confusable_idents`).';

  const report = finding({
    scanner: 'UNI',
    severity: SEVERITY.MEDIUM,
    title: 'Homoglyph mixing detected: Latin and Cyrillic in same identifier',
    description,
    file: relPath,
    line: lineNumber,
    evidence,
    owasp: 'LLM01',
    recommendation,
  });
  return [report];
}
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------
/**
 * Run the four Unicode detectors over every discovered file.
 *
 * Each file is read with readTextFile; a `null` result means the file is
 * skipped and not counted. Content is split on "\n", a trailing "\r" is
 * removed from each line, and empty lines are skipped. Every remaining line
 * is passed to the zero-width, tag, BIDI and homoglyph detectors in that
 * order, and their findings are collected in that order.
 *
 * The returned status is 'ok' whenever the scan completes, regardless of how
 * many findings there are. If anything throws, an 'error' result is returned
 * that carries the findings gathered so far plus the error message.
 *
 * @param {string} targetPath - Absolute root path being scanned (not read by this function)
 * @param {{ files: import('./lib/file-discovery.mjs').FileInfo[] }} discovery
 * @returns {Promise<object>} - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const elapsedMs = () => Date.now() - startMs;
  const findings = [];
  let filesScanned = 0;

  try {
    // Order matters: findings for a line are emitted detector by detector.
    const detectors = [
      scanLineForZeroWidth,
      scanLineForUnicodeTags,
      scanLineForBidi,
      scanLineForHomoglyphs,
    ];

    for (const fileInfo of discovery.files) {
      const content = await readTextFile(fileInfo.absPath);
      if (content === null) {
        // Binary or unreadable file: nothing to scan.
        continue;
      }
      filesScanned += 1;

      content.split('\n').forEach((rawLine, index) => {
        // Drop the "\r" left behind by Windows line endings.
        const line = rawLine.replace(/\r$/, '');
        if (line.length === 0) {
          return;
        }
        for (const detect of detectors) {
          findings.push(...detect(line, index + 1, fileInfo.relPath));
        }
      });
    }

    // Status reflects execution, not severity.
    return scannerResult('unicode-scanner', 'ok', findings, filesScanned, elapsedMs());
  } catch (err) {
    return scannerResult(
      'unicode-scanner',
      'error',
      findings,
      filesScanned,
      elapsedMs(),
      err.message,
    );
  }
}