feat: initial open marketplace with llm-security, config-audit, ultraplan-local

2026-04-06 18:47:49 +02:00 · 2026-04-06 18:47:49 +02:00 · f93d6abdae
commit f93d6abdae
380 changed files with 65935 additions and 0 deletions
--- a/plugins/llm-security/scanners/unicode-scanner.mjs
+++ b/plugins/llm-security/scanners/unicode-scanner.mjs
@ -0,0 +1,385 @@
+// unicode-scanner.mjs — Detects hidden Unicode characters used for prompt injection
+// and code obfuscation: zero-width chars, Unicode tag codepoints (steganography),
+// BIDI override characters (Trojan Source), and homoglyph mixing.
+//
+// Zero external dependencies — Node.js builtins only.
+// OWASP coverage: LLM01 (Prompt Injection), LLM03 (Supply Chain)
+
+import { readTextFile } from './lib/file-discovery.mjs';
+import { finding, scannerResult } from './lib/output.mjs';
+import { SEVERITY } from './lib/severity.mjs';
+
+// ---------------------------------------------------------------------------
+// Character sets
+// ---------------------------------------------------------------------------
+
+/**
+ * Invisible code points that can be used to hide content inside a line:
+ * U+200B–U+200D, U+FEFF and U+00AD.
+ */
+const ZERO_WIDTH_CHARS = new Set([
+  // ZERO WIDTH SPACE, ZERO WIDTH NON-JOINER, ZERO WIDTH JOINER
+  0x200B, 0x200C, 0x200D,
+  // ZERO WIDTH NO-BREAK SPACE (the same code point serves as the BOM)
+  0xFEFF,
+  // SOFT HYPHEN
+  0x00AD,
+]);
+
+/**
+ * Inclusive bounds of the Unicode tag characters treated as hidden payload
+ * (U+E0001–U+E007F). A tag character mirrors the ASCII value obtained by
+ * subtracting 0xE0000 from its code point, so the bounds are written as
+ * base + ASCII offset.
+ */
+const UNICODE_TAG_START = 0xE0000 + 0x01;
+const UNICODE_TAG_END   = 0xE0000 + 0x7F;
+
+/**
+ * Bidirectional formatting controls associated with the Trojan Source
+ * technique (CVE-2021-42574 class).
+ */
+const BIDI_CHARS = new Set([
+  // U+202A–U+202E: LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING,
+  // POP DIRECTIONAL FORMATTING, LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE
+  0x202A, 0x202B, 0x202C, 0x202D, 0x202E,
+  // U+2066–U+2069: LEFT-TO-RIGHT ISOLATE, RIGHT-TO-LEFT ISOLATE,
+  // FIRST STRONG ISOLATE, POP DIRECTIONAL ISOLATE
+  0x2066, 0x2067, 0x2068, 0x2069,
+]);
+
+/**
+ * Cyrillic code points whose glyphs closely resemble Latin letters.
+ * NOTE(review): nothing in the code visible in this file reads this set — the
+ * homoglyph detector tests against the whole U+0400–U+04FF range instead.
+ * Confirm whether that is intended.
+ */
+const CYRILLIC_CONFUSABLES = new Set([
+  // Small letters a, ie, o, es, er, u, ha (resemble Latin a, e, o, c, p, y, x)
+  0x0430, 0x0435, 0x043E, 0x0441, 0x0440, 0x0443, 0x0445,
+  // Capital letters a, ie, o, es, er, ha
+  0x0410, 0x0415, 0x041E, 0x0421, 0x0420, 0x0425,
+]);
+
+// ---------------------------------------------------------------------------
+// Helper: format hex codepoint list for evidence strings
+// ---------------------------------------------------------------------------
+
+/**
+ * Render code point hits as a comma-separated evidence string.
+ * Each hit becomes `U+XXXX at col N`: the code point in uppercase hex,
+ * left-padded with zeros to at least four digits, and the column as the hit's
+ * zero-based `pos` plus one.
+ * @param {Array<{cp: number, pos: number}>} hits
+ * @returns {string}  e.g. "U+200B at col 5, U+200D at col 12" ("" for no hits)
+ */
+function formatEvidence(hits) {
+  const parts = [];
+  for (const { cp, pos } of hits) {
+    const hex = cp.toString(16).toUpperCase().padStart(4, '0');
+    parts.push(`U+${hex} at col ${pos + 1}`);
+  }
+  return parts.join(', ');
+}
+
+// ---------------------------------------------------------------------------
+// Category 1: Zero-Width Character detection
+// ---------------------------------------------------------------------------
+
+/**
+ * Scan a single line for zero-width characters.
+ *
+ * Emits at most one finding per line. Severity is CRITICAL when the line holds
+ * nothing but zero-width characters and whitespace (it looks blank but is
+ * not), otherwise HIGH.
+ *
+ * A U+FEFF that is the very first character of line 1 is treated as a byte
+ * order mark and is not reported; U+FEFF anywhere else still counts as a hit.
+ *
+ * @param {string} line        - Raw line content (no newline)
+ * @param {number} lineNumber  - 1-indexed
+ * @param {string} relPath     - Relative file path for finding metadata
+ * @returns {object[]}         - Array holding zero or one finding object
+ */
+function scanLineForZeroWidth(line, lineNumber, relPath) {
+  const hits = [];
+
+  // `pos` counts UTF-16 code units, so astral characters advance it by two.
+  let pos = 0;
+  for (const char of line) {
+    const cp = char.codePointAt(0);
+    // U+FEFF opening the file is a legitimate BOM, not smuggled content.
+    const isLeadingBom = cp === 0xFEFF && lineNumber === 1 && pos === 0;
+    if (ZERO_WIDTH_CHARS.has(cp) && !isLeadingBom) {
+      hits.push({ cp, pos });
+    }
+    pos += char.length;
+  }
+
+  if (hits.length === 0) return [];
+
+  // The line is visually empty when nothing survives after dropping zero-width
+  // characters (a leading BOM included) and whitespace.
+  const isVisuallyEmpty = [...line].every(
+    ch => ZERO_WIDTH_CHARS.has(ch.codePointAt(0)) || /\s/.test(ch),
+  );
+
+  const severity = isVisuallyEmpty ? SEVERITY.CRITICAL : SEVERITY.HIGH;
+  const title = isVisuallyEmpty
+    ? 'Visually empty line with hidden zero-width characters'
+    : 'Zero-width characters detected in line';
+
+  const description = isVisuallyEmpty
+    ? `Line ${lineNumber} appears blank but contains ${hits.length} zero-width character(s). ` +
+      'This is a strong indicator of hidden prompt injection content.'
+    : `Line ${lineNumber} contains ${hits.length} zero-width character(s) that are invisible to readers ` +
+      'but processed by LLMs. Can be used to smuggle hidden instructions.';
+
+  return [
+    finding({
+      scanner: 'UNI',
+      severity,
+      title,
+      description,
+      file: relPath,
+      line: lineNumber,
+      evidence: formatEvidence(hits),
+      owasp: 'LLM01',
+      recommendation:
+        'Remove all zero-width characters. Use a hex editor or `cat -A` to reveal them. ' +
+        'Consider adding a pre-commit hook that rejects files containing U+200B/200C/200D/FEFF/00AD.',
+    }),
+  ];
+}
+
+// ---------------------------------------------------------------------------
+// Category 2: Unicode Tag Codepoints (steganography)
+// ---------------------------------------------------------------------------
+
+/**
+ * Recover the ASCII text hidden in a sequence of Unicode tag code points.
+ * Each hit is mapped to `cp - 0xE0000`; results inside printable ASCII
+ * (0x20–0x7E) become that character, anything else is rendered as "?".
+ *
+ * @param {Array<{cp: number, pos: number}>} tagHits
+ * @returns {string}  Decoded string, e.g. "rm -rf /"
+ */
+function decodeTagMessage(tagHits) {
+  let message = '';
+  for (const { cp } of tagHits) {
+    const ascii = cp - 0xE0000;
+    const isPrintable = ascii >= 0x20 && ascii <= 0x7E;
+    message += isPrintable ? String.fromCharCode(ascii) : '?';
+  }
+  return message;
+}
+
+/**
+ * Look for Unicode tag code points (UNICODE_TAG_START..UNICODE_TAG_END) in one
+ * line and, if any are present, report them in a single CRITICAL finding that
+ * includes the decoded hidden text.
+ * @param {string} line
+ * @param {number} lineNumber
+ * @param {string} relPath
+ * @returns {object[]}  Empty array, or an array with one finding
+ */
+function scanLineForUnicodeTags(line, lineNumber, relPath) {
+  const tagHits = [];
+
+  // Walk by code point; `offset` is the UTF-16 index of the current character.
+  for (let offset = 0; offset < line.length; ) {
+    const codePoint = line.codePointAt(offset);
+    if (codePoint >= UNICODE_TAG_START && codePoint <= UNICODE_TAG_END) {
+      tagHits.push({ cp: codePoint, pos: offset });
+    }
+    // Code points above the BMP occupy a surrogate pair (two code units).
+    offset += codePoint > 0xFFFF ? 2 : 1;
+  }
+
+  if (tagHits.length === 0) return [];
+
+  const hiddenText = decodeTagMessage(tagHits);
+  const codePointList = formatEvidence(tagHits);
+
+  const description = [
+    `Line ${lineNumber} contains ${tagHits.length} character(s) from the Unicode Tags block `,
+    `(U+E0001–U+E007F). These encode a hidden ASCII message: "${hiddenText}". `,
+    'This is deliberate steganography and a strong indicator of supply chain attack.',
+  ].join('');
+
+  const recommendation = [
+    'Remove all Unicode Tag codepoints immediately. This file should not be trusted. ',
+    'Investigate how these characters were introduced — they cannot appear accidentally.',
+  ].join('');
+
+  const report = finding({
+    scanner: 'UNI',
+    severity: SEVERITY.CRITICAL,
+    title: 'Unicode Tag block codepoints detected (steganographic hidden message)',
+    description,
+    file: relPath,
+    line: lineNumber,
+    evidence: `${codePointList} → decoded: "${hiddenText}"`,
+    owasp: 'LLM03',
+    recommendation,
+  });
+
+  return [report];
+}
+
+// ---------------------------------------------------------------------------
+// Category 3: BIDI Override Characters (Trojan Source)
+// ---------------------------------------------------------------------------
+
+/**
+ * Look for bidirectional control characters (members of BIDI_CHARS) in one
+ * line and report them in a single HIGH finding when any are present.
+ * @param {string} line
+ * @param {number} lineNumber
+ * @param {string} relPath
+ * @returns {object[]}  Empty array, or an array with one finding
+ */
+function scanLineForBidi(line, lineNumber, relPath) {
+  const controls = [];
+
+  // `offset` tracks the UTF-16 index of each character as we iterate.
+  let offset = 0;
+  for (const ch of line) {
+    const codePoint = ch.codePointAt(0);
+    if (BIDI_CHARS.has(codePoint)) {
+      controls.push({ cp: codePoint, pos: offset });
+    }
+    offset += ch.length;
+  }
+
+  if (controls.length === 0) return [];
+
+  const description = [
+    `Line ${lineNumber} contains ${controls.length} bidirectional override character(s). `,
+    'BIDI controls can make code appear different to humans than to interpreters/LLMs. ',
+    'This is the Trojan Source technique (see CVE-2021-42574 class of vulnerabilities).',
+  ].join('');
+
+  const recommendation = [
+    'Remove all BIDI override characters. Legitimate multilingual text rarely needs ',
+    'explicit BIDI overrides in source code. Enable editor/IDE BIDI character warnings.',
+  ].join('');
+
+  const report = finding({
+    scanner: 'UNI',
+    severity: SEVERITY.HIGH,
+    title: 'BIDI override character detected (Trojan Source attack vector)',
+    description,
+    file: relPath,
+    line: lineNumber,
+    evidence: formatEvidence(controls),
+    owasp: 'LLM01',
+    recommendation,
+  });
+
+  return [report];
+}
+
+// ---------------------------------------------------------------------------
+// Category 4: Homoglyph Detection (Latin/Cyrillic mixing)
+// ---------------------------------------------------------------------------
+
+/**
+ * Matches word-like tokens: runs of Unicode letters (\p{L}), Unicode numbers
+ * (\p{N}) and underscores. The regex carries the `g` flag, so its `lastIndex`
+ * is stateful across `exec` calls and must start at 0 for each new line.
+ */
+const TOKEN_RE = /[\p{L}\p{N}_]+/gu;
+
+/**
+ * True when `cp` is a basic (ASCII) Latin letter: A–Z or a–z.
+ * Accented and extended Latin letters are not counted.
+ */
+function isLatin(cp) {
+  if (cp >= 0x0041 && cp <= 0x005A) return true; // A-Z
+  return cp >= 0x0061 && cp <= 0x007A;           // a-z
+}
+
+/** True when `cp` lies in the range U+0400–U+04FF (inclusive). */
+function isCyrillic(cp) {
+  const inRange = 0x0400 <= cp && cp <= 0x04FF;
+  return inRange;
+}
+
+/**
+ * Find tokens in one line that contain both an ASCII Latin letter and a
+ * character from the Cyrillic range, and report all of them in a single
+ * MEDIUM finding. Tokens are the matches of TOKEN_RE.
+ * @param {string} line
+ * @param {number} lineNumber
+ * @param {string} relPath
+ * @returns {object[]}  Empty array, or an array with one finding
+ */
+function scanLineForHomoglyphs(line, lineNumber, relPath) {
+  const toHex = cp => `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`;
+
+  // String#match with a global regex starts at index 0 and returns every
+  // token (or null when there are none).
+  const mixedTokens = (line.match(TOKEN_RE) || [])
+    .map(token => {
+      const codePoints = Array.from(token, ch => ch.codePointAt(0));
+      return {
+        token,
+        hasLatin: codePoints.some(cp => isLatin(cp)),
+        cyrillicChars: codePoints.filter(cp => isCyrillic(cp)).map(cp => toHex(cp)),
+      };
+    })
+    .filter(entry => entry.hasLatin && entry.cyrillicChars.length > 0);
+
+  if (mixedTokens.length === 0) return [];
+
+  const evidence = mixedTokens
+    .map(({ token, cyrillicChars }) => `"${token}" (Cyrillic: ${cyrillicChars.join(', ')})`)
+    .join('; ');
+
+  const description = [
+    `Line ${lineNumber} contains ${mixedTokens.length} token(s) that mix Latin and `,
+    'Cyrillic characters. Cyrillic confusables (а, е, о, с, р, у, х) look identical to ',
+    'Latin letters but have different codepoints — enabling invisible identifier spoofing.',
+  ].join('');
+
+  const recommendation = [
+    'Normalize all identifiers to a single script. Use a Unicode confusables checker ',
+    '(e.g., Unicode CLDR confusable-mappings.txt) and enforce a single-script policy ',
+    'via linter rules (ESLint `no-misleading-character-class`, Rust `confusable_idents`).',
+  ].join('');
+
+  const report = finding({
+    scanner: 'UNI',
+    severity: SEVERITY.MEDIUM,
+    title: 'Homoglyph mixing detected: Latin and Cyrillic in same identifier',
+    description,
+    file: relPath,
+    line: lineNumber,
+    evidence,
+    owasp: 'LLM01',
+    recommendation,
+  });
+
+  return [report];
+}
+
+// ---------------------------------------------------------------------------
+// Main scanner export
+// ---------------------------------------------------------------------------
+
+/**
+ * Run every Unicode detector over each line of each discovered file.
+ *
+ * Files for which `readTextFile` yields `null` are skipped and not counted.
+ * The returned status is 'ok' whenever the scan ran to completion, regardless
+ * of how many findings were produced; any thrown error yields status 'error'
+ * together with the findings gathered so far and the error message.
+ *
+ * @param {string} targetPath        - Absolute root path being scanned (not read here)
+ * @param {{ files: import('./lib/file-discovery.mjs').FileInfo[] }} discovery
+ * @returns {Promise<object>}        - scannerResult envelope
+ */
+export async function scan(targetPath, discovery) {
+  const startMs = Date.now();
+  const elapsedMs = () => Date.now() - startMs;
+
+  // Applied to every non-empty line, in this order.
+  const detectors = [
+    scanLineForZeroWidth,
+    scanLineForUnicodeTags,
+    scanLineForBidi,
+    scanLineForHomoglyphs,
+  ];
+
+  const findings = [];
+  let filesScanned = 0;
+
+  try {
+    for (const fileInfo of discovery.files) {
+      const content = await readTextFile(fileInfo.absPath);
+      if (content === null) continue; // binary or unreadable
+
+      filesScanned += 1;
+
+      for (const [index, rawLine] of content.split('\n').entries()) {
+        // Drop the trailing \r left behind by Windows line endings
+        const line = rawLine.replace(/\r$/, '');
+        if (line.length === 0) continue; // nothing to detect
+
+        for (const detect of detectors) {
+          findings.push(...detect(line, index + 1, fileInfo.relPath));
+        }
+      }
+    }
+
+    // Status reflects execution, not severity
+    return scannerResult('unicode-scanner', 'ok', findings, filesScanned, elapsedMs());
+  } catch (err) {
+    return scannerResult(
+      'unicode-scanner',
+      'error',
+      findings,
+      filesScanned,
+      elapsedMs(),
+      err.message,
+    );
+  }
+}