feat: initial open marketplace with llm-security, config-audit, ultraplan-local

2026-04-06 18:47:49 +02:00 · 2026-04-06 18:47:49 +02:00 · f93d6abdae
commit f93d6abdae
380 changed files with 65935 additions and 0 deletions
--- a/plugins/llm-security/scanners/entropy-scanner.mjs
+++ b/plugins/llm-security/scanners/entropy-scanner.mjs
@ -0,0 +1,329 @@
+// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
+// Zero dependencies (Node.js builtins only via lib helpers).
+//
+// Rationale: Malicious skills and MCP servers often hide injected instructions,
+// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
+// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
+//
+// References:
+//   - OWASP LLM01 (Prompt Injection via encoded payloads)
+//   - OWASP LLM03 (Supply Chain — obfuscated dependencies)
+//   - ToxicSkills research: evasion via base64-wrapped instructions
+
+import { readTextFile } from './lib/file-discovery.mjs';
+import { finding, scannerResult } from './lib/output.mjs';
+import { SEVERITY } from './lib/severity.mjs';
+import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
+ *
+ *  Plaintext prose:       H ≈ 3.5–4.2 (len 20–50)
+ *  Structured code/JSON:  H ≈ 3.9–4.4 (len 40–80)
+ *  SQL queries:           H ≈ 4.2–4.5 (len 50–100)
+ *  Base64 len=40:         H ≈ 4.4–5.2 (avg 4.8, p90 5.0)
+ *  Base64 len=64:         H ≈ 4.9–5.4 (avg 5.2, p90 5.3)
+ *  Base64 len=80:         H ≈ 5.0–5.6 (avg 5.3, p90 5.5)
+ *  Base64 len=128:        H ≈ 5.4–5.8 (avg 5.6, p90 5.7)
+ *
+ *  Key insight: the base64 alphabet is only 65 chars (64 symbols plus the '=' padding
+ *  character) → max theoretical H = log2(65) ≈ 6.02.
+ *  Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
+ *  the length-dependent entropy ceiling.
+ *
+ *  Conservative design: prefer low false-negative rate (catch real payloads) at the cost
+ *  of some false positives that the analyst reviews. The false-positive suppression rules
+ *  below (see isFalsePositive) handle the most common benign cases.
+ *
+ *  Each tier applies only when BOTH conditions hold: H >= entropy AND length >= minLen
+ *  (see classifyEntropy, which checks the tiers from CRITICAL down to MEDIUM).
+ */
+const THRESHOLDS = {
+  // Large random-looking blob: very likely encoded/encrypted payload
+  CRITICAL: { entropy: 5.4, minLen: 128 },
+  // Medium-sized high-entropy string: likely encoded secret or payload fragment
+  HIGH:     { entropy: 5.1, minLen: 64 },
+  // Shorter elevated-entropy string: suspicious but may be dense data/config
+  MEDIUM:   { entropy: 4.7, minLen: 40 },
+};
+
+/**
+ * Lock-file name patterns, anchored to the end of the path. Hex digests found in
+ * such files are treated as integrity hashes — false positive suppression.
+ */
+const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;
+
+/** Line-level keywords that suggest integrity hashes rather than encoded payloads. */
+const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;
+
+/** Integrity hash value prefixes (SRI format). Case-sensitive, anchored at string start. */
+const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;
+
+/**
+ * Leading base64 characters of common binary media formats (images, audio, video,
+ * fonts), as labelled per entry. A candidate string that starts with one of these
+ * is assumed to be embedded media rather than a hidden payload.
+ */
+const DATA_URI_PREFIXES = [
+  'iVBORw0KGgo',  // PNG
+  '/9j/',          // JPEG
+  'R0lGOD',       // GIF
+  'PHN2Zy',       // SVG
+  'AAABAA',       // ICO
+  'T2dnUw',       // OGG (audio)
+  'AAAAFGZ0',     // MP4
+  'UklGR',        // WebP/RIFF
+  'd09G',         // WOFF font
+  'AAEAAAALAAI',  // TTF font
+];
+
+/**
+ * UUID-shaped string (8-4-4-4-12 hex groups, either case) for false positive
+ * suppression. The version nibble is not checked, so any UUID version matches.
+ */
+const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+
+/**
+ * Pure hex of 32–128 characters that could be a hash digest (not obfuscated code).
+ * The /i flag means both lowercase and uppercase hex digits are accepted.
+ */
+const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
+
+// ---------------------------------------------------------------------------
+// False-positive suppression helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * Decide whether a candidate string should be suppressed (likely a false positive).
+ *
+ * @param {string}  str      - The extracted string literal value
+ * @param {string}  line     - The full source line it came from
+ * @param {string}  absPath  - Absolute file path
+ * @returns {boolean}        - true if this string should be skipped
+ */
+function isFalsePositive(str, line, absPath) {
+  // 1. URLs — entropy is misleading for long query strings / JWTs in URLs
+  if (str.startsWith('http://') || str.startsWith('https://')) return true;
+
+  // 2. File/system paths
+  if (
+    str.startsWith('/') ||
+    str.startsWith('./') ||
+    str.startsWith('../') ||
+    /^[A-Za-z]:[/\\]/.test(str)  // Windows drive letter, e.g. C:\
+  ) return true;
+
+  // 3. Known hash formats in lock/checksum contexts
+  if (HEX_HASH_PATTERN.test(str)) {
+    if (
+      LOCK_FILE_PATTERN.test(absPath) ||
+      INTEGRITY_KEYWORDS.test(line)
+    ) return true;
+  }
+
+  // 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
+  if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true;
+
+  // 5. UUID patterns
+  if (UUID_PATTERN.test(str)) return true;
+
+  // 6. CSS / SVG / font data URIs embedded in source
+  if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;
+
+  // 7. Import / require paths — the string is a module specifier, not a payload
+  if (
+    /^\s*import\s/i.test(line) ||
+    /\brequire\s*\(/i.test(line)
+  ) return true;
+
+  // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
+  if (SRI_PREFIX.test(str)) return true;
+
+  // 9. Line-level integrity keyword context (catches SRI in HTML <link> / <script> tags)
+  if (INTEGRITY_KEYWORDS.test(line)) return true;
+
+  // 10. Base64 image data-URI content (raw prefix check, separate from the line check above)
+  for (const prefix of DATA_URI_PREFIXES) {
+    if (str.startsWith(prefix)) return true;
+  }
+
+  return false;
+}
+
+// ---------------------------------------------------------------------------
+// Severity classification
+// ---------------------------------------------------------------------------
+
+/**
+ * Derive severity from entropy and string length.
+ * Returns null if below all thresholds.
+ *
+ * @param {number} H   - Shannon entropy
+ * @param {number} len - String length
+ * @returns {string|null}
+ */
+function classifyEntropy(H, len) {
+  if (H >= THRESHOLDS.CRITICAL.entropy && len >= THRESHOLDS.CRITICAL.minLen) {
+    return SEVERITY.CRITICAL;
+  }
+  if (H >= THRESHOLDS.HIGH.entropy && len >= THRESHOLDS.HIGH.minLen) {
+    return SEVERITY.HIGH;
+  }
+  if (H >= THRESHOLDS.MEDIUM.entropy && len >= THRESHOLDS.MEDIUM.minLen) {
+    return SEVERITY.MEDIUM;
+  }
+  return null;
+}
+
+/**
+ * Merge two severities, keeping the higher one.
+ * @param {string|null} a
+ * @param {string|null} b
+ * @returns {string|null}
+ */
+function maxSeverity(a, b) {
+  const order = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
+  const rank = (s) => (s === null ? Infinity : order.indexOf(s));
+  return rank(a) <= rank(b) ? a : b;
+}
+
+// ---------------------------------------------------------------------------
+// Per-file scanning
+// ---------------------------------------------------------------------------
+
+/**
+ * Scan a single file's content for high-entropy strings.
+ *
+ * @param {string}   content  - File text content
+ * @param {string}   absPath  - Absolute file path (for suppression checks)
+ * @param {string}   relPath  - Relative path (for finding output)
+ * @returns {object[]}        - Array of finding objects
+ */
+function scanFileContent(content, absPath, relPath) {
+  const findings = [];
+  const lines = content.split('\n');
+
+  // De-duplicate: track (line, evidence) pairs to avoid reporting the same
+  // string twice when it appears in both extractStringLiterals and assignment
+  // value extraction.
+  const seen = new Set();
+
+  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
+    const line = lines[lineIdx];
+    const lineNo = lineIdx + 1;
+
+    // Collect candidates: string literals from the standard extractor
+    const literalCandidates = extractStringLiterals(line);
+
+    // Additional extraction: assignment RHS values not caught by quote-matching
+    // (e.g., lines like: const TOKEN = "AQIB3j0..." or yaml: key: AQIB3j0...)
+    // We re-use the literal extractor which already handles these cases since it
+    // scans the full line. No extra pass needed — extractStringLiterals is
+    // comprehensive for quoted strings. Unquoted YAML values can appear here:
+    const unquotedYamlMatch = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
+    if (unquotedYamlMatch) {
+      literalCandidates.push(unquotedYamlMatch[1]);
+    }
+
+    for (const str of literalCandidates) {
+      if (!str || str.length < 10) continue;
+
+      // False positive suppression
+      if (isFalsePositive(str, line, absPath)) continue;
+
+      const H = shannonEntropy(str);
+      let severity = classifyEntropy(H, str.length);
+
+      // Additional detection: base64-like blobs and hex blobs get at least MEDIUM
+      // even if entropy alone didn't trigger (very structured encodings can have
+      // slightly lower H than random but are still suspicious at length >100/64).
+      if (severity === null) {
+        if (isBase64Like(str) && str.length > 100) {
+          severity = SEVERITY.MEDIUM;
+        } else if (isHexBlob(str) && str.length > 64) {
+          severity = SEVERITY.MEDIUM;
+        }
+      } else {
+        // Structured encoding can upgrade or confirm severity
+        if (isBase64Like(str) && str.length > 100) {
+          severity = maxSeverity(severity, SEVERITY.MEDIUM);
+        }
+        if (isHexBlob(str) && str.length > 64) {
+          severity = maxSeverity(severity, SEVERITY.MEDIUM);
+        }
+      }
+
+      if (severity === null) continue;
+
+      // De-duplicate
+      const key = `${lineNo}:${str.slice(0, 16)}`;
+      if (seen.has(key)) continue;
+      seen.add(key);
+
+      // Determine OWASP mapping:
+      //   - Very high entropy (>=5.5) with base64 → likely injection payload → LLM01
+      //   - Encoded hex deps / supply chain obfuscation → LLM03
+      //   - Default to LLM01 for encoded content that could carry instructions
+      const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || isBase64Like(str);
+      const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';
+
+      const evidencePreview = redact(str, 8, 4);
+      const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${evidencePreview}`;
+
+      findings.push(
+        finding({
+          scanner: 'ENT',
+          severity,
+          title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
+          description:
+            `A string with unusually high Shannon entropy was detected. ` +
+            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
+            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
+            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
+          file: relPath,
+          line: lineNo,
+          evidence,
+          owasp,
+          recommendation:
+            'Inspect this high-entropy string — it may contain an encoded payload, ' +
+            'hardcoded secret, or obfuscated code',
+        })
+      );
+    }
+  }
+
+  return findings;
+}
+
+// ---------------------------------------------------------------------------
+// Public scanner entry point
+// ---------------------------------------------------------------------------
+
+/**
+ * Scan a target path for high-entropy encoded strings.
+ *
+ * @param {string}  targetPath  - Absolute path to scan (file or directory root)
+ * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
+ *   - Pre-computed file discovery result from the orchestrator
+ * @returns {Promise<object>}   - Scanner result envelope
+ */
+export async function scan(targetPath, discovery) {
+  const startMs = Date.now();
+  const allFindings = [];
+  let filesScanned = 0;
+
+  try {
+    for (const fileInfo of discovery.files) {
+      const content = await readTextFile(fileInfo.absPath);
+
+      // readTextFile returns null for binary files or unreadable paths — skip silently
+      if (content === null) continue;
+
+      filesScanned++;
+
+      const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
+      allFindings.push(...fileFindings);
+    }
+
+    const durationMs = Date.now() - startMs;
+    const status = 'ok';
+
+    return scannerResult('entropy-scanner', status, allFindings, filesScanned, durationMs);
+  } catch (err) {
+    const durationMs = Date.now() - startMs;
+    return scannerResult(
+      'entropy-scanner',
+      'error',
+      allFindings,
+      filesScanned,
+      durationMs,
+      String(err?.message || err)
+    );
+  }
+}