// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis // Zero dependencies (Node.js builtins only via lib helpers). // // Rationale: Malicious skills and MCP servers often hide injected instructions, // exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs // (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review. // // References: // - OWASP LLM01 (Prompt Injection via encoded payloads) // - OWASP LLM03 (Supply Chain — obfuscated dependencies) // - ToxicSkills research: evasion via base64-wrapped instructions import { readTextFile } from './lib/file-discovery.mjs'; import { finding, scannerResult } from './lib/output.mjs'; import { SEVERITY } from './lib/severity.mjs'; import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs'; // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- /** Entropy thresholds (bits/char). Empirically calibrated against real distributions: * * Plaintext prose: H ≈ 3.5–4.2 (len 20–50) * Structured code/JSON: H ≈ 3.9–4.4 (len 40–80) * SQL queries: H ≈ 4.2–4.5 (len 50–100) * Base64 len=40: H ≈ 4.4–5.2 (avg 4.8, p90 5.0) * Base64 len=64: H ≈ 4.9–5.4 (avg 5.2, p90 5.3) * Base64 len=80: H ≈ 5.0–5.6 (avg 5.3, p90 5.5) * Base64 len=128: H ≈ 5.4–5.8 (avg 5.6, p90 5.7) * * Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02. * Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for * the length-dependent entropy ceiling. * * Conservative design: prefer low false-negative rate (catch real payloads) at the cost * of some false positives that the analyst reviews. The false-positive suppression rules * above handle the most common benign cases. */ const THRESHOLDS = { // Large random-looking blob: very likely encoded/encrypted payload CRITICAL: { entropy: 5.4, minLen: 128 }, // Medium-sized high-entropy string: likely encoded secret or payload fragment HIGH: { entropy: 5.1, minLen: 64 }, // Shorter elevated-entropy string: suspicious but may be dense data/config MEDIUM: { entropy: 4.7, minLen: 40 }, }; /** Known hash/checksum filename patterns — false positive suppression. */ const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i; /** Line-level keywords that suggest integrity hashes rather than encoded payloads. */ const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i; /** Integrity hash value prefixes (SRI format). */ const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/; /** Known base64 image/font data-URI prefixes. */ const DATA_URI_PREFIXES = [ 'iVBORw0KGgo', // PNG '/9j/', // JPEG 'R0lGOD', // GIF 'PHN2Zy', // SVG 'AAABAA', // ICO 'T2dnUw', // OGG (audio) 'AAAAFGZ0', // MP4 'UklGR', // WebP/RIFF 'd09G', // WOFF font 'AAEAAAALAAI', // TTF font ]; /** UUID v4 pattern for false positive suppression. */ const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; /** Pure lowercase hex that could be a hash digest (not obfuscated code). */ const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i; // --------------------------------------------------------------------------- // False-positive suppression helpers // --------------------------------------------------------------------------- /** * Decide whether a candidate string should be suppressed (likely a false positive). * * @param {string} str - The extracted string literal value * @param {string} line - The full source line it came from * @param {string} absPath - Absolute file path * @returns {boolean} - true if this string should be skipped */ function isFalsePositive(str, line, absPath) { // 1. URLs — entropy is misleading for long query strings / JWTs in URLs if (str.startsWith('http://') || str.startsWith('https://')) return true; // 2. File/system paths if ( str.startsWith('/') || str.startsWith('./') || str.startsWith('../') || /^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\ ) return true; // 3. Known hash formats in lock/checksum contexts if (HEX_HASH_PATTERN.test(str)) { if ( LOCK_FILE_PATTERN.test(absPath) || INTEGRITY_KEYWORDS.test(line) ) return true; } // 4. Test/fixture files — intentionally contain example secrets, tokens, etc. if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true; // 5. UUID patterns if (UUID_PATTERN.test(str)) return true; // 6. CSS / SVG / font data URIs embedded in source if (/data:image\/|data:font\/|data:application\//i.test(line)) return true; // 7. Import / require paths — the string is a module specifier, not a payload if ( /^\s*import\s/i.test(line) || /\brequire\s*\(/i.test(line) ) return true; // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...) if (SRI_PREFIX.test(str)) return true; // 9. Line-level integrity keyword context (catches SRI in HTML /