// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis // Zero dependencies (Node.js builtins only via lib helpers). // // Rationale: Malicious skills and MCP servers often hide injected instructions, // exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs // (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review. // // References: // - OWASP LLM01 (Prompt Injection via encoded payloads) // - OWASP LLM03 (Supply Chain — obfuscated dependencies) // - ToxicSkills research: evasion via base64-wrapped instructions import { existsSync } from 'node:fs'; import { join } from 'node:path'; import { readTextFile } from './lib/file-discovery.mjs'; import { finding, scannerResult } from './lib/output.mjs'; import { SEVERITY } from './lib/severity.mjs'; import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs'; import { loadPolicy } from './lib/policy-loader.mjs'; // --------------------------------------------------------------------------- // File-extension suppression (context-aware, v7.0.0+) // --------------------------------------------------------------------------- /** * Extensions whose contents are almost always benign high-entropy noise: * GPU shaders, stylesheets, SVG markup. Scanning these produces massive * false-positive rates (observed 70% FP on hyperframes renderer codebase). 
*/ const ENTROPY_SKIP_EXTENSIONS = new Set([ '.glsl', '.frag', '.vert', '.shader', '.wgsl', // GPU shaders '.css', '.scss', '.sass', '.less', // stylesheets '.svg', // SVG markup ]); /** * @param {{ relPath: string, ext: string }} fileInfo * @returns {boolean} true if the file should be skipped entirely */ function shouldSkipByExtension(fileInfo) { const lowerPath = (fileInfo.relPath || '').toLowerCase(); if (lowerPath.endsWith('.min.js') || lowerPath.endsWith('.min.css')) return true; const ext = (fileInfo.ext || '').toLowerCase(); if (ENTROPY_SKIP_EXTENSIONS.has(ext)) return true; if (USER_SUPPRESS_EXTENSIONS.has(ext)) return true; return false; } /** * @param {{ relPath: string }} fileInfo * @returns {boolean} true if the file's relative path matches any user-policy skip-path substring. */ function shouldSkipByPath(fileInfo) { if (USER_SUPPRESS_PATHS.length === 0) return false; const rel = fileInfo.relPath || ''; for (const needle of USER_SUPPRESS_PATHS) { if (typeof needle === 'string' && needle.length > 0 && rel.includes(needle)) return true; } return false; } // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- /** Entropy thresholds (bits/char). Empirically calibrated against real distributions: * * Plaintext prose: H ≈ 3.5–4.2 (len 20–50) * Structured code/JSON: H ≈ 3.9–4.4 (len 40–80) * SQL queries: H ≈ 4.2–4.5 (len 50–100) * Base64 len=40: H ≈ 4.4–5.2 (avg 4.8, p90 5.0) * Base64 len=64: H ≈ 4.9–5.4 (avg 5.2, p90 5.3) * Base64 len=80: H ≈ 5.0–5.6 (avg 5.3, p90 5.5) * Base64 len=128: H ≈ 5.4–5.8 (avg 5.6, p90 5.7) * * Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02. * Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for * the length-dependent entropy ceiling. 
* * Conservative design: prefer low false-negative rate (catch real payloads) at the cost * of some false positives that the analyst reviews. The false-positive suppression rules * above handle the most common benign cases. */ const DEFAULT_THRESHOLDS = { // Large random-looking blob: very likely encoded/encrypted payload CRITICAL: { entropy: 5.4, minLen: 128 }, // Medium-sized high-entropy string: likely encoded secret or payload fragment HIGH: { entropy: 5.1, minLen: 64 }, // Shorter elevated-entropy string: suspicious but may be dense data/config MEDIUM: { entropy: 4.7, minLen: 40 }, }; /** * Merge policy.entropy.thresholds over defaults. Policy keys are lowercase * (critical/high/medium) to match other policy sections; defaults use uppercase * internally. * * @param {object|undefined} policyThresholds * @returns {typeof DEFAULT_THRESHOLDS} */ function resolveThresholds(policyThresholds) { if (!policyThresholds) return DEFAULT_THRESHOLDS; return { CRITICAL: { ...DEFAULT_THRESHOLDS.CRITICAL, ...(policyThresholds.critical || {}) }, HIGH: { ...DEFAULT_THRESHOLDS.HIGH, ...(policyThresholds.high || {}) }, MEDIUM: { ...DEFAULT_THRESHOLDS.MEDIUM, ...(policyThresholds.medium || {}) }, }; } // Effective thresholds after policy-merge (set at scan() entry, read by classifyEntropy). let THRESHOLDS = DEFAULT_THRESHOLDS; /** User-extensible line-level regex patterns compiled from policy. Set per scan. */ let USER_SUPPRESS_LINE_PATTERNS = []; /** User-extensible relative-path substrings to skip entirely. Set per scan. */ let USER_SUPPRESS_PATHS = []; /** User-extensible extension suppress list (merged with built-in). Set per scan. */ let USER_SUPPRESS_EXTENSIONS = new Set(); /** Known hash/checksum filename patterns — false positive suppression. */ const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i; /** Line-level keywords that suggest integrity hashes rather than encoded payloads. 
/**
 * Matches integrity/checksum vocabulary (integrity, checksum, sha1/256/384/512, md5)
 * as whole words, case-insensitively.
 */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;

/** Integrity hash value prefixes (SRI format). Anchored at the start of the value; case-sensitive. */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;

/** Known base64 image/font data-URI prefixes (the base64 encoding of each format's leading bytes). */
const DATA_URI_PREFIXES = [
  'iVBORw0KGgo', // PNG
  '/9j/', // JPEG
  'R0lGOD', // GIF
  'PHN2Zy', // SVG
  'AAABAA', // ICO
  'T2dnUw', // OGG (audio)
  'AAAAFGZ0', // MP4
  'UklGR', // WebP/RIFF
  'd09G', // WOFF font
  'AAEAAAALAAI', // TTF font
];

/**
 * Canonical 8-4-4-4-12 hex UUID shape (whole-string match, case-insensitive) for
 * false positive suppression. The version and variant nibbles are not checked, so
 * this matches any UUID version, not only v4.
 */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/**
 * Pure hex string of 32–128 characters that could be a hash digest (not obfuscated
 * code). The `i` flag makes the match case-insensitive, so uppercase hex also matches.
 */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;

/** GLSL/WGSL shader keywords — suppress inline shader source (gl_Position, vec3, uniform, ...). */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;

/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes). */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;

/** Inline HTML/SVG markup in source (tags with attributes on the same line). Case-insensitive. */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;

/** ffmpeg filter-graph syntax (stream selectors + filter chains). */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;

/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded).
*/ const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/; /** SQL DDL/DML statements (long structured strings, not encoded payloads). */ const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i; /** Error-message templates with embedded HTML/markup (throw new Error("