// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis // Zero dependencies (Node.js builtins only via lib helpers). // // Rationale: Malicious skills and MCP servers often hide injected instructions, // exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs // (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review. // // References: // - OWASP LLM01 (Prompt Injection via encoded payloads) // - OWASP LLM03 (Supply Chain — obfuscated dependencies) // - ToxicSkills research: evasion via base64-wrapped instructions import { existsSync } from 'node:fs'; import { join } from 'node:path'; import { readTextFile } from './lib/file-discovery.mjs'; import { finding, scannerResult } from './lib/output.mjs'; import { SEVERITY } from './lib/severity.mjs'; import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs'; import { loadPolicy } from './lib/policy-loader.mjs'; // --------------------------------------------------------------------------- // File-extension suppression (context-aware, v7.0.0+) // --------------------------------------------------------------------------- /** * Extensions whose contents are almost always benign high-entropy noise: * GPU shaders, stylesheets, SVG markup. Scanning these produces massive * false-positive rates (observed 70% FP on hyperframes renderer codebase). 
/**
 * File extensions whose contents are almost always benign high-entropy noise:
 * GPU shaders, stylesheets and SVG markup. Scanning them produced massive
 * false-positive rates (observed 70% FP on the hyperframes renderer codebase).
 */
const ENTROPY_SKIP_EXTENSIONS = new Set([
  // GPU shaders
  '.glsl',
  '.frag',
  '.vert',
  '.shader',
  '.wgsl',
  // stylesheets
  '.css',
  '.scss',
  '.sass',
  '.less',
  // SVG markup
  '.svg',
]);

/**
 * Decide from the file name alone whether a file is excluded from scanning.
 *
 * A file is skipped when its relative path ends in `.min.js` / `.min.css`, or
 * when its extension is in the built-in skip set or in the module-level
 * USER_SUPPRESS_EXTENSIONS set. Path and extension are lower-cased first.
 *
 * @param {{ relPath: string, ext: string }} fileInfo
 * @returns {boolean} true if the file should be skipped entirely
 */
function shouldSkipByExtension(fileInfo) {
  const normalizedPath = (fileInfo.relPath || '').toLowerCase();
  const isMinified = ['.min.js', '.min.css'].some((suffix) => normalizedPath.endsWith(suffix));
  if (isMinified) {
    return true;
  }

  const normalizedExt = (fileInfo.ext || '').toLowerCase();
  return ENTROPY_SKIP_EXTENSIONS.has(normalizedExt) || USER_SUPPRESS_EXTENSIONS.has(normalizedExt);
}

/**
 * Check the file's relative path against the module-level USER_SUPPRESS_PATHS
 * list. Entries that are not non-empty strings are ignored; the comparison is
 * a plain, case-sensitive substring match.
 *
 * @param {{ relPath: string }} fileInfo
 * @returns {boolean} true if the file's relative path matches any user-policy skip-path substring.
 */
function shouldSkipByPath(fileInfo) {
  if (USER_SUPPRESS_PATHS.length === 0) {
    return false;
  }

  const relativePath = fileInfo.relPath || '';
  return USER_SUPPRESS_PATHS.some(
    (needle) => typeof needle === 'string' && needle.length > 0 && relativePath.includes(needle),
  );
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/**
 * Entropy thresholds (bits/char). Empirically calibrated against real distributions:
 *
 *   Plaintext prose:        H ≈ 3.5–4.2  (len 20–50)
 *   Structured code/JSON:   H ≈ 3.9–4.4  (len 40–80)
 *   SQL queries:            H ≈ 4.2–4.5  (len 50–100)
 *   Base64 len=40:          H ≈ 4.4–5.2  (avg 4.8, p90 5.0)
 *   Base64 len=64:          H ≈ 4.9–5.4  (avg 5.2, p90 5.3)
 *   Base64 len=80:          H ≈ 5.0–5.6  (avg 5.3, p90 5.5)
 *   Base64 len=128:         H ≈ 5.4–5.8  (avg 5.6, p90 5.7)
 *
 * Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
 * Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
 * the length-dependent entropy ceiling.
 *
 * Conservative design: prefer low false-negative rate (catch real payloads) at the cost
 * of some false positives that the analyst reviews. The false-positive suppression rules
 * elsewhere in this file handle the most common benign cases.
 */
const DEFAULT_THRESHOLDS = {
  // Large random-looking blob: very likely encoded/encrypted payload
  CRITICAL: { entropy: 5.4, minLen: 128 },
  // Medium-sized high-entropy string: likely encoded secret or payload fragment
  HIGH: { entropy: 5.1, minLen: 64 },
  // Shorter elevated-entropy string: suspicious but may be dense data/config
  MEDIUM: { entropy: 4.7, minLen: 40 },
};

/** Tier fields that must be finite numbers for threshold comparisons to be meaningful. */
const THRESHOLD_NUMERIC_FIELDS = ['entropy', 'minLen'];

/**
 * Coerce a policy value to a finite number.
 *
 * @param {unknown} value
 * @returns {number|null} the number (numeric strings are parsed), or null when unusable
 */
function coerceFiniteNumber(value) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}

/**
 * Merge one policy tier over its defaults, discarding unusable numeric values.
 *
 * @param {{ entropy: number, minLen: number }} defaults
 * @param {unknown} override - the policy's value for this tier
 * @returns {{ entropy: number, minLen: number }} a fresh tier object
 */
function mergeThresholdTier(defaults, override) {
  const isPlainObject = override !== null && typeof override === 'object' && !Array.isArray(override);
  if (!isPlainObject) {
    // A non-object tier (e.g. a string) would otherwise spread junk keys in.
    return { ...defaults };
  }

  const merged = { ...defaults, ...override };
  for (const field of THRESHOLD_NUMERIC_FIELDS) {
    // null would coerce to 0 in `>=` comparisons (flagging everything) and
    // undefined/NaN would make every comparison false (disabling the tier),
    // so anything that is not a finite number falls back to the default.
    const value = coerceFiniteNumber(merged[field]);
    merged[field] = value === null ? defaults[field] : value;
  }
  return merged;
}

/**
 * Merge policy.entropy.thresholds over defaults. Policy keys are lowercase
 * (critical/high/medium) to match other policy sections; defaults use uppercase
 * internally.
 *
 * Each tier's `entropy` and `minLen` are validated: values that are not finite
 * numbers (or numeric strings) are ignored in favour of the built-in default,
 * so a malformed policy cannot silently disable or over-trigger a severity tier.
 * Any other keys present on a tier object are passed through unchanged.
 *
 * @param {object|undefined} policyThresholds
 * @returns {typeof DEFAULT_THRESHOLDS} DEFAULT_THRESHOLDS itself when no policy
 *   is given, otherwise a new object with one merged entry per tier
 */
function resolveThresholds(policyThresholds) {
  if (!policyThresholds) return DEFAULT_THRESHOLDS;
  return {
    CRITICAL: mergeThresholdTier(DEFAULT_THRESHOLDS.CRITICAL, policyThresholds.critical),
    HIGH: mergeThresholdTier(DEFAULT_THRESHOLDS.HIGH, policyThresholds.high),
    MEDIUM: mergeThresholdTier(DEFAULT_THRESHOLDS.MEDIUM, policyThresholds.medium),
  };
}

// Effective thresholds after policy-merge (set at scan() entry, read by classifyEntropy).
let THRESHOLDS = DEFAULT_THRESHOLDS;

/** User-extensible line-level regex patterns compiled from policy. Set per scan. */
let USER_SUPPRESS_LINE_PATTERNS = [];

/** User-extensible relative-path substrings to skip entirely. Set per scan. */
let USER_SUPPRESS_PATHS = [];

/** User-extensible extension suppress list (merged with built-in). Set per scan. */
let USER_SUPPRESS_EXTENSIONS = new Set();

/** Known hash/checksum filename patterns — false positive suppression. */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;
/**
 * Line-level keywords that suggest integrity hashes rather than encoded payloads.
 * Matched case-insensitively and only as whole words (`\b` on both sides).
 */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;

/** Integrity hash value prefixes (SRI format); anchored to the start of the string, case-sensitive. */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;

/** Known base64 image/font data-URI prefixes. */
const DATA_URI_PREFIXES = [
  'iVBORw0KGgo', // PNG
  '/9j/', // JPEG
  'R0lGOD', // GIF
  'PHN2Zy', // SVG
  'AAABAA', // ICO
  'T2dnUw', // OGG (audio)
  'AAAAFGZ0', // MP4
  'UklGR', // WebP/RIFF
  'd09G', // WOFF font
  'AAEAAAALAAI', // TTF font
];

/**
 * Canonical 8-4-4-4-12 hex UUID, used for false positive suppression.
 * The pattern does not inspect the version nibble, so it matches a UUID of
 * any version (not only v4), in either letter case.
 */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/**
 * Pure hex of 32–128 characters that could be a hash digest (not obfuscated code).
 * The `i` flag means upper-case hex matches as well as lower-case.
 */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;

/** GLSL/WGSL shader keywords — suppress inline shader source (gl_Position, vec3, uniform, ...). */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;

/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes). */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;

/** Inline HTML/SVG markup in source (tags with attributes on the same line). */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;

/** ffmpeg filter-graph syntax (stream selectors + filter chains). */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;
/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded). */
const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/;

/** SQL DDL/DML statements (long structured strings, not encoded payloads). */
const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i;

/**
 * Error-message templates with embedded HTML/markup
 * (e.g. `throw new Error("<p>...</p>")`).
 *
 * Matches `new Error|TypeError|RangeError|SyntaxError`, an optional `(`, and an
 * opening quote or backtick. The previous pattern only accepted the `(` after a
 * bare `Error`, so `throw new TypeError("...")` and the other subclasses were
 * never recognised.
 */
const ERROR_TEMPLATE = /new\s+(?:Error|TypeError|RangeError|SyntaxError)\s*\(?\s*[`'"]/;

/**
 * Markdown image syntax with external URL — `![alt](https://cdn.../hash.ext)`.
 * Common in JSON data indexes / article metadata; CDN URL hash segments
 * produce high Shannon entropy but are not credentials. Captures the full
 * URL so rule 18 can apply CDN-host + secret-in-query checks (E18, v7.2.0).
 */
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*(https?:\/\/[^)\s]+)/;

/**
 * Hosts that legitimately serve high-entropy hashed image URLs. Suppression
 * via rule 18 only applies when the markdown image URL host matches this
 * pattern AND the URL does not carry a secret-shaped token in its query
 * string. Anything else falls through to entropy classification (E18).
 */
const MARKDOWN_IMAGE_CDN_HOSTS = /^https?:\/\/(?:cdn\.|images\.|media\.|assets\.|static\.|[^/]*\.cdn\.|[^/]*\.amazonaws\.com\/(?:s3|cloudfront)\/|[^/]*\.cloudflare\.|[^/]*\.fastly\.|[^/]*\.akamaized\.|raw\.githubusercontent\.com\/|[^/]*\.imgix\.net\/|[^/]*\.cloudinary\.com\/)/i;

/**
 * Secret-shaped tokens that disqualify an otherwise-CDN markdown image from
 * suppression — query keys (`?token=`, `&api_key=`, etc.) and well-known
 * provider prefixes (AWS Access Key ID, Bearer header, GitHub PAT, npm
 * token, Stripe live key).
 */
const MARKDOWN_IMAGE_QUERY_SECRET = /(?:^|[?&])(?:token|key|secret|password|passwd|api[_-]?key|access[_-]?token|auth)=|AKIA[0-9A-Z]{14,}|Bearer\s|sk_live_|ghp_|ghs_|ghu_|gho_|ghr_|npm_/i;

/**
 * Report whether the query string of `url` carries a secret-shaped token.
 *
 * The query is tested twice: as written, and after form/percent decoding.
 * MARKDOWN_IMAGE captures URLs without whitespace, so a `Bearer <token>` value
 * can only appear as `Bearer%20…` / `Bearer+…`, and keys may be percent-encoded
 * (`api%5Fkey=`); testing only the raw text would let those through.
 *
 * @param {string} url
 * @returns {boolean} true when the part after the first `?` matches
 *   MARKDOWN_IMAGE_QUERY_SECRET, raw or decoded; false when there is no query
 */
function urlHasSecretInQuery(url) {
  const queryStart = url.indexOf('?');
  if (queryStart < 0) return false;

  const rawQuery = url.slice(queryStart + 1);
  if (MARKDOWN_IMAGE_QUERY_SECRET.test(rawQuery)) return true;

  let decodedQuery;
  try {
    // `+` encodes a space in form-style query strings.
    decodedQuery = decodeURIComponent(rawQuery.replace(/\+/g, ' '));
  } catch {
    // Malformed percent-escape: nothing more to inspect, and the raw query
    // has already been checked above.
    return false;
  }
  return decodedQuery !== rawQuery && MARKDOWN_IMAGE_QUERY_SECRET.test(decodedQuery);
}

// ---------------------------------------------------------------------------
// File-context classification (B5, v7.2.0)
// ---------------------------------------------------------------------------
/**
 * File extensions treated as pure shader/markup/code by classifyFileContext.
 * All entries are lower-case with a leading dot; classifyFileContext lower-cases
 * the path before looking the extension up.
 */
const SHADER_EXTENSIONS = new Set(['.glsl', '.frag', '.vert', '.shader', '.wgsl']);
const MARKUP_EXTENSIONS = new Set(['.html', '.htm', '.svg', '.xml', '.md', '.markdown', '.mdx']);
const CODE_EXTENSIONS = new Set([
  '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
  '.py', '.go', '.rs', '.rb', '.java', '.cs', '.kt', '.scala', '.swift',
  '.cpp', '.c', '.h', '.hpp', '.php',
]);

/**
 * Classify a file as shader-dominant, markup-dominant, code-dominant, or mixed.
 *
 * Used by isFalsePositive() to gate rules 11-13 (GLSL/CSS-in-JS/inline-markup
 * line-proximity suppressions). Those rules fire only when context !== 'code-dominant'
 * — preventing the v7.0.0 polyglot false-negative (a real credential on a line
 * with an inline GLSL keyword would be incorrectly suppressed).
 *
 * Conservative defaults to minimize regression risk:
 *   - Unknown extensions → 'mixed' (all rules apply)
 *   - Code-extension files with very few non-blank lines (<5 sampled) → 'mixed'
 *   - Code-extension files where ≥50% of sampled lines match GLSL/inline-markup → 'mixed'
 *   - Code-extension files otherwise → 'code-dominant'
 *
 * Shader and markup extensions are decided from the extension alone; `lines`
 * is only read for code extensions, where at most the first 50 non-blank
 * lines are sampled.
 *
 * @param {string} absPath - File path; only its final extension is used. Both
 *   `/` and `\` are accepted as separators.
 * @param {string[]} lines - File content split into lines.
 * @returns {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'}
 */
function classifyFileContext(absPath, lines) {
  const lower = absPath.toLowerCase();
  // Pull the actual extension; supports compound names by taking last dot.
  // The base name is isolated first so a dot in a directory name is never
  // mistaken for the file's extension.
  const slashIdx = Math.max(lower.lastIndexOf('/'), lower.lastIndexOf('\\'));
  const baseName = slashIdx >= 0 ? lower.slice(slashIdx + 1) : lower;
  const dotIdx = baseName.lastIndexOf('.');
  const ext = dotIdx >= 0 ? baseName.slice(dotIdx) : '';

  if (SHADER_EXTENSIONS.has(ext)) return 'shader-dominant';
  if (MARKUP_EXTENSIONS.has(ext)) return 'markup-dominant';

  if (CODE_EXTENSIONS.has(ext)) {
    // sampled counts non-blank lines inspected; suppressionHits counts those
    // that look like shader source or inline markup.
    let sampled = 0;
    let suppressionHits = 0;
    for (let i = 0; i < lines.length && sampled < 50; i++) {
      const trimmed = lines[i].trim();
      // Blank lines do not count toward the 50-line sample.
      if (trimmed.length === 0) continue;
      sampled++;
      if (GLSL_KEYWORDS.test(trimmed) || INLINE_MARKUP.test(trimmed)) {
        suppressionHits++;
      }
    }
    // Too few non-blank lines to classify confidently → conservative default.
    if (sampled < 5) return 'mixed';
    // Mostly shader/markup despite the code extension → conservative default.
    if (suppressionHits / sampled >= 0.5) return 'mixed';
    return 'code-dominant';
  }

  // Extension is not in any of the three sets.
  return 'mixed';
}

// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------

/**
 * Decide whether a candidate string should be suppressed (likely a false positive).
 *
 * v7.2.0 (B5): rules 11-13 (GLSL/CSS-in-JS/inline-markup line-proximity) are
 * gated on `context !== 'code-dominant'`. In code-dominant files, an inline
 * shader keyword next to a credential-shaped string is no longer a reason
 * to suppress — that was the v7.0.0 polyglot false-negative (e.g. a `.ts`
 * file with embedded GLSL block hiding a real secret on the next line).
 *
 * @param {string} str - The extracted string literal value
 * @param {string} line - The full source line it came from
 * @param {string} absPath - Absolute file path
 * @param {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} [context='mixed']
 *   File-level classification from classifyFileContext.
 * @returns {boolean} - true if this string should be skipped
 */
function isFalsePositive(str, line, absPath, context = 'mixed') {
  // 1. URLs — entropy is misleading for long query strings / JWTs in URLs
  if (str.startsWith('http://') || str.startsWith('https://')) return true;

  // 2.
File/system paths if ( str.startsWith('/') || str.startsWith('./') || str.startsWith('../') || /^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\ ) return true; // 3. Known hash formats in lock/checksum contexts if (HEX_HASH_PATTERN.test(str)) { if ( LOCK_FILE_PATTERN.test(absPath) || INTEGRITY_KEYWORDS.test(line) ) return true; } // 4. Test/fixture files — intentionally contain example secrets, tokens, etc. if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true; // 5. UUID patterns if (UUID_PATTERN.test(str)) return true; // 6. CSS / SVG / font data URIs embedded in source if (/data:image\/|data:font\/|data:application\//i.test(line)) return true; // 7. Import / require paths — the string is a module specifier, not a payload if ( /^\s*import\s/i.test(line) || /\brequire\s*\(/i.test(line) ) return true; // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...) if (SRI_PREFIX.test(str)) return true; // 9. Line-level integrity keyword context (catches SRI in HTML /