// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
// Zero dependencies (Node.js builtins only via lib helpers).
//
// Rationale: Malicious skills and MCP servers often hide injected instructions,
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
//
// References:
// - OWASP LLM01 (Prompt Injection via encoded payloads)
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
// - ToxicSkills research: evasion via base64-wrapped instructions
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
*
* Plaintext prose: H ≈ 3.5–4.2 (len 20–50)
* Structured code/JSON: H ≈ 3.9–4.4 (len 40–80)
* SQL queries: H ≈ 4.2–4.5 (len 50–100)
* Base64 len=40: H ≈ 4.4–5.2 (avg 4.8, p90 5.0)
* Base64 len=64: H ≈ 4.9–5.4 (avg 5.2, p90 5.3)
* Base64 len=80: H ≈ 5.0–5.6 (avg 5.3, p90 5.5)
* Base64 len=128: H ≈ 5.4–5.8 (avg 5.6, p90 5.7)
*
* Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
* Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
* the length-dependent entropy ceiling.
*
* Conservative design: prefer low false-negative rate (catch real payloads) at the cost
* of some false positives that the analyst reviews. The false-positive suppression rules
* above handle the most common benign cases.
*/
const THRESHOLDS = {
// Large random-looking blob: very likely encoded/encrypted payload
CRITICAL: { entropy: 5.4, minLen: 128 },
// Medium-sized high-entropy string: likely encoded secret or payload fragment
HIGH: { entropy: 5.1, minLen: 64 },
// Shorter elevated-entropy string: suspicious but may be dense data/config
MEDIUM: { entropy: 4.7, minLen: 40 },
};
/** Known hash/checksum filename patterns — false positive suppression. */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;
/** Line-level keywords that suggest integrity hashes rather than encoded payloads. */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;
/** Integrity hash value prefixes (SRI format). */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;
/** Known base64 image/font data-URI prefixes. */
const DATA_URI_PREFIXES = [
'iVBORw0KGgo', // PNG
'/9j/', // JPEG
'R0lGOD', // GIF
'PHN2Zy', // SVG
'AAABAA', // ICO
'T2dnUw', // OGG (audio)
'AAAAFGZ0', // MP4
'UklGR', // WebP/RIFF
'd09G', // WOFF font
'AAEAAAALAAI', // TTF font
];
/** UUID v4 pattern for false positive suppression. */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
/** Pure lowercase hex that could be a hash digest (not obfuscated code). */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------
/**
* Decide whether a candidate string should be suppressed (likely a false positive).
*
* @param {string} str - The extracted string literal value
* @param {string} line - The full source line it came from
* @param {string} absPath - Absolute file path
* @returns {boolean} - true if this string should be skipped
*/
function isFalsePositive(str, line, absPath) {
// 1. URLs — entropy is misleading for long query strings / JWTs in URLs
if (str.startsWith('http://') || str.startsWith('https://')) return true;
// 2. File/system paths
if (
str.startsWith('/') ||
str.startsWith('./') ||
str.startsWith('../') ||
/^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\
) return true;
// 3. Known hash formats in lock/checksum contexts
if (HEX_HASH_PATTERN.test(str)) {
if (
LOCK_FILE_PATTERN.test(absPath) ||
INTEGRITY_KEYWORDS.test(line)
) return true;
}
// 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true;
// 5. UUID patterns
if (UUID_PATTERN.test(str)) return true;
// 6. CSS / SVG / font data URIs embedded in source
if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;
// 7. Import / require paths — the string is a module specifier, not a payload
if (
/^\s*import\s/i.test(line) ||
/\brequire\s*\(/i.test(line)
) return true;
// 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
if (SRI_PREFIX.test(str)) return true;
// 9. Line-level integrity keyword context (catches SRI in HTML /