ktg-plugin-marketplace/plugins/llm-security/scanners/entropy-scanner.mjs
Kjell Tore Guttormsen a9e377570c feat(llm-security): v7.0.0 commit 3 — policy-driven entropy thresholds
Adds entropy section to DEFAULT_POLICY and wires it into entropy-scanner.
Users can now tune false-positive tradeoffs without forking the scanner.

Policy shape (.llm-security/policy.json):
  entropy:
    thresholds.{critical,high,medium}.{entropy,minLen}  — numeric overrides
    suppress_extensions[]                               — additive ext skip
    suppress_line_patterns[]                            — additional regex
    suppress_paths[]                                    — relPath substrings

Wiring: entropy-scanner calls loadPolicy(targetPath) at scan entry (not
orchestrator-passed — avoids signature churn across 10 scanners). Module-
level state is reset per scan invocation. Scanner envelope now includes
calibration.{policy_source, thresholds, files_skipped_by_*} for
synthesizer transparency (Commit 5).

Malformed user regex silently skipped. Missing policy.json → built-in
defaults (backwards-compatible).

entropy.test.mjs: 9/9 still green.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-19 22:02:52 +02:00

524 lines
20 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
// Zero dependencies (Node.js builtins only via lib helpers).
//
// Rationale: Malicious skills and MCP servers often hide injected instructions,
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
//
// References:
// - OWASP LLM01 (Prompt Injection via encoded payloads)
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
// - ToxicSkills research: evasion via base64-wrapped instructions
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
import { loadPolicy } from './lib/policy-loader.mjs';
// ---------------------------------------------------------------------------
// File-extension suppression (context-aware, v7.0.0+)
// ---------------------------------------------------------------------------
/**
* Extensions whose contents are almost always benign high-entropy noise:
* GPU shaders, stylesheets, SVG markup. Scanning these produces massive
* false-positive rates (observed 70% FP on hyperframes renderer codebase).
*/
const ENTROPY_SKIP_EXTENSIONS = new Set([
'.glsl', '.frag', '.vert', '.shader', '.wgsl', // GPU shaders
'.css', '.scss', '.sass', '.less', // stylesheets
'.svg', // SVG markup
]);
/**
* @param {{ relPath: string, ext: string }} fileInfo
* @returns {boolean} true if the file should be skipped entirely
*/
function shouldSkipByExtension(fileInfo) {
const lowerPath = (fileInfo.relPath || '').toLowerCase();
if (lowerPath.endsWith('.min.js') || lowerPath.endsWith('.min.css')) return true;
const ext = (fileInfo.ext || '').toLowerCase();
if (ENTROPY_SKIP_EXTENSIONS.has(ext)) return true;
if (USER_SUPPRESS_EXTENSIONS.has(ext)) return true;
return false;
}
/**
* @param {{ relPath: string }} fileInfo
* @returns {boolean} true if the file's relative path matches any user-policy skip-path substring.
*/
function shouldSkipByPath(fileInfo) {
if (USER_SUPPRESS_PATHS.length === 0) return false;
const rel = fileInfo.relPath || '';
for (const needle of USER_SUPPRESS_PATHS) {
if (typeof needle === 'string' && needle.length > 0 && rel.includes(needle)) return true;
}
return false;
}
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
*
* Plaintext prose: H ≈ 3.54.2 (len 2050)
* Structured code/JSON: H ≈ 3.94.4 (len 4080)
* SQL queries: H ≈ 4.24.5 (len 50100)
* Base64 len=40: H ≈ 4.45.2 (avg 4.8, p90 5.0)
* Base64 len=64: H ≈ 4.95.4 (avg 5.2, p90 5.3)
* Base64 len=80: H ≈ 5.05.6 (avg 5.3, p90 5.5)
* Base64 len=128: H ≈ 5.45.8 (avg 5.6, p90 5.7)
*
* Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
* Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
* the length-dependent entropy ceiling.
*
* Conservative design: prefer low false-negative rate (catch real payloads) at the cost
* of some false positives that the analyst reviews. The false-positive suppression rules
* above handle the most common benign cases.
*/
const DEFAULT_THRESHOLDS = {
// Large random-looking blob: very likely encoded/encrypted payload
CRITICAL: { entropy: 5.4, minLen: 128 },
// Medium-sized high-entropy string: likely encoded secret or payload fragment
HIGH: { entropy: 5.1, minLen: 64 },
// Shorter elevated-entropy string: suspicious but may be dense data/config
MEDIUM: { entropy: 4.7, minLen: 40 },
};
/**
* Merge policy.entropy.thresholds over defaults. Policy keys are lowercase
* (critical/high/medium) to match other policy sections; defaults use uppercase
* internally.
*
* @param {object|undefined} policyThresholds
* @returns {typeof DEFAULT_THRESHOLDS}
*/
function resolveThresholds(policyThresholds) {
if (!policyThresholds) return DEFAULT_THRESHOLDS;
return {
CRITICAL: { ...DEFAULT_THRESHOLDS.CRITICAL, ...(policyThresholds.critical || {}) },
HIGH: { ...DEFAULT_THRESHOLDS.HIGH, ...(policyThresholds.high || {}) },
MEDIUM: { ...DEFAULT_THRESHOLDS.MEDIUM, ...(policyThresholds.medium || {}) },
};
}
// Effective thresholds after policy-merge (set at scan() entry, read by classifyEntropy).
let THRESHOLDS = DEFAULT_THRESHOLDS;
/** User-extensible line-level regex patterns compiled from policy. Set per scan. */
let USER_SUPPRESS_LINE_PATTERNS = [];
/** User-extensible relative-path substrings to skip entirely. Set per scan. */
let USER_SUPPRESS_PATHS = [];
/** User-extensible extension suppress list (merged with built-in). Set per scan. */
let USER_SUPPRESS_EXTENSIONS = new Set();
/** Known hash/checksum filename patterns — false positive suppression. */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;
/** Line-level keywords that suggest integrity hashes rather than encoded payloads. */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;
/** Integrity hash value prefixes (SRI format). */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;
/** Known base64 image/font data-URI prefixes. */
const DATA_URI_PREFIXES = [
'iVBORw0KGgo', // PNG
'/9j/', // JPEG
'R0lGOD', // GIF
'PHN2Zy', // SVG
'AAABAA', // ICO
'T2dnUw', // OGG (audio)
'AAAAFGZ0', // MP4
'UklGR', // WebP/RIFF
'd09G', // WOFF font
'AAEAAAALAAI', // TTF font
];
/** UUID v4 pattern for false positive suppression. */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
/** Pure lowercase hex that could be a hash digest (not obfuscated code). */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
/** GLSL/WGSL shader keywords — suppress inline shader source (gl_Position, vec3, uniform, ...). */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;
/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes). */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;
/** Inline HTML/SVG markup in source (tags with attributes on the same line). */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;
/** ffmpeg filter-graph syntax (stream selectors + filter chains). */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;
/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded). */
const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/;
/** SQL DDL/DML statements (long structured strings, not encoded payloads). */
const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i;
/** Error-message templates with embedded HTML/markup (throw new Error("<div>...</div>")). */
const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxError)|new\s+Error\s*\()\s*[`'"]/;
// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------
/**
* Decide whether a candidate string should be suppressed (likely a false positive).
*
* @param {string} str - The extracted string literal value
* @param {string} line - The full source line it came from
* @param {string} absPath - Absolute file path
* @returns {boolean} - true if this string should be skipped
*/
function isFalsePositive(str, line, absPath) {
// 1. URLs — entropy is misleading for long query strings / JWTs in URLs
if (str.startsWith('http://') || str.startsWith('https://')) return true;
// 2. File/system paths
if (
str.startsWith('/') ||
str.startsWith('./') ||
str.startsWith('../') ||
/^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\
) return true;
// 3. Known hash formats in lock/checksum contexts
if (HEX_HASH_PATTERN.test(str)) {
if (
LOCK_FILE_PATTERN.test(absPath) ||
INTEGRITY_KEYWORDS.test(line)
) return true;
}
// 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true;
// 5. UUID patterns
if (UUID_PATTERN.test(str)) return true;
// 6. CSS / SVG / font data URIs embedded in source
if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;
// 7. Import / require paths — the string is a module specifier, not a payload
if (
/^\s*import\s/i.test(line) ||
/\brequire\s*\(/i.test(line)
) return true;
// 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
if (SRI_PREFIX.test(str)) return true;
// 9. Line-level integrity keyword context (catches SRI in HTML <link> / <script> tags)
if (INTEGRITY_KEYWORDS.test(line)) return true;
// 10. Base64 image data-URI content (raw prefix check, separate from the line check above)
for (const prefix of DATA_URI_PREFIXES) {
if (str.startsWith(prefix)) return true;
}
// 11. GLSL/WGSL shader keywords on the line — inline shader source
if (GLSL_KEYWORDS.test(line)) return true;
// 12. CSS-in-JS (styled-components, emotion, vanilla-extract)
if (CSS_IN_JS_PATTERN.test(line)) return true;
// 13. Inline HTML/SVG markup — React/Vue components, email templates
if (INLINE_MARKUP.test(line)) return true;
// 14. ffmpeg filter-graph syntax — long structured strings, not encoded
if (FFMPEG_SYNTAX.test(line)) return true;
// 15. Browser User-Agent strings — hardcoded but structured, not a payload
if (USER_AGENT_PATTERN.test(line)) return true;
// 16. SQL DDL/DML — long SELECT/INSERT/... lines
if (SQL_STATEMENT.test(line)) return true;
// 17. Error-message templates (throw new Error("<html>...</html>"))
if (ERROR_TEMPLATE.test(line)) return true;
// 18. User-policy regex patterns from .llm-security/policy.json
for (const pattern of USER_SUPPRESS_LINE_PATTERNS) {
if (pattern.test(line)) return true;
}
return false;
}
/**
* Compile a list of regex sources (strings) into RegExp objects.
* Invalid patterns are silently skipped (policy is best-effort).
*
* @param {string[]} sources
* @returns {RegExp[]}
*/
function compilePatterns(sources) {
if (!Array.isArray(sources)) return [];
const compiled = [];
for (const src of sources) {
if (typeof src !== 'string' || src.length === 0) continue;
try {
compiled.push(new RegExp(src));
} catch { /* malformed regex — skip */ }
}
return compiled;
}
// ---------------------------------------------------------------------------
// Severity classification
// ---------------------------------------------------------------------------
/**
* Derive severity from entropy and string length.
* Returns null if below all thresholds.
*
* @param {number} H - Shannon entropy
* @param {number} len - String length
* @returns {string|null}
*/
function classifyEntropy(H, len) {
if (H >= THRESHOLDS.CRITICAL.entropy && len >= THRESHOLDS.CRITICAL.minLen) {
return SEVERITY.CRITICAL;
}
if (H >= THRESHOLDS.HIGH.entropy && len >= THRESHOLDS.HIGH.minLen) {
return SEVERITY.HIGH;
}
if (H >= THRESHOLDS.MEDIUM.entropy && len >= THRESHOLDS.MEDIUM.minLen) {
return SEVERITY.MEDIUM;
}
return null;
}
/**
* Merge two severities, keeping the higher one.
* @param {string|null} a
* @param {string|null} b
* @returns {string|null}
*/
function maxSeverity(a, b) {
const order = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
const rank = (s) => (s === null ? Infinity : order.indexOf(s));
return rank(a) <= rank(b) ? a : b;
}
// ---------------------------------------------------------------------------
// Per-file scanning
// ---------------------------------------------------------------------------
/**
* Scan a single file's content for high-entropy strings.
*
* @param {string} content - File text content
* @param {string} absPath - Absolute file path (for suppression checks)
* @param {string} relPath - Relative path (for finding output)
* @returns {object[]} - Array of finding objects
*/
function scanFileContent(content, absPath, relPath) {
const findings = [];
const lines = content.split('\n');
// De-duplicate: track (line, evidence) pairs to avoid reporting the same
// string twice when it appears in both extractStringLiterals and assignment
// value extraction.
const seen = new Set();
for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
const line = lines[lineIdx];
const lineNo = lineIdx + 1;
// Collect candidates: string literals from the standard extractor
const literalCandidates = extractStringLiterals(line);
// Additional extraction: assignment RHS values not caught by quote-matching
// (e.g., lines like: const TOKEN = "AQIB3j0..." or yaml: key: AQIB3j0...)
// We re-use the literal extractor which already handles these cases since it
// scans the full line. No extra pass needed — extractStringLiterals is
// comprehensive for quoted strings. Unquoted YAML values can appear here:
const unquotedYamlMatch = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
if (unquotedYamlMatch) {
literalCandidates.push(unquotedYamlMatch[1]);
}
for (const str of literalCandidates) {
if (!str || str.length < 10) continue;
// False positive suppression
if (isFalsePositive(str, line, absPath)) continue;
const H = shannonEntropy(str);
let severity = classifyEntropy(H, str.length);
// Additional detection: base64-like blobs and hex blobs get at least MEDIUM
// even if entropy alone didn't trigger (very structured encodings can have
// slightly lower H than random but are still suspicious at length >100/64).
if (severity === null) {
if (isBase64Like(str) && str.length > 100) {
severity = SEVERITY.MEDIUM;
} else if (isHexBlob(str) && str.length > 64) {
severity = SEVERITY.MEDIUM;
}
} else {
// Structured encoding can upgrade or confirm severity
if (isBase64Like(str) && str.length > 100) {
severity = maxSeverity(severity, SEVERITY.MEDIUM);
}
if (isHexBlob(str) && str.length > 64) {
severity = maxSeverity(severity, SEVERITY.MEDIUM);
}
}
if (severity === null) continue;
// De-duplicate
const key = `${lineNo}:${str.slice(0, 16)}`;
if (seen.has(key)) continue;
seen.add(key);
// Determine OWASP mapping:
// - Very high entropy (>=5.5) with base64 → likely injection payload → LLM01
// - Encoded hex deps / supply chain obfuscation → LLM03
// - Default to LLM01 for encoded content that could carry instructions
const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || isBase64Like(str);
const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';
const evidencePreview = redact(str, 8, 4);
const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${evidencePreview}`;
findings.push(
finding({
scanner: 'ENT',
severity,
title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
description:
`A string with unusually high Shannon entropy was detected. ` +
`High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
`is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
`hardcoded secrets, or obfuscated instructions embedded in code or config.`,
file: relPath,
line: lineNo,
evidence,
owasp,
recommendation:
'Inspect this high-entropy string — it may contain an encoded payload, ' +
'hardcoded secret, or obfuscated code',
})
);
}
}
return findings;
}
// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------
/**
* Scan a target path for high-entropy encoded strings.
*
* @param {string} targetPath - Absolute path to scan (file or directory root)
* @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
* - Pre-computed file discovery result from the orchestrator
* @returns {Promise<object>} - Scanner result envelope
*/
export async function scan(targetPath, discovery) {
const startMs = Date.now();
const allFindings = [];
let filesScanned = 0;
// Load policy for this target and apply overrides to module-level state.
// Best-effort — on any error we fall back to built-in defaults.
let policySource = 'defaults';
try {
const policy = loadPolicy(targetPath);
const ent = policy?.entropy || {};
THRESHOLDS = resolveThresholds(ent.thresholds);
USER_SUPPRESS_LINE_PATTERNS = compilePatterns(ent.suppress_line_patterns);
USER_SUPPRESS_PATHS = Array.isArray(ent.suppress_paths) ? ent.suppress_paths.slice() : [];
USER_SUPPRESS_EXTENSIONS = new Set(
(Array.isArray(ent.suppress_extensions) ? ent.suppress_extensions : [])
.filter((e) => typeof e === 'string')
.map((e) => e.toLowerCase()),
);
if (
ent.thresholds ||
(ent.suppress_line_patterns && ent.suppress_line_patterns.length > 0) ||
(ent.suppress_paths && ent.suppress_paths.length > 0) ||
(ent.suppress_extensions && ent.suppress_extensions.length > 0)
) {
policySource = 'policy.json';
}
} catch {
THRESHOLDS = DEFAULT_THRESHOLDS;
USER_SUPPRESS_LINE_PATTERNS = [];
USER_SUPPRESS_PATHS = [];
USER_SUPPRESS_EXTENSIONS = new Set();
}
let filesSkippedByExtension = 0;
let filesSkippedByPath = 0;
try {
for (const fileInfo of discovery.files) {
// Context-aware skip: GPU shaders, stylesheets, SVG, minified bundles.
// These file types produce ~70% false-positive rate on real codebases.
if (shouldSkipByExtension(fileInfo)) {
filesSkippedByExtension++;
continue;
}
// User-policy path-substring skip (additive, for project-specific noise).
if (shouldSkipByPath(fileInfo)) {
filesSkippedByPath++;
continue;
}
const content = await readTextFile(fileInfo.absPath);
// readTextFile returns null for binary files or unreadable paths — skip silently
if (content === null) continue;
filesScanned++;
const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
allFindings.push(...fileFindings);
}
const durationMs = Date.now() - startMs;
const status = 'ok';
const result = scannerResult('entropy-scanner', status, allFindings, filesScanned, durationMs);
// Calibration stats for synthesizer — suppression & policy provenance.
result.calibration = {
files_skipped_by_extension: filesSkippedByExtension,
files_skipped_by_path: filesSkippedByPath,
skip_extensions: [...ENTROPY_SKIP_EXTENSIONS, '.min.js', '.min.css'],
policy_source: policySource,
thresholds: {
critical: { entropy: THRESHOLDS.CRITICAL.entropy, minLen: THRESHOLDS.CRITICAL.minLen },
high: { entropy: THRESHOLDS.HIGH.entropy, minLen: THRESHOLDS.HIGH.minLen },
medium: { entropy: THRESHOLDS.MEDIUM.entropy, minLen: THRESHOLDS.MEDIUM.minLen },
},
};
return result;
} catch (err) {
const durationMs = Date.now() - startMs;
return scannerResult(
'entropy-scanner',
'error',
allFindings,
filesScanned,
durationMs,
String(err?.message || err)
);
}
}