ktg-plugin-marketplace/plugins/llm-security/scanners/entropy-scanner.mjs
Kjell Tore Guttormsen f0fb7505fb fix(entropy): E18 — rule 18 markdown-image CDN-aware + secret pre-check
The v7.0.0 entropy-scanner rule 18 suppressed every line whose pattern
matched ![…](https?://…) — regardless of the URL host or what the URL
carried. A markdown image URL pointing at a non-CDN host (or carrying a
secret-shaped token in its query string) would therefore mask a real
high-entropy credential.

Refactor:

  * MARKDOWN_IMAGE now captures the full URL (was a host-only prefix
    matcher), so rule 18 can inspect host and query.
  * MARKDOWN_IMAGE_CDN_HOSTS allowlist constant covers cdn./images./
    media./assets./static./*.cdn./*.amazonaws.com/{s3,cloudfront}/
    *.cloudflare./*.fastly./*.akamaized./raw.githubusercontent.com/
    *.imgix.net/*.cloudinary.com/.
  * MARKDOWN_IMAGE_QUERY_SECRET catches secret-shaped query keys
    (token, key, secret, password, api_key, access_token, auth) plus
    well-known provider prefixes (AKIA, Bearer, sk_live_, ghp_, ghs_,
    ghu_, gho_, ghr_, npm_).
  * Rule 18 now suppresses iff (host matches CDN allowlist) AND
    (query has no secret-shaped token). Anything else falls through
    to entropy classification.

+4 tests in tests/scanners/entropy-context.test.mjs (29 → 33).
Existing rule 18 fixture (cdn.example.com, no secret query) still
suppresses, so no regression on the legitimate path.

Refs: Batch B Wave 5 / Step 13 / v7.2.0
critical-review-2026-04-20.md §E18
2026-04-29 15:18:37 +02:00

648 lines
26 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
// Zero dependencies (Node.js builtins only via lib helpers).
//
// Rationale: Malicious skills and MCP servers often hide injected instructions,
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
//
// References:
// - OWASP LLM01 (Prompt Injection via encoded payloads)
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
// - ToxicSkills research: evasion via base64-wrapped instructions
import { existsSync } from 'node:fs';
import { join } from 'node:path';
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
import { loadPolicy } from './lib/policy-loader.mjs';
// ---------------------------------------------------------------------------
// File-extension suppression (context-aware, v7.0.0+)
// ---------------------------------------------------------------------------
/**
 * Extensions whose contents are almost always benign high-entropy noise:
 * GPU shaders, stylesheets, SVG markup. Scanning these produces massive
 * false-positive rates (observed 70% FP on hyperframes renderer codebase).
 * Consulted (lowercased) by shouldSkipByExtension alongside the user-policy
 * extension list.
 */
const ENTROPY_SKIP_EXTENSIONS = new Set([
  '.glsl', '.frag', '.vert', '.shader', '.wgsl', // GPU shaders
  '.css', '.scss', '.sass', '.less', // stylesheets
  '.svg', // SVG markup
]);
/**
 * Whole-file skip decision based on filename/extension.
 *
 * Minified bundles (`.min.js`/`.min.css`) are always skipped, then the
 * built-in shader/stylesheet/SVG list and the user-policy extension list
 * (both lowercase) are consulted.
 *
 * @param {{ relPath: string, ext: string }} fileInfo
 * @returns {boolean} true if the file should be skipped entirely
 */
function shouldSkipByExtension(fileInfo) {
  const pathLower = (fileInfo.relPath || '').toLowerCase();
  if (pathLower.endsWith('.min.js') || pathLower.endsWith('.min.css')) return true;
  const extLower = (fileInfo.ext || '').toLowerCase();
  return ENTROPY_SKIP_EXTENSIONS.has(extLower) || USER_SUPPRESS_EXTENSIONS.has(extLower);
}
/**
 * Whole-file skip decision based on user-policy path substrings.
 *
 * @param {{ relPath: string }} fileInfo
 * @returns {boolean} true if the file's relative path contains any configured
 * skip-path substring (empty policy list → never skips).
 */
function shouldSkipByPath(fileInfo) {
  const rel = fileInfo.relPath || '';
  return USER_SUPPRESS_PATHS.some(
    (needle) => typeof needle === 'string' && needle.length > 0 && rel.includes(needle),
  );
}
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
*
* Plaintext prose: H ≈ 3.5–4.2 (len 20–50)
* Structured code/JSON: H ≈ 3.9–4.4 (len 40–80)
* SQL queries: H ≈ 4.2–4.5 (len 50–100)
* Base64 len=40: H ≈ 4.4–5.2 (avg 4.8, p90 5.0)
* Base64 len=64: H ≈ 4.9–5.4 (avg 5.2, p90 5.3)
* Base64 len=80: H ≈ 5.0–5.6 (avg 5.3, p90 5.5)
* Base64 len=128: H ≈ 5.4–5.8 (avg 5.6, p90 5.7)
*
* Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
* Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
* the length-dependent entropy ceiling.
*
* Conservative design: prefer low false-negative rate (catch real payloads) at the cost
* of some false positives that the analyst reviews. The false-positive suppression rules
* in isFalsePositive() handle the most common benign cases.
*/
const DEFAULT_THRESHOLDS = {
  // Large random-looking blob: very likely encoded/encrypted payload
  CRITICAL: { entropy: 5.4, minLen: 128 },
  // Medium-sized high-entropy string: likely encoded secret or payload fragment
  HIGH: { entropy: 5.1, minLen: 64 },
  // Shorter elevated-entropy string: suspicious but may be dense data/config
  MEDIUM: { entropy: 4.7, minLen: 40 },
};
/**
 * Overlay policy-supplied entropy thresholds onto the built-in defaults.
 * Policy keys are lowercase (critical/high/medium), matching the other policy
 * sections; the internal table uses uppercase keys.
 *
 * @param {object|undefined} policyThresholds
 * @returns {typeof DEFAULT_THRESHOLDS} Defaults when no policy section exists,
 * otherwise a fresh merged table (defaults never mutated).
 */
function resolveThresholds(policyThresholds) {
  if (!policyThresholds) return DEFAULT_THRESHOLDS;
  const merged = {};
  for (const level of ['CRITICAL', 'HIGH', 'MEDIUM']) {
    const override = policyThresholds[level.toLowerCase()] || {};
    merged[level] = { ...DEFAULT_THRESHOLDS[level], ...override };
  }
  return merged;
}
// Effective thresholds after policy-merge (set at scan() entry, read by classifyEntropy).
let THRESHOLDS = DEFAULT_THRESHOLDS;
/** User-extensible line-level regex patterns compiled from policy. Set per scan (rule 19). */
let USER_SUPPRESS_LINE_PATTERNS = [];
/** User-extensible relative-path substrings to skip entirely. Set per scan. */
let USER_SUPPRESS_PATHS = [];
/** User-extensible extension suppress list (merged with built-in). Set per scan. */
let USER_SUPPRESS_EXTENSIONS = new Set();
/** Known hash/checksum filename patterns — false positive suppression (rule 3). */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;
/** Line-level keywords that suggest integrity hashes rather than encoded payloads (rules 3, 9). */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;
/** Integrity hash value prefixes (SRI format) — rule 8. */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;
/** Known base64 image/font data-URI prefixes — rule 10. */
const DATA_URI_PREFIXES = [
  'iVBORw0KGgo', // PNG
  '/9j/', // JPEG
  'R0lGOD', // GIF
  'PHN2Zy', // SVG
  'AAABAA', // ICO
  'T2dnUw', // OGG (audio)
  'AAAAFGZ0', // MP4
  'UklGR', // WebP/RIFF
  'd09G', // WOFF font
  'AAEAAAALAAI', // TTF font
];
/** UUID-shaped strings (8-4-4-4-12 hex; version nibble not enforced) — rule 5. */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
/** Pure hex (case-insensitive, 32–128 chars) that could be a hash digest — rule 3. */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
/** GLSL/WGSL shader keywords — inline shader source (rule 11; also file-context sampling). */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;
/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes) — rule 12. */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;
/** Inline HTML/SVG markup in source (rule 13; also file-context sampling). */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;
/** ffmpeg filter-graph syntax (stream selectors + filter chains) — rule 14. */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;
/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded) — rule 15. */
const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/;
/** SQL DDL/DML statements (long structured strings, not encoded payloads) — rule 16. */
const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i;
/** Error-message templates with embedded HTML/markup (throw new Error("<div>...</div>")) — rule 17. */
const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxError)|new\s+Error\s*\()\s*[`'"]/;
/**
 * Markdown image syntax with external URL — `![alt](https://cdn.../hash.ext)`.
 * Common in JSON data indexes / article metadata; CDN URL hash segments
 * produce high Shannon entropy but are not credentials. Capture group 1 is
 * the full URL so rule 18 can apply CDN-host + secret-in-query checks
 * (E18, v7.2.0).
 */
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*(https?:\/\/[^)\s]+)/;
/**
 * Hosts that legitimately serve high-entropy hashed image URLs. Suppression
 * via rule 18 only applies when the markdown image URL host matches this
 * pattern AND the URL does not carry a secret-shaped token in its query
 * string. Anything else falls through to entropy classification (E18).
 * NOTE(review): prefix alternatives like `cdn\.` and `[^/]*\.cloudflare\.`
 * match any domain starting with those labels, not a closed host list.
 */
const MARKDOWN_IMAGE_CDN_HOSTS = /^https?:\/\/(?:cdn\.|images\.|media\.|assets\.|static\.|[^/]*\.cdn\.|[^/]*\.amazonaws\.com\/(?:s3|cloudfront)\/|[^/]*\.cloudflare\.|[^/]*\.fastly\.|[^/]*\.akamaized\.|raw\.githubusercontent\.com\/|[^/]*\.imgix\.net\/|[^/]*\.cloudinary\.com\/)/i;
/**
 * Secret-shaped tokens that disqualify an otherwise-CDN markdown image from
 * suppression — query keys (`?token=`, `&api_key=`, …) plus well-known
 * provider prefixes (AWS Access Key ID, Bearer header, GitHub PATs, npm
 * token, Stripe live key).
 */
const MARKDOWN_IMAGE_QUERY_SECRET = /(?:^|[?&])(?:token|key|secret|password|passwd|api[_-]?key|access[_-]?token|auth)=|AKIA[0-9A-Z]{14,}|Bearer\s|sk_live_|ghp_|ghs_|ghu_|gho_|ghr_|npm_/i;
/**
 * Check whether a URL's query string carries a secret-shaped token.
 *
 * @param {string} url
 * @returns {boolean} false when the URL has no query string at all.
 */
function urlHasSecretInQuery(url) {
  const marker = url.indexOf('?');
  if (marker === -1) return false;
  const queryString = url.slice(marker + 1);
  return MARKDOWN_IMAGE_QUERY_SECRET.test(queryString);
}
// ---------------------------------------------------------------------------
// File-context classification (B5, v7.2.0)
// ---------------------------------------------------------------------------
/** File extensions treated as pure shader/markup/code by classifyFileContext. */
const SHADER_EXTENSIONS = new Set(['.glsl', '.frag', '.vert', '.shader', '.wgsl']);
const MARKUP_EXTENSIONS = new Set(['.html', '.htm', '.svg', '.xml', '.md', '.markdown', '.mdx']);
const CODE_EXTENSIONS = new Set([
'.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
'.py', '.go', '.rs', '.rb', '.java', '.cs',
'.kt', '.scala', '.swift', '.cpp', '.c', '.h', '.hpp', '.php',
]);
/**
 * Classify a file as shader-dominant, markup-dominant, code-dominant, or mixed.
 *
 * Used by isFalsePositive() to gate rules 11-13 (GLSL/CSS-in-JS/inline-markup
 * line-proximity suppressions). Those rules fire only when context !== 'code-dominant'
 * — preventing the v7.0.0 polyglot false-negative (a real credential on a line
 * with an inline GLSL keyword would be incorrectly suppressed).
 *
 * Conservative defaults to minimize regression risk:
 * - Unknown extensions → 'mixed' (all rules apply)
 * - Code-extension files with very few non-blank lines (<5 sampled) → 'mixed'
 * - Code-extension files where ≥50% of sampled lines match GLSL/inline-markup → 'mixed'
 * - Code-extension files otherwise → 'code-dominant'
 *
 * @param {string} absPath
 * @param {string[]} lines
 * @returns {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'}
 */
function classifyFileContext(absPath, lines) {
  const lowered = absPath.toLowerCase();
  // Extension of the basename; a path without a separator is its own basename.
  const sep = Math.max(lowered.lastIndexOf('/'), lowered.lastIndexOf('\\'));
  const base = lowered.slice(sep + 1); // sep === -1 → slice(0) → whole string
  const dot = base.lastIndexOf('.');
  const ext = dot === -1 ? '' : base.slice(dot);
  if (SHADER_EXTENSIONS.has(ext)) return 'shader-dominant';
  if (MARKUP_EXTENSIONS.has(ext)) return 'markup-dominant';
  if (!CODE_EXTENSIONS.has(ext)) return 'mixed';
  // Code extension: sample up to the first 50 non-blank lines and count how
  // many look like shader/markup content.
  let nonBlank = 0;
  let noisy = 0;
  for (const raw of lines) {
    if (nonBlank >= 50) break;
    const text = raw.trim();
    if (text.length === 0) continue;
    nonBlank++;
    if (GLSL_KEYWORDS.test(text) || INLINE_MARKUP.test(text)) noisy++;
  }
  // Too few non-blank lines to classify confidently → conservative default.
  if (nonBlank < 5) return 'mixed';
  // Mostly shader/markup despite the code extension → conservative default.
  if (noisy / nonBlank >= 0.5) return 'mixed';
  return 'code-dominant';
}
// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------
/**
 * Decide whether a candidate string should be suppressed (likely a false positive).
 *
 * Rules run in order and the first match wins; rule numbering is stable and
 * referenced by tests and commit history.
 *
 * v7.2.0 (B5): rules 11-13 (GLSL/CSS-in-JS/inline-markup line-proximity) are
 * gated on `context !== 'code-dominant'`. In code-dominant files, an inline
 * shader keyword next to a credential-shaped string is no longer a reason
 * to suppress — that was the v7.0.0 polyglot false-negative (e.g. a `.ts`
 * file with embedded GLSL block hiding a real secret on the next line).
 *
 * @param {string} str - The extracted string literal value
 * @param {string} line - The full source line it came from
 * @param {string} absPath - Absolute file path
 * @param {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} [context='mixed']
 * File-level classification from classifyFileContext.
 * @returns {boolean} - true if this string should be skipped
 */
function isFalsePositive(str, line, absPath, context = 'mixed') {
  // 1. URLs — entropy is misleading for long query strings / JWTs in URLs
  if (str.startsWith('http://') || str.startsWith('https://')) return true;
  // 2. File/system paths
  if (
    str.startsWith('/') ||
    str.startsWith('./') ||
    str.startsWith('../') ||
    /^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\
  ) return true;
  // 3. Known hash formats in lock/checksum contexts
  if (HEX_HASH_PATTERN.test(str)) {
    if (
      LOCK_FILE_PATTERN.test(absPath) ||
      INTEGRITY_KEYWORDS.test(line)
    ) return true;
  }
  // 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
  //    FIX: the previous bare-substring match (/test|spec|fixture|mock/i) also
  //    matched production paths like "latest/", "attestation.js", "inspect.ts",
  //    or "mockingbird.py", suppressing every finding in those files (a false-
  //    negative hole). The token must now be a whole path segment / filename
  //    word, delimited by a path separator, dot, underscore, hyphen, or the
  //    string boundary — still covers test/, tests/, testing/, testdata/,
  //    __tests__/, foo.spec.js, bar_test.go, fixtures/, mock-data/, etc.
  if (/(?:^|[\\/._-])(?:tests?|testing|testdata|specs?|fixtures?|mocks?)(?:[\\/._-]|$)/i.test(absPath)) {
    return true;
  }
  // 5. UUID patterns
  if (UUID_PATTERN.test(str)) return true;
  // 6. CSS / SVG / font data URIs embedded in source
  if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;
  // 7. Import / require paths — the string is a module specifier, not a payload
  if (
    /^\s*import\s/i.test(line) ||
    /\brequire\s*\(/i.test(line)
  ) return true;
  // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
  if (SRI_PREFIX.test(str)) return true;
  // 9. Line-level integrity keyword context (catches SRI in HTML <link> / <script> tags)
  if (INTEGRITY_KEYWORDS.test(line)) return true;
  // 10. Base64 image data-URI content (raw prefix check, separate from the line check above)
  for (const prefix of DATA_URI_PREFIXES) {
    if (str.startsWith(prefix)) return true;
  }
  // Rules 11-13 (v7.2.0 B5): line-proximity suppressions for shader/CSS/markup.
  // Gated on context !== 'code-dominant' so that a credential adjacent to an
  // inline GLSL keyword in a `.ts` file is no longer suppressed.
  if (context !== 'code-dominant') {
    // 11. GLSL/WGSL shader keywords on the line — inline shader source
    if (GLSL_KEYWORDS.test(line)) return true;
    // 12. CSS-in-JS (styled-components, emotion, vanilla-extract)
    if (CSS_IN_JS_PATTERN.test(line)) return true;
    // 13. Inline HTML/SVG markup — React/Vue components, email templates
    if (INLINE_MARKUP.test(line)) return true;
  }
  // 14. ffmpeg filter-graph syntax — long structured strings, not encoded
  if (FFMPEG_SYNTAX.test(line)) return true;
  // 15. Browser User-Agent strings — hardcoded but structured, not a payload
  if (USER_AGENT_PATTERN.test(line)) return true;
  // 16. SQL DDL/DML — long SELECT/INSERT/... lines
  if (SQL_STATEMENT.test(line)) return true;
  // 17. Error-message templates (throw new Error("<html>...</html>"))
  if (ERROR_TEMPLATE.test(line)) return true;
  // 18. Markdown image with external URL (E18, v7.2.0): suppress only when the
  //     URL host matches a known CDN allowlist AND the URL has no secret-shaped
  //     token in its query string. Non-CDN hosts and CDN hosts carrying
  //     secret-looking query parameters fall through to entropy classification.
  const mdImgMatch = MARKDOWN_IMAGE.exec(line);
  if (mdImgMatch) {
    const url = mdImgMatch[1];
    if (MARKDOWN_IMAGE_CDN_HOSTS.test(url) && !urlHasSecretInQuery(url)) {
      return true;
    }
  }
  // 19. User-policy regex patterns from .llm-security/policy.json
  for (const pattern of USER_SUPPRESS_LINE_PATTERNS) {
    if (pattern.test(line)) return true;
  }
  return false;
}
/**
 * Compile a list of regex sources (strings) into RegExp objects.
 * Invalid patterns are silently skipped (policy is best-effort); non-array
 * input and non-string / empty entries yield no patterns.
 *
 * @param {string[]} sources
 * @returns {RegExp[]}
 */
function compilePatterns(sources) {
  if (!Array.isArray(sources)) return [];
  const out = [];
  for (const raw of sources) {
    if (typeof raw !== 'string' || raw.length === 0) continue;
    try {
      out.push(new RegExp(raw));
    } catch {
      // malformed regex — skip
    }
  }
  return out;
}
// ---------------------------------------------------------------------------
// Severity classification
// ---------------------------------------------------------------------------
/**
 * Derive severity from entropy and string length against the effective
 * (policy-merged) thresholds, checking the most severe tier first.
 * Returns null if below all thresholds.
 *
 * @param {number} H - Shannon entropy
 * @param {number} len - String length
 * @returns {string|null}
 */
function classifyEntropy(H, len) {
  const tiers = [
    [THRESHOLDS.CRITICAL, SEVERITY.CRITICAL],
    [THRESHOLDS.HIGH, SEVERITY.HIGH],
    [THRESHOLDS.MEDIUM, SEVERITY.MEDIUM],
  ];
  for (const [tier, severity] of tiers) {
    if (H >= tier.entropy && len >= tier.minLen) return severity;
  }
  return null;
}
/**
 * Merge two severities, keeping the higher one (null counts as lowest).
 *
 * @param {string|null} a
 * @param {string|null} b
 * @returns {string|null}
 */
function maxSeverity(a, b) {
  const ladder = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
  const rankOf = (s) => (s === null ? Infinity : ladder.indexOf(s));
  // Strict comparison keeps `a` on ties, matching the original's `<=` form.
  return rankOf(b) < rankOf(a) ? b : a;
}
// ---------------------------------------------------------------------------
// Per-file scanning
// ---------------------------------------------------------------------------
/**
 * Scan a single file's content for high-entropy strings.
 *
 * @param {string} content - File text content
 * @param {string} absPath - Absolute file path (for suppression checks)
 * @param {string} relPath - Relative path (for finding output)
 * @returns {object[]} - Array of finding objects
 */
function scanFileContent(content, absPath, relPath) {
  const results = [];
  const lines = content.split('\n');
  // v7.2.0 (B5): classify the file once per scan; rules 11-13 inside
  // isFalsePositive are gated on this context.
  const fileContext = classifyFileContext(absPath, lines);
  // De-duplicate on (lineNo, first 16 chars) so the same string extracted
  // twice from one line is reported once.
  const reported = new Set();
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const lineNo = i + 1;
    // Quoted string literals from the standard extractor, plus unquoted YAML
    // values (`key: AQIB3j0...`) which quote-matching cannot see. Assignment
    // RHS values are already covered: extractStringLiterals scans the whole line.
    const candidates = extractStringLiterals(line);
    const yamlValue = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
    if (yamlValue) candidates.push(yamlValue[1]);
    for (const str of candidates) {
      if (!str || str.length < 10) continue;
      // False positive suppression
      if (isFalsePositive(str, line, absPath, fileContext)) continue;
      const H = shannonEntropy(str);
      let severity = classifyEntropy(H, str.length);
      // Structured encodings (long base64 / hex blobs) warrant at least MEDIUM
      // even when raw entropy alone falls below threshold — very structured
      // encodings can have slightly lower H than random data.
      const structuredBlob =
        (isBase64Like(str) && str.length > 100) ||
        (isHexBlob(str) && str.length > 64);
      if (structuredBlob) {
        severity = severity === null ? SEVERITY.MEDIUM : maxSeverity(severity, SEVERITY.MEDIUM);
      }
      if (severity === null) continue;
      const dedupKey = `${lineNo}:${str.slice(0, 16)}`;
      if (reported.has(dedupKey)) continue;
      reported.add(dedupKey);
      // OWASP mapping: very high entropy or base64 → likely injection payload
      // (LLM01); otherwise treat as supply-chain obfuscation (LLM03).
      const likelyPayload = H >= THRESHOLDS.CRITICAL.entropy || isBase64Like(str);
      const owasp = likelyPayload ? 'LLM01' : 'LLM03';
      const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${redact(str, 8, 4)}`;
      results.push(
        finding({
          scanner: 'ENT',
          severity,
          title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
          description:
            `A string with unusually high Shannon entropy was detected. ` +
            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
          file: relPath,
          line: lineNo,
          evidence,
          owasp,
          recommendation:
            'Inspect this high-entropy string — it may contain an encoded payload, ' +
            'hardcoded secret, or obfuscated code',
        })
      );
    }
  }
  return results;
}
// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------
/**
 * Scan a target path for high-entropy encoded strings.
 *
 * Module-state side effect: re-initialises THRESHOLDS and the USER_SUPPRESS_*
 * tables from the target's .llm-security/policy.json before walking the
 * discovered files (reset to built-in defaults on any policy error).
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root)
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 * - Pre-computed file discovery result from the orchestrator
 * @returns {Promise<object>} - Scanner result envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;
  // Load policy for this target and apply overrides to module-level state.
  // Best-effort — on any error we fall back to built-in defaults. Provenance
  // tracked via file-existence check, not by comparing merged values (defaults
  // always include an entropy section so a value-based check would always
  // report 'policy.json').
  let policySource = 'defaults';
  try {
    if (existsSync(join(targetPath, '.llm-security', 'policy.json'))) {
      policySource = 'policy.json';
    }
    const policy = loadPolicy(targetPath);
    const ent = policy?.entropy || {};
    THRESHOLDS = resolveThresholds(ent.thresholds);
    USER_SUPPRESS_LINE_PATTERNS = compilePatterns(ent.suppress_line_patterns);
    // slice() copies the array so later mutation of the policy object cannot
    // affect this scan's suppression state.
    USER_SUPPRESS_PATHS = Array.isArray(ent.suppress_paths) ? ent.suppress_paths.slice() : [];
    USER_SUPPRESS_EXTENSIONS = new Set(
      (Array.isArray(ent.suppress_extensions) ? ent.suppress_extensions : [])
        .filter((e) => typeof e === 'string')
        .map((e) => e.toLowerCase()),
    );
  } catch {
    // Malformed policy — reset every override so a previous scan's policy
    // cannot leak into this one.
    THRESHOLDS = DEFAULT_THRESHOLDS;
    USER_SUPPRESS_LINE_PATTERNS = [];
    USER_SUPPRESS_PATHS = [];
    USER_SUPPRESS_EXTENSIONS = new Set();
    policySource = 'defaults';
  }
  let filesSkippedByExtension = 0;
  let filesSkippedByPath = 0;
  try {
    for (const fileInfo of discovery.files) {
      // Context-aware skip: GPU shaders, stylesheets, SVG, minified bundles.
      // These file types produce ~70% false-positive rate on real codebases.
      if (shouldSkipByExtension(fileInfo)) {
        filesSkippedByExtension++;
        continue;
      }
      // User-policy path-substring skip (additive, for project-specific noise).
      if (shouldSkipByPath(fileInfo)) {
        filesSkippedByPath++;
        continue;
      }
      const content = await readTextFile(fileInfo.absPath);
      // readTextFile returns null for binary files or unreadable paths — skip silently
      if (content === null) continue;
      filesScanned++;
      const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
      allFindings.push(...fileFindings);
    }
    const durationMs = Date.now() - startMs;
    const status = 'ok';
    const result = scannerResult('entropy-scanner', status, allFindings, filesScanned, durationMs);
    // Calibration stats for synthesizer — suppression & policy provenance.
    result.calibration = {
      files_skipped_by_extension: filesSkippedByExtension,
      files_skipped_by_path: filesSkippedByPath,
      skip_extensions: [...ENTROPY_SKIP_EXTENSIONS, '.min.js', '.min.css'],
      policy_source: policySource,
      thresholds: {
        critical: { entropy: THRESHOLDS.CRITICAL.entropy, minLen: THRESHOLDS.CRITICAL.minLen },
        high: { entropy: THRESHOLDS.HIGH.entropy, minLen: THRESHOLDS.HIGH.minLen },
        medium: { entropy: THRESHOLDS.MEDIUM.entropy, minLen: THRESHOLDS.MEDIUM.minLen },
      },
    };
    return result;
  } catch (err) {
    // Findings collected before the failure are still returned in the error envelope.
    const durationMs = Date.now() - startMs;
    return scannerResult(
      'entropy-scanner',
      'error',
      allFindings,
      filesScanned,
      durationMs,
      String(err?.message || err)
    );
  }
}