The v7.0.0 entropy-scanner rule 18 suppressed every line that matched the
markdown-image pattern — regardless of the URL host or what the URL
carried. A markdown image URL pointing at a non-CDN host (or carrying a
secret-shaped token in its query string) would therefore mask a real
high-entropy credential.
Refactor:
* MARKDOWN_IMAGE now captures the full URL (was a host-only prefix
matcher), so rule 18 can inspect host and query.
* MARKDOWN_IMAGE_CDN_HOSTS allowlist constant covers cdn./images./
media./assets./static./*.cdn./*.amazonaws.com/{s3,cloudfront}/
*.cloudflare./*.fastly./*.akamaized./raw.githubusercontent.com/
*.imgix.net/*.cloudinary.com/.
* MARKDOWN_IMAGE_QUERY_SECRET catches secret-shaped query keys
(token, key, secret, password, api_key, access_token, auth) plus
well-known provider prefixes (AKIA, Bearer, sk_live_, ghp_, ghs_,
ghu_, gho_, ghr_, npm_).
* Rule 18 now suppresses iff (host matches CDN allowlist) AND
(query has no secret-shaped token). Anything else falls through
to entropy classification.
+4 tests in tests/scanners/entropy-context.test.mjs (29 → 33).
Existing rule 18 fixture (cdn.example.com, no secret query) still
suppresses, so no regression on the legitimate path.
Refs: Batch B Wave 5 / Step 13 / v7.2.0
critical-review-2026-04-20.md §E18
648 lines
26 KiB
JavaScript
648 lines
26 KiB
JavaScript
// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
|
||
// Zero dependencies (Node.js builtins only via lib helpers).
|
||
//
|
||
// Rationale: Malicious skills and MCP servers often hide injected instructions,
|
||
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
|
||
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
|
||
//
|
||
// References:
|
||
// - OWASP LLM01 (Prompt Injection via encoded payloads)
|
||
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
|
||
// - ToxicSkills research: evasion via base64-wrapped instructions
|
||
|
||
import { existsSync } from 'node:fs';
|
||
import { join } from 'node:path';
|
||
import { readTextFile } from './lib/file-discovery.mjs';
|
||
import { finding, scannerResult } from './lib/output.mjs';
|
||
import { SEVERITY } from './lib/severity.mjs';
|
||
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
|
||
import { loadPolicy } from './lib/policy-loader.mjs';
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// File-extension suppression (context-aware, v7.0.0+)
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
 * Extensions whose contents are almost always benign high-entropy noise:
 * GPU shaders, stylesheets, SVG markup. Scanning these produces massive
 * false-positive rates (observed 70% FP on hyperframes renderer codebase).
 *
 * User-policy extensions are merged separately (see USER_SUPPRESS_EXTENSIONS
 * and shouldSkipByExtension) — this set is the built-in baseline only.
 */
const ENTROPY_SKIP_EXTENSIONS = new Set([
  '.glsl', '.frag', '.vert', '.shader', '.wgsl', // GPU shaders
  '.css', '.scss', '.sass', '.less', // stylesheets
  '.svg', // SVG markup
]);
|
||
|
||
/**
 * Decide whether a file is excluded from entropy scanning by name alone.
 *
 * Skips minified bundles (`*.min.js` / `*.min.css`), the built-in
 * high-entropy-noise extensions (ENTROPY_SKIP_EXTENSIONS), and any
 * user-policy suppressed extensions (USER_SUPPRESS_EXTENSIONS).
 *
 * @param {{ relPath: string, ext: string }} fileInfo
 * @returns {boolean} true if the file should be skipped entirely
 */
function shouldSkipByExtension(fileInfo) {
  const relLower = (fileInfo.relPath || '').toLowerCase();
  // Minified bundles are one long high-entropy line — never worth scanning.
  const isMinifiedBundle = ['.min.js', '.min.css'].some((suffix) => relLower.endsWith(suffix));
  if (isMinifiedBundle) return true;

  const extLower = (fileInfo.ext || '').toLowerCase();
  return ENTROPY_SKIP_EXTENSIONS.has(extLower) || USER_SUPPRESS_EXTENSIONS.has(extLower);
}
|
||
|
||
/**
 * Check whether a file's relative path hits a user-policy skip-path rule.
 *
 * Matching is plain substring containment; non-string or empty needles in
 * the policy list are ignored.
 *
 * @param {{ relPath: string }} fileInfo
 * @returns {boolean} true if the file's relative path matches any user-policy skip-path substring.
 */
function shouldSkipByPath(fileInfo) {
  if (USER_SUPPRESS_PATHS.length === 0) return false;
  const rel = fileInfo.relPath || '';
  return USER_SUPPRESS_PATHS.some(
    (needle) => typeof needle === 'string' && needle.length > 0 && rel.includes(needle),
  );
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Constants
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
 *
 *   Plaintext prose:       H ≈ 3.5–4.2 (len 20–50)
 *   Structured code/JSON:  H ≈ 3.9–4.4 (len 40–80)
 *   SQL queries:           H ≈ 4.2–4.5 (len 50–100)
 *   Base64 len=40:         H ≈ 4.4–5.2 (avg 4.8, p90 5.0)
 *   Base64 len=64:         H ≈ 4.9–5.4 (avg 5.2, p90 5.3)
 *   Base64 len=80:         H ≈ 5.0–5.6 (avg 5.3, p90 5.5)
 *   Base64 len=128:        H ≈ 5.4–5.8 (avg 5.6, p90 5.7)
 *
 * Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
 * Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
 * the length-dependent entropy ceiling.
 *
 * Conservative design: prefer low false-negative rate (catch real payloads) at the cost
 * of some false positives that the analyst reviews. The false-positive suppression rules
 * above handle the most common benign cases.
 *
 * Policy overrides are merged over these via resolveThresholds(); the merged
 * result is stored in the module-level THRESHOLDS at scan() entry.
 */
const DEFAULT_THRESHOLDS = {
  // Large random-looking blob: very likely encoded/encrypted payload
  CRITICAL: { entropy: 5.4, minLen: 128 },
  // Medium-sized high-entropy string: likely encoded secret or payload fragment
  HIGH: { entropy: 5.1, minLen: 64 },
  // Shorter elevated-entropy string: suspicious but may be dense data/config
  MEDIUM: { entropy: 4.7, minLen: 40 },
};
|
||
|
||
/**
 * Merge policy.entropy.thresholds over defaults. Policy keys are lowercase
 * (critical/high/medium) to match other policy sections; defaults use
 * uppercase internally. Partial overrides are allowed — each tier keeps any
 * default field the policy does not supply.
 *
 * @param {object|undefined} policyThresholds
 * @returns {typeof DEFAULT_THRESHOLDS}
 */
function resolveThresholds(policyThresholds) {
  if (!policyThresholds) return DEFAULT_THRESHOLDS;
  // Spread-merge: policy fields win, missing fields fall back to the default tier.
  const mergeTier = (base, override) => ({ ...base, ...(override || {}) });
  return {
    CRITICAL: mergeTier(DEFAULT_THRESHOLDS.CRITICAL, policyThresholds.critical),
    HIGH: mergeTier(DEFAULT_THRESHOLDS.HIGH, policyThresholds.high),
    MEDIUM: mergeTier(DEFAULT_THRESHOLDS.MEDIUM, policyThresholds.medium),
  };
}
|
||
|
||
// Effective thresholds after policy-merge (set at scan() entry, read by classifyEntropy).
// NOTE(review): this and the three lets below are module-level mutable state,
// reset on every scan() call — presumably the orchestrator scans one target at
// a time; concurrent scan() calls would interleave policy state. TODO confirm.
let THRESHOLDS = DEFAULT_THRESHOLDS;

/** User-extensible line-level regex patterns compiled from policy. Set per scan. */
let USER_SUPPRESS_LINE_PATTERNS = [];

/** User-extensible relative-path substrings to skip entirely. Set per scan. */
let USER_SUPPRESS_PATHS = [];

/** User-extensible extension suppress list (merged with built-in). Set per scan. */
let USER_SUPPRESS_EXTENSIONS = new Set();
|
||
|
||
/** Known hash/checksum filename patterns — false positive suppression (rule 3). */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;

/** Line-level keywords that suggest integrity hashes rather than encoded payloads (rules 3 and 9). */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;

/** Integrity hash value prefixes (SRI format) — rule 8. */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;

/**
 * Known base64 image/font data-URI prefixes — rule 10. Each entry is the
 * base64 encoding of a well-known file magic number, so a string starting
 * with one of these is embedded media, not an encoded payload.
 */
const DATA_URI_PREFIXES = [
  'iVBORw0KGgo', // PNG
  '/9j/', // JPEG
  'R0lGOD', // GIF
  'PHN2Zy', // SVG
  'AAABAA', // ICO
  'T2dnUw', // OGG (audio)
  'AAAAFGZ0', // MP4
  'UklGR', // WebP/RIFF
  'd09G', // WOFF font
  'AAEAAAALAAI', // TTF font
];

/** UUID v4 pattern for false positive suppression — rule 5. */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/** Pure hex (case-insensitive — note the /i flag) that could be a hash digest, not obfuscated code. Rule 3. */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;

/** GLSL/WGSL shader keywords — suppress inline shader source (gl_Position, vec3, uniform, ...). Rule 11. */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;

/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes) — rule 12. */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;

/** Inline HTML/SVG markup in source (tags with attributes on the same line) — rule 13. */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;

/** ffmpeg filter-graph syntax (stream selectors + filter chains) — rule 14. */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;

/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded) — rule 15. */
const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/;

/** SQL DDL/DML statements (long structured strings, not encoded payloads) — rule 16. */
const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i;

/** Error-message templates with embedded HTML/markup (throw new Error("<div>...</div>")) — rule 17. */
const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxError)|new\s+Error\s*\()\s*[`'"]/;
|
||
|
||
/**
 * Markdown image syntax with an external URL — e.g. ``.
 * Common in JSON data indexes / article metadata; CDN URL hash segments
 * produce high Shannon entropy but are not credentials. Captures the full
 * URL so rule 18 can apply CDN-host + secret-in-query checks (E18, v7.2.0).
 */
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*(https?:\/\/[^)\s]+)/;

/**
 * Hosts that legitimately serve high-entropy hashed image URLs. Suppression
 * via rule 18 only applies when the markdown image URL host matches this
 * pattern AND the URL does not carry a secret-shaped token in its query
 * string. Anything else falls through to entropy classification (E18).
 */
const MARKDOWN_IMAGE_CDN_HOSTS = /^https?:\/\/(?:cdn\.|images\.|media\.|assets\.|static\.|[^/]*\.cdn\.|[^/]*\.amazonaws\.com\/(?:s3|cloudfront)\/|[^/]*\.cloudflare\.|[^/]*\.fastly\.|[^/]*\.akamaized\.|raw\.githubusercontent\.com\/|[^/]*\.imgix\.net\/|[^/]*\.cloudinary\.com\/)/i;

/**
 * Secret-shaped tokens that disqualify an otherwise-CDN markdown image from
 * suppression — query keys (`?token=`, `&api_key=`, etc.) and well-known
 * provider prefixes (AWS Access Key ID, Bearer header, GitHub PAT, npm
 * token, Stripe live key). Query-key alternatives require a preceding
 * start-of-query/`?`/`&`, so e.g. `?monkey=` does not match `key=`.
 */
const MARKDOWN_IMAGE_QUERY_SECRET = /(?:^|[?&])(?:token|key|secret|password|passwd|api[_-]?key|access[_-]?token|auth)=|AKIA[0-9A-Z]{14,}|Bearer\s|sk_live_|ghp_|ghs_|ghu_|gho_|ghr_|npm_/i;
||
|
||
/**
 * True when the URL's query string (the part after the first `?`) contains a
 * secret-shaped key or a well-known provider token prefix. URLs without a
 * query string never match.
 *
 * @param {string} url
 * @returns {boolean}
 */
function urlHasSecretInQuery(url) {
  const queryStart = url.indexOf('?');
  if (queryStart === -1) return false;
  return MARKDOWN_IMAGE_QUERY_SECRET.test(url.slice(queryStart + 1));
}
|
||
|
||
// ---------------------------------------------------------------------------
// File-context classification (B5, v7.2.0)
// ---------------------------------------------------------------------------

/** File extensions treated as pure shader/markup/code by classifyFileContext. */
const SHADER_EXTENSIONS = new Set(['.glsl', '.frag', '.vert', '.shader', '.wgsl']);
/** Markup/document extensions — classified 'markup-dominant' unconditionally. */
const MARKUP_EXTENSIONS = new Set(['.html', '.htm', '.svg', '.xml', '.md', '.markdown', '.mdx']);
/** Code extensions — candidates for 'code-dominant' after line sampling in classifyFileContext. */
const CODE_EXTENSIONS = new Set([
  '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
  '.py', '.go', '.rs', '.rb', '.java', '.cs',
  '.kt', '.scala', '.swift', '.cpp', '.c', '.h', '.hpp', '.php',
]);
|
||
|
||
/**
 * Classify a file as shader-dominant, markup-dominant, code-dominant, or mixed.
 *
 * Used by isFalsePositive() to gate rules 11-13 (GLSL/CSS-in-JS/inline-markup
 * line-proximity suppressions). Those rules fire only when context !== 'code-dominant'
 * — preventing the v7.0.0 polyglot false-negative (a real credential on a line
 * with an inline GLSL keyword would be incorrectly suppressed).
 *
 * Conservative defaults to minimize regression risk:
 *  - Unknown extensions → 'mixed' (all rules apply)
 *  - Code-extension files with very few non-blank lines (<5 sampled) → 'mixed'
 *  - Code-extension files where ≥50% of sampled lines match GLSL/inline-markup → 'mixed'
 *  - Code-extension files otherwise → 'code-dominant'
 *
 * @param {string} absPath
 * @param {string[]} lines
 * @returns {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'}
 */
function classifyFileContext(absPath, lines) {
  const lowerPath = absPath.toLowerCase();
  // Extension = everything from the basename's last dot (handles / and \ separators).
  const sepIdx = Math.max(lowerPath.lastIndexOf('/'), lowerPath.lastIndexOf('\\'));
  const fileName = sepIdx === -1 ? lowerPath : lowerPath.slice(sepIdx + 1);
  const lastDot = fileName.lastIndexOf('.');
  const ext = lastDot === -1 ? '' : fileName.slice(lastDot);

  if (SHADER_EXTENSIONS.has(ext)) return 'shader-dominant';
  if (MARKUP_EXTENSIONS.has(ext)) return 'markup-dominant';
  if (!CODE_EXTENSIONS.has(ext)) return 'mixed';

  // Code extension: sample up to 50 non-blank lines and count how many look
  // like inline shader source or markup.
  let sampled = 0;
  let suppressionHits = 0;
  for (const rawLine of lines) {
    if (sampled >= 50) break;
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) continue;
    sampled += 1;
    if (GLSL_KEYWORDS.test(trimmed) || INLINE_MARKUP.test(trimmed)) {
      suppressionHits += 1;
    }
  }

  // Too few non-blank lines to classify confidently → conservative default.
  if (sampled < 5) return 'mixed';
  // Mostly shader/markup despite the code extension → conservative default.
  return suppressionHits / sampled >= 0.5 ? 'mixed' : 'code-dominant';
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// False-positive suppression helpers
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
|
||
* Decide whether a candidate string should be suppressed (likely a false positive).
|
||
*
|
||
* v7.2.0 (B5): rules 11-13 (GLSL/CSS-in-JS/inline-markup line-proximity) are
|
||
* gated on `context !== 'code-dominant'`. In code-dominant files, an inline
|
||
* shader keyword next to a credential-shaped string is no longer a reason
|
||
* to suppress — that was the v7.0.0 polyglot false-negative (e.g. a `.ts`
|
||
* file with embedded GLSL block hiding a real secret on the next line).
|
||
*
|
||
* @param {string} str - The extracted string literal value
|
||
* @param {string} line - The full source line it came from
|
||
* @param {string} absPath - Absolute file path
|
||
* @param {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} [context='mixed']
|
||
* File-level classification from classifyFileContext.
|
||
* @returns {boolean} - true if this string should be skipped
|
||
*/
|
||
function isFalsePositive(str, line, absPath, context = 'mixed') {
|
||
// 1. URLs — entropy is misleading for long query strings / JWTs in URLs
|
||
if (str.startsWith('http://') || str.startsWith('https://')) return true;
|
||
|
||
// 2. File/system paths
|
||
if (
|
||
str.startsWith('/') ||
|
||
str.startsWith('./') ||
|
||
str.startsWith('../') ||
|
||
/^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\
|
||
) return true;
|
||
|
||
// 3. Known hash formats in lock/checksum contexts
|
||
if (HEX_HASH_PATTERN.test(str)) {
|
||
if (
|
||
LOCK_FILE_PATTERN.test(absPath) ||
|
||
INTEGRITY_KEYWORDS.test(line)
|
||
) return true;
|
||
}
|
||
|
||
// 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
|
||
if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true;
|
||
|
||
// 5. UUID patterns
|
||
if (UUID_PATTERN.test(str)) return true;
|
||
|
||
// 6. CSS / SVG / font data URIs embedded in source
|
||
if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;
|
||
|
||
// 7. Import / require paths — the string is a module specifier, not a payload
|
||
if (
|
||
/^\s*import\s/i.test(line) ||
|
||
/\brequire\s*\(/i.test(line)
|
||
) return true;
|
||
|
||
// 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
|
||
if (SRI_PREFIX.test(str)) return true;
|
||
|
||
// 9. Line-level integrity keyword context (catches SRI in HTML <link> / <script> tags)
|
||
if (INTEGRITY_KEYWORDS.test(line)) return true;
|
||
|
||
// 10. Base64 image data-URI content (raw prefix check, separate from the line check above)
|
||
for (const prefix of DATA_URI_PREFIXES) {
|
||
if (str.startsWith(prefix)) return true;
|
||
}
|
||
|
||
// Rules 11-13 (v7.2.0 B5): line-proximity suppressions for shader/CSS/markup.
|
||
// Gated on context !== 'code-dominant' so that a credential adjacent to an
|
||
// inline GLSL keyword in a `.ts` file is no longer suppressed.
|
||
if (context !== 'code-dominant') {
|
||
// 11. GLSL/WGSL shader keywords on the line — inline shader source
|
||
if (GLSL_KEYWORDS.test(line)) return true;
|
||
|
||
// 12. CSS-in-JS (styled-components, emotion, vanilla-extract)
|
||
if (CSS_IN_JS_PATTERN.test(line)) return true;
|
||
|
||
// 13. Inline HTML/SVG markup — React/Vue components, email templates
|
||
if (INLINE_MARKUP.test(line)) return true;
|
||
}
|
||
|
||
// 14. ffmpeg filter-graph syntax — long structured strings, not encoded
|
||
if (FFMPEG_SYNTAX.test(line)) return true;
|
||
|
||
// 15. Browser User-Agent strings — hardcoded but structured, not a payload
|
||
if (USER_AGENT_PATTERN.test(line)) return true;
|
||
|
||
// 16. SQL DDL/DML — long SELECT/INSERT/... lines
|
||
if (SQL_STATEMENT.test(line)) return true;
|
||
|
||
// 17. Error-message templates (throw new Error("<html>...</html>"))
|
||
if (ERROR_TEMPLATE.test(line)) return true;
|
||
|
||
// 18. Markdown image with external URL (E18, v7.2.0): suppress only when the
|
||
// URL host matches a known CDN allowlist AND the URL has no secret-shaped
|
||
// token in its query string. Non-CDN hosts and CDN hosts carrying
|
||
// secret-looking query parameters fall through to entropy classification.
|
||
const mdImgMatch = MARKDOWN_IMAGE.exec(line);
|
||
if (mdImgMatch) {
|
||
const url = mdImgMatch[1];
|
||
if (MARKDOWN_IMAGE_CDN_HOSTS.test(url) && !urlHasSecretInQuery(url)) {
|
||
return true;
|
||
}
|
||
}
|
||
|
||
// 19. User-policy regex patterns from .llm-security/policy.json
|
||
for (const pattern of USER_SUPPRESS_LINE_PATTERNS) {
|
||
if (pattern.test(line)) return true;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
/**
 * Compile a list of regex sources (strings) into RegExp objects.
 * Invalid patterns are silently skipped (policy is best-effort); non-string
 * and empty entries are ignored; a non-array input yields an empty list.
 *
 * @param {string[]} sources
 * @returns {RegExp[]}
 */
function compilePatterns(sources) {
  if (!Array.isArray(sources)) return [];
  const compiled = [];
  for (const source of sources) {
    if (typeof source !== 'string' || source === '') continue;
    try {
      compiled.push(new RegExp(source));
    } catch {
      // Malformed user-supplied regex — skip rather than fail the scan.
    }
  }
  return compiled;
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Severity classification
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
 * Derive severity from entropy and string length, checking tiers from most
 * to least severe. Returns null if below all thresholds.
 *
 * @param {number} H - Shannon entropy
 * @param {number} len - String length
 * @returns {string|null}
 */
function classifyEntropy(H, len) {
  const tiers = [
    ['CRITICAL', SEVERITY.CRITICAL],
    ['HIGH', SEVERITY.HIGH],
    ['MEDIUM', SEVERITY.MEDIUM],
  ];
  for (const [tierKey, severity] of tiers) {
    const { entropy, minLen } = THRESHOLDS[tierKey];
    if (H >= entropy && len >= minLen) return severity;
  }
  return null;
}
|
||
|
||
/**
 * Merge two severities, keeping the higher one. `null` (no severity) always
 * loses to a real severity; two nulls yield null.
 *
 * Fix: the previous implementation ranked any string NOT present in the
 * severity order at `indexOf === -1`, which made an unknown/garbage severity
 * outrank CRITICAL. Unknown values now rank lowest (like null) so they can
 * never mask or inflate a real severity.
 *
 * @param {string|null} a
 * @param {string|null} b
 * @returns {string|null}
 */
function maxSeverity(a, b) {
  const order = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
  const rank = (s) => {
    if (s === null) return Infinity;
    const idx = order.indexOf(s);
    // Unknown severity strings rank lowest instead of highest (was -1 → highest).
    return idx === -1 ? Infinity : idx;
  };
  return rank(a) <= rank(b) ? a : b;
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Per-file scanning
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
 * Scan a single file's content for high-entropy strings.
 *
 * Pipeline per line: extract quoted string literals (plus unquoted YAML
 * values), drop short/suppressed candidates, classify by Shannon entropy,
 * bump structured base64/hex blobs to at least MEDIUM, de-duplicate, and
 * emit findings with OWASP mapping (LLM01 for likely injection payloads,
 * LLM03 otherwise).
 *
 * @param {string} content - File text content
 * @param {string} absPath - Absolute file path (for suppression checks)
 * @param {string} relPath - Relative path (for finding output)
 * @returns {object[]} - Array of finding objects
 */
function scanFileContent(content, absPath, relPath) {
  const results = [];
  const sourceLines = content.split('\n');
  // v7.2.0 (B5): classify the file once per scan; rules 11-13 inside
  // isFalsePositive are gated on this context.
  const fileContext = classifyFileContext(absPath, sourceLines);

  // De-duplicate: (line, evidence-prefix) pairs, so the same string isn't
  // reported twice when caught by both extraction passes.
  const reported = new Set();

  for (const [lineIdx, sourceLine] of sourceLines.entries()) {
    const lineNo = lineIdx + 1;

    // Quoted string literals from the standard extractor (it scans the full
    // line, so assignment RHS values are covered too).
    const candidates = extractStringLiterals(sourceLine);

    // Unquoted YAML values (e.g. `key: AQIB3j0...`) are the one shape the
    // literal extractor cannot see — pull them with a dedicated pattern.
    const yamlValue = sourceLine.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
    if (yamlValue) candidates.push(yamlValue[1]);

    for (const candidate of candidates) {
      if (!candidate || candidate.length < 10) continue;

      // False positive suppression (19 rules + user policy).
      if (isFalsePositive(candidate, sourceLine, absPath, fileContext)) continue;

      const H = shannonEntropy(candidate);
      let severity = classifyEntropy(H, candidate.length);

      // Structured-encoding flags: base64/hex predicates are pure checks.
      const base64Like = isBase64Like(candidate);
      const base64Long = base64Like && candidate.length > 100;
      const hexLong = isHexBlob(candidate) && candidate.length > 64;

      if (severity === null) {
        // Very structured encodings can have slightly lower H than random but
        // are still suspicious at length >100 (base64) / >64 (hex).
        if (base64Long || hexLong) severity = SEVERITY.MEDIUM;
      } else if (base64Long || hexLong) {
        // Structured encoding confirms at least MEDIUM.
        severity = maxSeverity(severity, SEVERITY.MEDIUM);
      }

      if (severity === null) continue;

      // De-duplicate on (line, first 16 chars of candidate).
      const dedupeKey = `${lineNo}:${candidate.slice(0, 16)}`;
      if (reported.has(dedupeKey)) continue;
      reported.add(dedupeKey);

      // OWASP mapping: very high entropy or base64 → likely injection payload
      // (LLM01); otherwise supply-chain obfuscation (LLM03).
      const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || base64Like;
      const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';

      const evidencePreview = redact(candidate, 8, 4);
      const evidence = `H=${H.toFixed(2)}, len=${candidate.length}: ${evidencePreview}`;

      results.push(
        finding({
          scanner: 'ENT',
          severity,
          title: `High-entropy string (H=${H.toFixed(2)}, len=${candidate.length})`,
          description:
            `A string with unusually high Shannon entropy was detected. ` +
            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
          file: relPath,
          line: lineNo,
          evidence,
          owasp,
          recommendation:
            'Inspect this high-entropy string — it may contain an encoded payload, ' +
            'hardcoded secret, or obfuscated code',
        })
      );
    }
  }

  return results;
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Public scanner entry point
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
 * Scan a target path for high-entropy encoded strings.
 *
 * Loads the per-target policy (best-effort), installs it into the
 * module-level suppression state, then walks the pre-discovered file list —
 * skipping by extension and by user-policy path substrings before reading
 * content — and aggregates per-file findings plus calibration stats.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root)
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   - Pre-computed file discovery result from the orchestrator
 * @returns {Promise<object>} - Scanner result envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  // Policy load is best-effort — on any error we fall back to built-in
  // defaults. Provenance is tracked via a file-existence check, not by
  // comparing merged values (defaults always include an entropy section, so
  // a value-based check would always report 'policy.json').
  let policySource = 'defaults';
  try {
    if (existsSync(join(targetPath, '.llm-security', 'policy.json'))) {
      policySource = 'policy.json';
    }
    const entropyPolicy = loadPolicy(targetPath)?.entropy || {};
    THRESHOLDS = resolveThresholds(entropyPolicy.thresholds);
    USER_SUPPRESS_LINE_PATTERNS = compilePatterns(entropyPolicy.suppress_line_patterns);
    USER_SUPPRESS_PATHS = Array.isArray(entropyPolicy.suppress_paths)
      ? entropyPolicy.suppress_paths.slice()
      : [];
    const rawExtensions = Array.isArray(entropyPolicy.suppress_extensions)
      ? entropyPolicy.suppress_extensions
      : [];
    USER_SUPPRESS_EXTENSIONS = new Set(
      rawExtensions.filter((e) => typeof e === 'string').map((e) => e.toLowerCase()),
    );
  } catch {
    // Malformed policy — revert every piece of module state to defaults.
    THRESHOLDS = DEFAULT_THRESHOLDS;
    USER_SUPPRESS_LINE_PATTERNS = [];
    USER_SUPPRESS_PATHS = [];
    USER_SUPPRESS_EXTENSIONS = new Set();
    policySource = 'defaults';
  }

  let filesSkippedByExtension = 0;
  let filesSkippedByPath = 0;

  try {
    for (const fileInfo of discovery.files) {
      // Context-aware skip: GPU shaders, stylesheets, SVG, minified bundles —
      // these file types produce ~70% false-positive rates on real codebases.
      if (shouldSkipByExtension(fileInfo)) {
        filesSkippedByExtension += 1;
        continue;
      }

      // User-policy path-substring skip (additive, for project-specific noise).
      if (shouldSkipByPath(fileInfo)) {
        filesSkippedByPath += 1;
        continue;
      }

      const content = await readTextFile(fileInfo.absPath);
      // readTextFile returns null for binary files or unreadable paths — skip silently.
      if (content === null) continue;

      filesScanned += 1;
      allFindings.push(...scanFileContent(content, fileInfo.absPath, fileInfo.relPath));
    }

    const result = scannerResult(
      'entropy-scanner',
      'ok',
      allFindings,
      filesScanned,
      Date.now() - startMs,
    );
    // Calibration stats for the synthesizer — suppression & policy provenance.
    result.calibration = {
      files_skipped_by_extension: filesSkippedByExtension,
      files_skipped_by_path: filesSkippedByPath,
      skip_extensions: [...ENTROPY_SKIP_EXTENSIONS, '.min.js', '.min.css'],
      policy_source: policySource,
      thresholds: {
        critical: { entropy: THRESHOLDS.CRITICAL.entropy, minLen: THRESHOLDS.CRITICAL.minLen },
        high: { entropy: THRESHOLDS.HIGH.entropy, minLen: THRESHOLDS.HIGH.minLen },
        medium: { entropy: THRESHOLDS.MEDIUM.entropy, minLen: THRESHOLDS.MEDIUM.minLen },
      },
    };
    return result;
  } catch (err) {
    return scannerResult(
      'entropy-scanner',
      'error',
      allFindings,
      filesScanned,
      Date.now() - startMs,
      String(err?.message || err),
    );
  }
}
|