ktg-plugin-marketplace/plugins/llm-security/scanners/entropy-scanner.mjs
Kjell Tore Guttormsen 04f1593df3 refactor(entropy): B5 — two-stage context-classified suppression pipeline
The v7.0.0 entropy-scanner ran rules 11-13 (GLSL/CSS-in-JS/inline-markup
line-proximity suppressions) for every line regardless of file type. A
polyglot `.ts` file with an embedded fragment-shader template literal
could therefore mask a real high-entropy credential when the credential
literal happened to share a line with a GLSL keyword. Critical-review
B5 documented the false-negative class.

Refactor:

  * New `classifyFileContext(absPath, lines)` returns
    `'shader-dominant' | 'markup-dominant' | 'code-dominant' | 'mixed'`,
    keyed off file extension with a content-density fallback for
    code-extension files (≥50% of sampled non-blank lines matching
    GLSL/inline-markup → downgrade to `mixed`).

  * `isFalsePositive(str, line, absPath, context)` gates rules 11-13
    on `context !== 'code-dominant'`. Rules 1-10 and 14-19 still run
    unconditionally, so URL/path/test-fixture/ffmpeg/UA/SQL/error-
    template suppression behaves identically.

  * `scanFileContent` computes `fileContext` once per file and threads
    it through every per-string suppression check.

Conservative defaults to keep the regression surface minimal:

  * Files with `<5` sampled non-blank lines fall back to `mixed`
    (preserves the existing rule-11/12/13 behaviour for the single-
    line .js fixtures used by entropy-context.test.mjs).
  * Unknown extensions fall back to `mixed`.
  * Code-extension files densely populated with shader/markup
    content fall back to `mixed`.

Net effect: a `.ts` file with an embedded GLSL block but mostly TS
code on the surrounding lines now surfaces credentials that the
v7.0.0 line-proximity heuristic suppressed. Pure shader/markup
files are unaffected (extension skip / mixed default).

New fixture: tests/fixtures/entropy/polyglot-ts-with-glsl.ts (with
runtime placeholder so it does not commit a high-entropy literal).

+3 tests in tests/scanners/entropy-context.test.mjs (26 → 29).
Existing entropy.test.mjs and entropy-context.test.mjs all remain
green. Full suite 1658 → 1661.

Refs: Batch B Wave 5 / Step 12 / v7.2.0
critical-review-2026-04-20.md §B5
2026-04-29 15:13:13 +02:00

614 lines
25 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
// Zero dependencies (Node.js builtins only via lib helpers).
//
// Rationale: Malicious skills and MCP servers often hide injected instructions,
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
//
// References:
// - OWASP LLM01 (Prompt Injection via encoded payloads)
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
// - ToxicSkills research: evasion via base64-wrapped instructions
import { existsSync } from 'node:fs';
import { join } from 'node:path';
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
import { loadPolicy } from './lib/policy-loader.mjs';
// ---------------------------------------------------------------------------
// File-extension suppression (context-aware, v7.0.0+)
// ---------------------------------------------------------------------------
/**
 * File extensions that the entropy scanner skips outright.
 *
 * GPU shaders, stylesheets and SVG markup are dense, benign, high-entropy text;
 * scanning them mostly yields false positives (a ~70% FP rate was observed on
 * the hyperframes renderer codebase).
 */
const ENTROPY_SKIP_EXTENSIONS = new Set([
  ...['.glsl', '.frag', '.vert', '.shader', '.wgsl'], // GPU shaders
  ...['.css', '.scss', '.sass', '.less'],             // stylesheets
  ...['.svg'],                                        // SVG markup
]);
/**
 * Decide whether a file is excluded from entropy scanning because of its name.
 *
 * A file is skipped when its relative path ends in `.min.js` / `.min.css`, or
 * when its extension is in the built-in skip set or the user-policy skip set.
 * All comparisons are case-insensitive.
 *
 * @param {{ relPath: string, ext: string }} fileInfo
 * @returns {boolean} true if the file should be skipped entirely
 */
function shouldSkipByExtension(fileInfo) {
  const relLower = (fileInfo.relPath || '').toLowerCase();
  const isMinifiedBundle = ['.min.js', '.min.css'].some((suffix) => relLower.endsWith(suffix));
  if (isMinifiedBundle) return true;

  const extLower = (fileInfo.ext || '').toLowerCase();
  return ENTROPY_SKIP_EXTENSIONS.has(extLower) || USER_SUPPRESS_EXTENSIONS.has(extLower);
}
/**
 * Check the file's relative path against the user-policy skip-path list.
 *
 * Matching is a plain, case-sensitive substring test; entries that are not
 * non-empty strings are ignored.
 *
 * @param {{ relPath: string }} fileInfo
 * @returns {boolean} true if the file's relative path matches any user-policy skip-path substring.
 */
function shouldSkipByPath(fileInfo) {
  if (USER_SUPPRESS_PATHS.length === 0) return false;
  const relPath = fileInfo.relPath || '';
  return USER_SUPPRESS_PATHS.some(
    (needle) => typeof needle === 'string' && needle.length > 0 && relPath.includes(needle),
  );
}
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
*
* Plaintext prose: H ≈ 3.5-4.2 (len 20-50)
* Structured code/JSON: H ≈ 3.9-4.4 (len 40-80)
* SQL queries: H ≈ 4.2-4.5 (len 50-100)
* Base64 len=40: H ≈ 4.4-5.2 (avg 4.8, p90 5.0)
* Base64 len=64: H ≈ 4.9-5.4 (avg 5.2, p90 5.3)
* Base64 len=80: H ≈ 5.0-5.6 (avg 5.3, p90 5.5)
* Base64 len=128: H ≈ 5.4-5.8 (avg 5.6, p90 5.7)
*
* Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
* Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
* the length-dependent entropy ceiling.
*
* Conservative design: prefer low false-negative rate (catch real payloads) at the cost
* of some false positives that the analyst reviews. The false-positive suppression rules
* in isFalsePositive() handle the most common benign cases.
*/
// Built-in severity tiers. A string reaches a tier only when BOTH its entropy is
// >= `entropy` (bits/char) and its length is >= `minLen` (see classifyEntropy).
// resolveThresholds() returns this very object when no policy overrides exist,
// so it must not be mutated.
const DEFAULT_THRESHOLDS = {
// Large random-looking blob: very likely encoded/encrypted payload
CRITICAL: { entropy: 5.4, minLen: 128 },
// Medium-sized high-entropy string: likely encoded secret or payload fragment
HIGH: { entropy: 5.1, minLen: 64 },
// Shorter elevated-entropy string: suspicious but may be dense data/config
MEDIUM: { entropy: 4.7, minLen: 40 },
};
/**
 * Merge policy.entropy.thresholds over defaults. Policy keys are lowercase
 * (critical/high/medium) to match other policy sections; defaults use uppercase
 * internally.
 *
 * Only usable numeric overrides are honoured for `entropy` and `minLen`: a
 * finite number, or a string that parses to one (coerced to a number). Anything
 * else — notably JSON `null`, which would otherwise compare as 0 and make the
 * tier match every string — falls back to the built-in default for that field.
 * A tier override that is not a plain object is ignored as a whole.
 *
 * @param {object|undefined} policyThresholds
 * @returns {typeof DEFAULT_THRESHOLDS} DEFAULT_THRESHOLDS itself when there is
 *   nothing to merge, otherwise a fresh object (defaults are never mutated).
 */
function resolveThresholds(policyThresholds) {
  if (!policyThresholds) return DEFAULT_THRESHOLDS;

  // Normalise one override value; returns undefined when it is not usable.
  const toFiniteNumber = (value) => {
    if (typeof value === 'number') return Number.isFinite(value) ? value : undefined;
    if (typeof value === 'string' && value.trim() !== '') {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : undefined;
    }
    return undefined;
  };

  const mergeTier = (base, override) => {
    const isPlainObject =
      typeof override === 'object' && override !== null && !Array.isArray(override);
    const merged = { ...base, ...(isPlainObject ? override : {}) };
    for (const field of ['entropy', 'minLen']) {
      // Keep the default whenever the override cannot serve as a threshold.
      merged[field] = toFiniteNumber(merged[field]) ?? base[field];
    }
    return merged;
  };

  return {
    CRITICAL: mergeTier(DEFAULT_THRESHOLDS.CRITICAL, policyThresholds.critical),
    HIGH: mergeTier(DEFAULT_THRESHOLDS.HIGH, policyThresholds.high),
    MEDIUM: mergeTier(DEFAULT_THRESHOLDS.MEDIUM, policyThresholds.medium),
  };
}
// Module-level scan state. Each of the four bindings below is reassigned at the
// start of every scan() call (and reset to these defaults if policy loading throws).
// NOTE(review): because this state is shared by the whole module, overlapping
// scan() calls would see each other's policy — confirm scans are run serially.

// Effective thresholds after policy-merge (set at scan() entry; read by
// classifyEntropy, scanFileContent and the calibration block in scan()).
let THRESHOLDS = DEFAULT_THRESHOLDS;
/** User-supplied line-level RegExp objects compiled from policy (rule 19 in isFalsePositive). Set per scan. */
let USER_SUPPRESS_LINE_PATTERNS = [];
/** User-supplied relative-path substrings; files whose relPath contains one are skipped entirely. Set per scan. */
let USER_SUPPRESS_PATHS = [];
/** User-supplied lower-cased extensions, checked in addition to ENTROPY_SKIP_EXTENSIONS. Set per scan. */
let USER_SUPPRESS_EXTENSIONS = new Set();
/** Known hash/checksum filename patterns — false positive suppression. */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;
/** Line-level keywords that suggest integrity hashes rather than encoded payloads. */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;
/** Integrity hash value prefixes (SRI format). */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;
/** Known base64 image/font data-URI prefixes. */
const DATA_URI_PREFIXES = [
'iVBORw0KGgo', // PNG
'/9j/', // JPEG
'R0lGOD', // GIF
'PHN2Zy', // SVG
'AAABAA', // ICO
'T2dnUw', // OGG (audio)
'AAAAFGZ0', // MP4
'UklGR', // WebP/RIFF
'd09G', // WOFF font
'AAEAAAALAAI', // TTF font
];
/** UUID v4 pattern for false positive suppression. */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
/** Pure lowercase hex that could be a hash digest (not obfuscated code). */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
/** GLSL/WGSL shader keywords — suppress inline shader source (gl_Position, vec3, uniform, ...). */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;
/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes). */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;
/** Inline HTML/SVG markup in source (tags with attributes on the same line). */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;
/** ffmpeg filter-graph syntax (stream selectors + filter chains). */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;
/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded). */
const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/;
/** SQL DDL/DML statements (long structured strings, not encoded payloads). */
const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i;
/** Error-message templates with embedded HTML/markup (throw new Error("<div>...</div>")). */
const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxError)|new\s+Error\s*\()\s*[`'"]/;
/**
* Markdown image syntax with external URL — `![alt](https://cdn.../hash.ext)`.
* Common in JSON data indexes / article metadata; CDN URL hash segments
* produce high Shannon entropy but are not credentials.
*/
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//;
// ---------------------------------------------------------------------------
// File-context classification (B5, v7.2.0)
// ---------------------------------------------------------------------------
/**
 * Extension tables consulted by classifyFileContext (all lower-case, with the
 * leading dot): shader sources, markup/documentation formats, and programming
 * languages respectively.
 */
const SHADER_EXTENSIONS = new Set(
  ['glsl', 'frag', 'vert', 'shader', 'wgsl'].map((name) => `.${name}`),
);
const MARKUP_EXTENSIONS = new Set(
  ['html', 'htm', 'svg', 'xml', 'md', 'markdown', 'mdx'].map((name) => `.${name}`),
);
const CODE_EXTENSIONS = new Set(
  [
    'ts', 'tsx', 'js', 'jsx', 'mjs', 'cjs',
    'py', 'go', 'rs', 'rb', 'java', 'cs',
    'kt', 'scala', 'swift', 'cpp', 'c', 'h', 'hpp', 'php',
  ].map((name) => `.${name}`),
);
/**
 * Classify a file as shader-dominant, markup-dominant, code-dominant, or mixed.
 *
 * isFalsePositive() uses the result to gate rules 11-13 (GLSL / CSS-in-JS /
 * inline-markup line-proximity suppressions): they apply only when the context
 * is not 'code-dominant'. This closes the v7.0.0 polyglot false-negative, where
 * a real credential sharing a line with an inline GLSL keyword was suppressed.
 *
 * Decision order:
 *   1. Shader extension → 'shader-dominant'; markup extension → 'markup-dominant'.
 *   2. Extension not in the code list (including no extension) → 'mixed'.
 *   3. Code extension: inspect up to the first 50 non-blank lines.
 *        - fewer than 5 sampled lines                      → 'mixed'
 *        - ≥50% of sampled lines match GLSL/inline-markup  → 'mixed'
 *        - otherwise                                       → 'code-dominant'
 *
 * 'mixed' is the conservative default: every suppression rule stays active.
 *
 * @param {string} absPath
 * @param {string[]} lines
 * @returns {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'}
 */
function classifyFileContext(absPath, lines) {
  const normalized = absPath.toLowerCase();
  // Take the file name after the last '/' or '\', then everything from its last dot.
  const lastSeparator = Math.max(normalized.lastIndexOf('/'), normalized.lastIndexOf('\\'));
  const fileName = normalized.slice(lastSeparator + 1);
  const lastDot = fileName.lastIndexOf('.');
  const extension = lastDot === -1 ? '' : fileName.slice(lastDot);

  if (SHADER_EXTENSIONS.has(extension)) return 'shader-dominant';
  if (MARKUP_EXTENSIONS.has(extension)) return 'markup-dominant';
  if (!CODE_EXTENSIONS.has(extension)) return 'mixed';

  const maxSamples = 50;
  let sampleCount = 0;
  let shaderOrMarkupCount = 0;
  for (const rawLine of lines) {
    if (sampleCount >= maxSamples) break;
    const text = rawLine.trim();
    if (text === '') continue;
    sampleCount += 1;
    if (GLSL_KEYWORDS.test(text) || INLINE_MARKUP.test(text)) {
      shaderOrMarkupCount += 1;
    }
  }

  // Too little evidence to call the file code-dominant.
  if (sampleCount < 5) return 'mixed';
  // Half or more of the sample looks like shader/markup despite the code extension.
  return shaderOrMarkupCount / sampleCount >= 0.5 ? 'mixed' : 'code-dominant';
}
// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------
/**
 * Decide whether a candidate string should be suppressed (likely a false positive).
 *
 * Nineteen rules are evaluated in a fixed order; the first match suppresses.
 * Rules 1, 2, 5, 8 and 10 look at the string itself, rule 4 at the file path,
 * rule 3 at all three inputs, and the rest at the surrounding source line.
 *
 * v7.2.0 (B5): rules 11-13 (GLSL / CSS-in-JS / inline-markup line proximity)
 * run only when `context !== 'code-dominant'`. In a code-dominant file a shader
 * keyword sharing a line with a credential-shaped string is no longer grounds
 * for suppression (the v7.0.0 polyglot false-negative).
 *
 * NOTE(review): rule 4 is an unanchored substring match on the path, so any
 * path containing "test", "spec", "mock" or "fixture" anywhere is suppressed —
 * confirm that breadth is intended.
 *
 * @param {string} str - The extracted string literal value
 * @param {string} line - The full source line it came from
 * @param {string} absPath - Absolute file path
 * @param {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} [context='mixed']
 *   File-level classification from classifyFileContext.
 * @returns {boolean} - true if this string should be skipped
 */
function isFalsePositive(str, line, absPath, context = 'mixed') {
  // 1. URLs — entropy is misleading for long query strings / JWTs in URLs
  const looksLikeUrl = ['http://', 'https://'].some((scheme) => str.startsWith(scheme));
  if (looksLikeUrl) return true;

  // 2. File/system paths (POSIX absolute/relative, or a Windows drive letter such as C:\)
  const looksLikePath =
    ['/', './', '../'].some((lead) => str.startsWith(lead)) || /^[A-Za-z]:[/\\]/.test(str);
  if (looksLikePath) return true;

  // 3. Hex digests inside lock files or on a line that mentions an integrity keyword
  if (
    HEX_HASH_PATTERN.test(str) &&
    (LOCK_FILE_PATTERN.test(absPath) || INTEGRITY_KEYWORDS.test(line))
  ) {
    return true;
  }

  // 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
  if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true;

  // 5. UUID-shaped strings
  if (UUID_PATTERN.test(str)) return true;

  // 6. CSS / SVG / font data URIs embedded in source
  if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;

  // 7. Import / require lines — the string is a module specifier, not a payload
  if (/^\s*import\s/i.test(line) || /\brequire\s*\(/i.test(line)) return true;

  // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
  // 9. Integrity keyword anywhere on the line (SRI in HTML <link> / <script> tags)
  if (SRI_PREFIX.test(str) || INTEGRITY_KEYWORDS.test(line)) return true;

  // 10. Raw base64 media content, recognised by its leading characters
  if (DATA_URI_PREFIXES.some((prefix) => str.startsWith(prefix))) return true;

  // 11-13. Shader / CSS-in-JS / inline-markup proximity — skipped for code-dominant files.
  const proximityRules =
    context !== 'code-dominant' ? [GLSL_KEYWORDS, CSS_IN_JS_PATTERN, INLINE_MARKUP] : [];

  // 14-18. ffmpeg filter graphs, User-Agent strings, SQL statements,
  //        error-message templates, markdown images with external URLs.
  const structuredTextRules = [
    FFMPEG_SYNTAX,
    USER_AGENT_PATTERN,
    SQL_STATEMENT,
    ERROR_TEMPLATE,
    MARKDOWN_IMAGE,
  ];

  // 19. User-policy regex patterns from .llm-security/policy.json come last.
  const lineRules = [...proximityRules, ...structuredTextRules, ...USER_SUPPRESS_LINE_PATTERNS];
  return lineRules.some((pattern) => pattern.test(line));
}
/**
 * Turn a list of regex source strings into RegExp objects (no flags).
 *
 * Policy input is handled best-effort: a non-array argument yields an empty
 * list, and entries that are not non-empty strings or that fail to compile are
 * dropped without raising.
 *
 * @param {string[]} sources
 * @returns {RegExp[]}
 */
function compilePatterns(sources) {
  if (!Array.isArray(sources)) return [];
  return sources.flatMap((source) => {
    if (typeof source !== 'string' || source.length === 0) return [];
    try {
      return [new RegExp(source)];
    } catch {
      // Malformed pattern in user policy — ignore it and keep the rest.
      return [];
    }
  });
}
// ---------------------------------------------------------------------------
// Severity classification
// ---------------------------------------------------------------------------
/**
 * Map an (entropy, length) pair to a severity using the effective THRESHOLDS.
 *
 * Tiers are tried from most to least severe; a tier matches only when both the
 * entropy and the length reach that tier's minimums.
 *
 * @param {number} H - Shannon entropy
 * @param {number} len - String length
 * @returns {string|null} the matching severity, or null if below all thresholds
 */
function classifyEntropy(H, len) {
  const tiers = [
    [THRESHOLDS.CRITICAL, SEVERITY.CRITICAL],
    [THRESHOLDS.HIGH, SEVERITY.HIGH],
    [THRESHOLDS.MEDIUM, SEVERITY.MEDIUM],
  ];
  const match = tiers.find(([limit]) => H >= limit.entropy && len >= limit.minLen);
  return match === undefined ? null : match[1];
}
/**
 * Return the more severe of two severities.
 *
 * `null` counts as "no severity" and loses to any real value; when both sides
 * rank equally the first argument is returned.
 *
 * @param {string|null} a
 * @param {string|null} b
 * @returns {string|null}
 */
function maxSeverity(a, b) {
  // Most severe first: a lower index means a higher severity.
  const ladder = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
  const position = (level) => {
    if (level === null) return Infinity;
    return ladder.indexOf(level);
  };
  if (position(b) < position(a)) return b;
  return a;
}
// ---------------------------------------------------------------------------
// Per-file scanning
// ---------------------------------------------------------------------------
/**
 * Scan a single file's content for high-entropy strings.
 *
 * The content is processed line by line. Candidates on each line are the string
 * literals returned by extractStringLiterals() plus, for `key: value` lines, an
 * unquoted base64-alphabet value of 20+ characters. A candidate is reported
 * when it is at least 10 characters long, is not suppressed by
 * isFalsePositive(), and either reaches an entropy tier (classifyEntropy) or
 * is a long base64-like / hex blob.
 *
 * Lines are split on LF or CRLF. Splitting on '\n' alone would leave a
 * trailing '\r' on every line of a CRLF file, and the `$`-anchored
 * unquoted-value pattern below would then never match.
 *
 * @param {string} content - File text content
 * @param {string} absPath - Absolute file path (for suppression checks)
 * @param {string} relPath - Relative path (for finding output)
 * @returns {object[]} - Array of finding objects
 */
function scanFileContent(content, absPath, relPath) {
  const findings = [];
  const lines = content.split(/\r?\n/);

  // v7.2.0 (B5): classify the file once per scan; rules 11-13 inside
  // isFalsePositive are gated on this context.
  const fileContext = classifyFileContext(absPath, lines);

  // De-duplicate on (line number, first 16 characters) so that a value picked
  // up by both the literal extractor and the unquoted-value match is reported once.
  const seen = new Set();

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const lineNo = lineIdx + 1;

    // Quoted string literals on this line.
    const literalCandidates = extractStringLiterals(line);

    // Unquoted YAML-style values (`key: AQIB3j0...`, optionally followed by a
    // `# comment`) are not quoted, so they are matched separately.
    const unquotedYamlMatch = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
    if (unquotedYamlMatch) {
      literalCandidates.push(unquotedYamlMatch[1]);
    }

    for (const str of literalCandidates) {
      if (!str || str.length < 10) continue;

      // False positive suppression
      if (isFalsePositive(str, line, absPath, fileContext)) continue;

      const H = shannonEntropy(str);
      let severity = classifyEntropy(H, str.length);

      // Additional detection: base64-like blobs and hex blobs get at least MEDIUM
      // even if entropy alone didn't trigger (very structured encodings can have
      // slightly lower H than random but are still suspicious at length >100/64).
      if (severity === null) {
        if (isBase64Like(str) && str.length > 100) {
          severity = SEVERITY.MEDIUM;
        } else if (isHexBlob(str) && str.length > 64) {
          severity = SEVERITY.MEDIUM;
        }
      } else {
        // Structured encoding can upgrade or confirm severity
        if (isBase64Like(str) && str.length > 100) {
          severity = maxSeverity(severity, SEVERITY.MEDIUM);
        }
        if (isHexBlob(str) && str.length > 64) {
          severity = maxSeverity(severity, SEVERITY.MEDIUM);
        }
      }
      if (severity === null) continue;

      // De-duplicate
      const key = `${lineNo}:${str.slice(0, 16)}`;
      if (seen.has(key)) continue;
      seen.add(key);

      // OWASP mapping:
      // - entropy at or above the CRITICAL threshold, or base64-like content
      //   → likely injection payload → LLM01
      // - everything else (e.g. hex-encoded supply-chain obfuscation) → LLM03
      const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || isBase64Like(str);
      const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';

      const evidencePreview = redact(str, 8, 4);
      const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${evidencePreview}`;

      findings.push(
        finding({
          scanner: 'ENT',
          severity,
          title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
          description:
            `A string with unusually high Shannon entropy was detected. ` +
            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
          file: relPath,
          line: lineNo,
          evidence,
          owasp,
          recommendation:
            'Inspect this high-entropy string — it may contain an encoded payload, ' +
            'hardcoded secret, or obfuscated code',
        })
      );
    }
  }
  return findings;
}
// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------
/**
 * Scan a target path for high-entropy encoded strings.
 *
 * Steps:
 *   1. Load the target's policy and apply its entropy overrides to the
 *      module-level state (thresholds, line patterns, skip paths, skip
 *      extensions). Any failure here falls back to built-in defaults.
 *   2. Walk `discovery.files`, skipping files by extension and by user-policy
 *      path, and run scanFileContent() on every readable text file.
 *   3. Return the scanner result envelope with a `calibration` section that
 *      records skip counts, policy provenance and the effective thresholds.
 *
 * An exception during step 2 or 3 yields an 'error' envelope that still carries
 * the findings collected so far.
 *
 * NOTE(review): the policy overrides live in module-level variables, so two
 * overlapping scan() calls for different targets would share them — confirm
 * that callers run scans one at a time.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root)
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   - Pre-computed file discovery result from the orchestrator
 * @returns {Promise<object>} - Scanner result envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  // Load policy for this target and apply overrides to module-level state.
  // Best-effort — on any error we fall back to built-in defaults. Provenance
  // tracked via file-existence check, not by comparing merged values (defaults
  // always include an entropy section so a value-based check would always
  // report 'policy.json').
  let policySource = 'defaults';
  try {
    if (existsSync(join(targetPath, '.llm-security', 'policy.json'))) {
      policySource = 'policy.json';
    }
    const policy = loadPolicy(targetPath);
    const ent = policy?.entropy || {};
    THRESHOLDS = resolveThresholds(ent.thresholds);
    USER_SUPPRESS_LINE_PATTERNS = compilePatterns(ent.suppress_line_patterns);
    USER_SUPPRESS_PATHS = Array.isArray(ent.suppress_paths) ? ent.suppress_paths.slice() : [];
    USER_SUPPRESS_EXTENSIONS = new Set(
      (Array.isArray(ent.suppress_extensions) ? ent.suppress_extensions : [])
        .filter((e) => typeof e === 'string')
        .map((e) => e.toLowerCase()),
    );
  } catch {
    THRESHOLDS = DEFAULT_THRESHOLDS;
    USER_SUPPRESS_LINE_PATTERNS = [];
    USER_SUPPRESS_PATHS = [];
    USER_SUPPRESS_EXTENSIONS = new Set();
    policySource = 'defaults';
  }

  let filesSkippedByExtension = 0;
  let filesSkippedByPath = 0;
  try {
    for (const fileInfo of discovery.files) {
      // Context-aware skip: GPU shaders, stylesheets, SVG, minified bundles.
      // These file types produce ~70% false-positive rate on real codebases.
      if (shouldSkipByExtension(fileInfo)) {
        filesSkippedByExtension++;
        continue;
      }
      // User-policy path-substring skip (additive, for project-specific noise).
      if (shouldSkipByPath(fileInfo)) {
        filesSkippedByPath++;
        continue;
      }
      const content = await readTextFile(fileInfo.absPath);
      // readTextFile returns null for binary files or unreadable paths — skip silently
      if (content === null) continue;
      filesScanned++;
      const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
      // Append one by one: push(...fileFindings) passes every element as a call
      // argument and throws RangeError once a single file yields a very large
      // number of findings, which would turn the whole scan into an 'error'.
      for (const fileFinding of fileFindings) {
        allFindings.push(fileFinding);
      }
    }
    const durationMs = Date.now() - startMs;
    const status = 'ok';
    const result = scannerResult('entropy-scanner', status, allFindings, filesScanned, durationMs);
    // Calibration stats for synthesizer — suppression & policy provenance.
    result.calibration = {
      files_skipped_by_extension: filesSkippedByExtension,
      files_skipped_by_path: filesSkippedByPath,
      skip_extensions: [...ENTROPY_SKIP_EXTENSIONS, '.min.js', '.min.css'],
      policy_source: policySource,
      thresholds: {
        critical: { entropy: THRESHOLDS.CRITICAL.entropy, minLen: THRESHOLDS.CRITICAL.minLen },
        high: { entropy: THRESHOLDS.HIGH.entropy, minLen: THRESHOLDS.HIGH.minLen },
        medium: { entropy: THRESHOLDS.MEDIUM.entropy, minLen: THRESHOLDS.MEDIUM.minLen },
      },
    };
    return result;
  } catch (err) {
    const durationMs = Date.now() - startMs;
    return scannerResult(
      'entropy-scanner',
      'error',
      allFindings,
      filesScanned,
      durationMs,
      String(err?.message || err)
    );
  }
}