ktg-plugin-marketplace/plugins/llm-security/scanners/entropy-scanner.mjs

329 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
// Zero dependencies (Node.js builtins only via lib helpers).
//
// Rationale: Malicious skills and MCP servers often hide injected instructions,
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
//
// References:
// - OWASP LLM01 (Prompt Injection via encoded payloads)
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
// - ToxicSkills research: evasion via base64-wrapped instructions
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Entropy thresholds (bits/char). Empirically calibrated against real distributions:
*
* Plaintext prose: H ≈ 3.5-4.2 (len 20-50)
* Structured code/JSON: H ≈ 3.9-4.4 (len 40-80)
* SQL queries: H ≈ 4.2-4.5 (len 50-100)
* Base64 len=40: H ≈ 4.4-5.2 (avg 4.8, p90 5.0)
* Base64 len=64: H ≈ 4.9-5.4 (avg 5.2, p90 5.3)
* Base64 len=80: H ≈ 5.0-5.6 (avg 5.3, p90 5.5)
* Base64 len=128: H ≈ 5.4-5.8 (avg 5.6, p90 5.7)
*
* Key insight: base64 alphabet is only 65 chars → max theoretical H = log2(65) ≈ 6.02.
* Random base64 of len 64 achieves H ≈ 5.2 on average. Thresholds must account for
* the length-dependent entropy ceiling.
*
* Conservative design: prefer a low false-negative rate (catch real payloads) at the cost
* of some false positives that the analyst reviews. The false-positive suppression rules
* defined below (see isFalsePositive) handle the most common benign cases.
*/
// Severity tiers, strictest first. A string qualifies for a tier only when BOTH
// its entropy (bits/char) and its length meet that tier's minimums.
// The table is deep-frozen: it is shared module state, and an accidental write
// would silently change what the scanner reports.
const THRESHOLDS = Object.freeze({
  // Large random-looking blob: very likely an encoded/encrypted payload.
  CRITICAL: Object.freeze({ entropy: 5.4, minLen: 128 }),
  // Medium-sized high-entropy string: likely an encoded secret or payload fragment.
  HIGH: Object.freeze({ entropy: 5.1, minLen: 64 }),
  // Shorter elevated-entropy string: suspicious, but may be dense data/config.
  MEDIUM: Object.freeze({ entropy: 4.7, minLen: 40 }),
});
/**
 * Paths ending in a dependency lock-file name (or any `.lock` suffix).
 * Hex digests found in such files are treated as checksums, not payloads.
 */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;

/**
 * Whole-word keywords that, when present on a source line, indicate the line
 * carries an integrity hash rather than an encoded payload.
 */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;

/** Leading algorithm tag of a Subresource Integrity value (`sha256-…`, `sha384-…`, `sha512-…`). */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;

/**
 * Leading base64 characters of common embedded media/font payloads.
 * A candidate string that starts with one of these is assumed to be an asset.
 */
const DATA_URI_PREFIXES = [
  'iVBORw0KGgo', // PNG
  '/9j/', // JPEG
  'R0lGOD', // GIF
  'PHN2Zy', // SVG
  'AAABAA', // ICO
  'T2dnUw', // OGG (audio)
  'AAAAFGZ0', // MP4
  'UklGR', // WebP/RIFF
  'd09G', // WOFF font
  'AAEAAAALAAI', // TTF font
];

/**
 * Canonical hyphenated UUID shape (8-4-4-4-12 hex digits, any case).
 * The version/variant nibbles are not checked, so every UUID version matches.
 */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/**
 * A string made only of hex digits (case-insensitive), 32 to 128 characters long —
 * the size range of common digests (MD5 through SHA-512).
 */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------
/**
 * Matches a test/fixture marker only when it forms a whole path or filename
 * component, i.e. it is delimited by a path separator, `.`, `_`, `-`, or the
 * start/end of the path. Examples that match: `tests/`, `__tests__/`,
 * `foo.test.js`, `test_foo.py`, `fixtures/`, `__mocks__/`, `testdata/`.
 * Examples that deliberately do NOT match: `latest/`, `contest/`, `inspector.js`.
 */
const TEST_PATH_PATTERN = /(?:^|[/\\._-])(?:tests?|testdata|specs?|fixtures?|mocks?)(?=[/\\._-]|$)/i;

/**
 * Decide whether a candidate string should be suppressed (likely a false positive).
 *
 * Every rule is an independent "suppress if…" check; the first one that matches
 * returns true, so rule order does not affect the result.
 *
 * @param {string} str - The extracted string literal value
 * @param {string} line - The full source line it came from
 * @param {string} absPath - Absolute file path
 * @returns {boolean} - true if this string should be skipped
 */
function isFalsePositive(str, line, absPath) {
  // 1. URLs — entropy is misleading for long query strings / JWTs in URLs
  if (str.startsWith('http://') || str.startsWith('https://')) return true;

  // 2. File/system paths (POSIX absolute, relative, or Windows drive letter such as C:\)
  if (
    str.startsWith('/') ||
    str.startsWith('./') ||
    str.startsWith('../') ||
    /^[A-Za-z]:[/\\]/.test(str)
  ) return true;

  // 3. Hex digests inside lock files. (Hex digests on a line that mentions an
  //    integrity keyword are covered by rule 9, which applies to any string.)
  if (HEX_HASH_PATTERN.test(str) && LOCK_FILE_PATTERN.test(absPath)) return true;

  // 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
  //    Matched on component boundaries: a plain substring test would also
  //    suppress every finding under paths such as `latest/` or `inspector.js`.
  //    NOTE(review): this inspects the absolute path, so a marker in a parent
  //    directory outside the scanned tree still suppresses — confirm intended.
  if (TEST_PATH_PATTERN.test(absPath)) return true;

  // 5. UUID patterns
  if (UUID_PATTERN.test(str)) return true;

  // 6. CSS / SVG / font data URIs embedded in source
  if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;

  // 7. Import / require paths — the string is a module specifier, not a payload
  if (/^\s*import\s/i.test(line) || /\brequire\s*\(/i.test(line)) return true;

  // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
  if (SRI_PREFIX.test(str)) return true;

  // 9. Line-level integrity keyword context (catches SRI in HTML <link> / <script> tags)
  if (INTEGRITY_KEYWORDS.test(line)) return true;

  // 10. Base64 image/font content recognised by its leading characters
  //     (independent of the line-level data-URI check in rule 6)
  return DATA_URI_PREFIXES.some((prefix) => str.startsWith(prefix));
}
// ---------------------------------------------------------------------------
// Severity classification
// ---------------------------------------------------------------------------
/**
 * Map an (entropy, length) pair onto a severity tier.
 *
 * Tiers are tried from strictest to most lenient; the first tier whose entropy
 * AND minimum-length requirements are both met wins.
 *
 * @param {number} H - Shannon entropy
 * @param {number} len - String length
 * @returns {string|null} The matching severity, or null when no tier is met
 */
function classifyEntropy(H, len) {
  const tiersStrictestFirst = ['CRITICAL', 'HIGH', 'MEDIUM'];
  for (const tier of tiersStrictestFirst) {
    const { entropy, minLen } = THRESHOLDS[tier];
    const meetsTier = H >= entropy && len >= minLen;
    if (meetsTier) {
      return SEVERITY[tier];
    }
  }
  return null;
}
/**
 * Merge two severities, keeping the higher one.
 *
 * Ranking follows CRITICAL > HIGH > MEDIUM > LOW > INFO. A value that is not one
 * of those (null, undefined, or an unrecognised string) ranks below all of them,
 * so it can never displace a real severity. On a tie, `a` is returned.
 *
 * @param {string|null} a
 * @param {string|null} b
 * @returns {string|null}
 */
function maxSeverity(a, b) {
  const order = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
  const rank = (s) => {
    const idx = order.indexOf(s);
    // indexOf yields -1 for anything not in `order`; taken as-is, that would
    // sort unknown values ABOVE critical. Push them to the bottom instead.
    return idx === -1 ? Infinity : idx;
  };
  return rank(a) <= rank(b) ? a : b;
}
// ---------------------------------------------------------------------------
// Per-file scanning
// ---------------------------------------------------------------------------
/**
 * Scan a single file's content for high-entropy strings.
 *
 * For each line, candidate strings come from extractStringLiterals plus an
 * unquoted `key: value` (YAML-style) match. Each candidate is filtered through
 * isFalsePositive, scored with shannonEntropy, classified by classifyEntropy,
 * and floored at MEDIUM when it looks like a long base64 or hex blob.
 *
 * @param {string} content - File text content
 * @param {string} absPath - Absolute file path (for suppression checks)
 * @param {string} relPath - Relative path (for finding output)
 * @returns {object[]} - Array of finding objects
 */
function scanFileContent(content, absPath, relPath) {
  const findings = [];

  // Accept both LF and CRLF line endings. Splitting on '\n' alone leaves a
  // trailing '\r' on every line of a CRLF file, which stops the `$`-anchored
  // unquoted-YAML pattern below from ever matching.
  const lines = content.split(/\r?\n/);

  // De-duplicate on (line number, full string) so a value captured by both the
  // literal extractor and the unquoted-YAML match is reported once, while two
  // different strings on the same line are both reported even if they share a
  // common prefix.
  const seen = new Set();

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const lineNo = lineIdx + 1;

    // Copy the extractor's result so appending to it cannot touch an array the
    // helper may still own.
    const candidates = [...extractStringLiterals(line)];

    // Unquoted YAML-style values (`key: AQIB3j0...`, optionally followed by a
    // `# comment`) are not quoted literals, so pick them up separately.
    const unquotedYamlMatch = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
    if (unquotedYamlMatch) {
      candidates.push(unquotedYamlMatch[1]);
    }

    for (const str of candidates) {
      if (!str || str.length < 10) continue;

      // False positive suppression
      if (isFalsePositive(str, line, absPath)) continue;

      const H = shannonEntropy(str);
      const looksBase64 = isBase64Like(str);
      let severity = classifyEntropy(H, str.length);

      // Long base64-like (>100 chars) or hex (>64 chars) blobs are reported at
      // MEDIUM or above even when entropy alone did not reach a threshold.
      // maxSeverity keeps any higher entropy-derived severity unchanged.
      const isStructuredBlob =
        (looksBase64 && str.length > 100) ||
        (isHexBlob(str) && str.length > 64);
      if (isStructuredBlob) {
        severity = maxSeverity(severity, SEVERITY.MEDIUM);
      }

      if (severity === null) continue;

      // De-duplicate
      const key = `${lineNo}:${str}`;
      if (seen.has(key)) continue;
      seen.add(key);

      // OWASP mapping:
      //   - entropy at or above the CRITICAL threshold, or a base64-like string
      //     → treated as a possible injection payload → LLM01
      //   - anything else (e.g. hex blobs below that entropy) → LLM03
      const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || looksBase64;
      const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';

      // Only a redacted preview of the string goes into the report.
      const evidencePreview = redact(str, 8, 4);
      const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${evidencePreview}`;

      findings.push(
        finding({
          scanner: 'ENT',
          severity,
          title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
          description:
            `A string with unusually high Shannon entropy was detected. ` +
            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
          file: relPath,
          line: lineNo,
          evidence,
          owasp,
          recommendation:
            'Inspect this high-entropy string — it may contain an encoded payload, ' +
            'hardcoded secret, or obfuscated code',
        })
      );
    }
  }

  return findings;
}
// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------
/**
 * Scan a target path for high-entropy encoded strings.
 *
 * Reads every file listed in `discovery.files`, runs scanFileContent on each
 * readable one, and wraps the combined findings in a scanner result envelope.
 * Any exception raised while iterating is caught and reported as an 'error'
 * result that still carries the findings collected up to that point.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root).
 *   Not read by this function; the file list comes from `discovery`.
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   - Pre-computed file discovery result from the orchestrator
 * @returns {Promise<object>} - Scanner result envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      const content = await readTextFile(fileInfo.absPath);
      // readTextFile returns null for binary files or unreadable paths — skip silently
      if (content === null) continue;

      filesScanned++;
      const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
      // Append element by element. `push(...fileFindings)` would pass every
      // finding as a separate call argument, and a file with a very large
      // number of findings would exceed the engine's argument limit
      // (RangeError), turning the whole scan into an 'error' result.
      for (const fileFinding of fileFindings) {
        allFindings.push(fileFinding);
      }
    }

    return scannerResult('entropy-scanner', 'ok', allFindings, filesScanned, Date.now() - startMs);
  } catch (err) {
    return scannerResult(
      'entropy-scanner',
      'error',
      allFindings,
      filesScanned,
      Date.now() - startMs,
      String(err?.message || err)
    );
  }
}