feat: initial open marketplace with llm-security, config-audit, ultraplan-local
This commit is contained in:
commit
f93d6abdae
380 changed files with 65935 additions and 0 deletions
329
plugins/llm-security/scanners/entropy-scanner.mjs
Normal file
329
plugins/llm-security/scanners/entropy-scanner.mjs
Normal file
|
|
@ -0,0 +1,329 @@
|
|||
// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
|
||||
// Zero dependencies (Node.js builtins only via lib helpers).
|
||||
//
|
||||
// Rationale: Malicious skills and MCP servers often hide injected instructions,
|
||||
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
|
||||
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
|
||||
//
|
||||
// References:
|
||||
// - OWASP LLM01 (Prompt Injection via encoded payloads)
|
||||
// - OWASP LLM03 (Supply Chain — obfuscated dependencies)
|
||||
// - ToxicSkills research: evasion via base64-wrapped instructions
|
||||
|
||||
import { readTextFile } from './lib/file-discovery.mjs';
|
||||
import { finding, scannerResult } from './lib/output.mjs';
|
||||
import { SEVERITY } from './lib/severity.mjs';
|
||||
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Entropy thresholds (bits/char). Empirically calibrated against real distributions:
 *
 *   Plaintext prose:        H ≈ 3.5–4.2  (len 20–50)
 *   Structured code/JSON:   H ≈ 3.9–4.4  (len 40–80)
 *   SQL queries:            H ≈ 4.2–4.5  (len 50–100)
 *   Base64 len=40:          H ≈ 4.4–5.2  (avg 4.8, p90 5.0)
 *   Base64 len=64:          H ≈ 4.9–5.4  (avg 5.2, p90 5.3)
 *   Base64 len=80:          H ≈ 5.0–5.6  (avg 5.3, p90 5.5)
 *   Base64 len=128:         H ≈ 5.4–5.8  (avg 5.6, p90 5.7)
 *
 * Key insight: the base64 alphabet has only 65 characters (64 symbols plus the
 * '=' padding) → max theoretical H = log2(65) ≈ 6.02. Random base64 of len 64
 * achieves H ≈ 5.2 on average. Thresholds must account for the length-dependent
 * entropy ceiling.
 *
 * Conservative design: prefer a low false-negative rate (catch real payloads) at
 * the cost of some false positives that the analyst reviews. The false-positive
 * suppression rules in isFalsePositive() handle the most common benign cases.
 *
 * Each tier requires BOTH a minimum entropy and a minimum string length. The
 * table is deep-frozen so no code path can loosen the thresholds at runtime.
 */
const THRESHOLDS = Object.freeze({
  // Large random-looking blob: very likely encoded/encrypted payload
  CRITICAL: Object.freeze({ entropy: 5.4, minLen: 128 }),
  // Medium-sized high-entropy string: likely encoded secret or payload fragment
  HIGH: Object.freeze({ entropy: 5.1, minLen: 64 }),
  // Shorter elevated-entropy string: suspicious but may be dense data/config
  MEDIUM: Object.freeze({ entropy: 4.7, minLen: 40 }),
});
|
||||
|
||||
/**
 * File names that are lock files (package-lock.json, yarn.lock, pnpm-lock.yaml,
 * or anything ending in ".lock"). Hex digests inside these files are expected
 * and are suppressed as false positives.
 */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;

/** Line-level keywords that suggest integrity hashes rather than encoded payloads. */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;

/** Integrity hash value prefixes (SRI format); case-sensitive, anchored at the start. */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;

/**
 * Leading base64 characters of common image/audio/video/font payloads, used to
 * recognise embedded media. Frozen because it is shared module state.
 */
const DATA_URI_PREFIXES = Object.freeze([
  'iVBORw0KGgo', // PNG
  '/9j/', // JPEG
  'R0lGOD', // GIF
  'PHN2Zy', // SVG
  'AAABAA', // ICO
  'T2dnUw', // OGG (audio)
  'AAAAFGZ0', // MP4
  'UklGR', // WebP/RIFF
  'd09G', // WOFF font
  'AAEAAAALAAI', // TTF font
]);

/**
 * Canonical 8-4-4-4-12 UUID, case-insensitive. The version and variant nibbles
 * are not constrained, so this matches UUIDs of any version, not only v4.
 */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/**
 * A string made up solely of 32–128 hex digits (upper or lower case), i.e. the
 * shape of a hash digest rather than obfuscated code.
 */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// False-positive suppression helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Path components that mark a file as test/fixture material. The keyword must
 * be a whole component delimited by a path separator, '.', '_' or '-' (or the
 * string boundary), so "tests/", "foo.spec.js" and "__mocks__/" match while
 * "latest/", "inspector.mjs" or "contest/" do not.
 */
const TEST_PATH_PATTERN =
  /(?:^|[\\\/._-])(?:tests?|testdata|specs?|fixtures?|mocks?)(?=$|[\\\/._-])/i;

/**
 * Decide whether a candidate string should be suppressed (likely a false positive).
 *
 * The rules are independent; the first one that matches returns true.
 *
 * @param {string} str     - The extracted string literal value
 * @param {string} line    - The full source line it came from
 * @param {string} absPath - Absolute file path
 * @returns {boolean}      - true if this string should be skipped
 */
function isFalsePositive(str, line, absPath) {
  // 1. URLs — entropy is misleading for long query strings / JWTs in URLs
  if (str.startsWith('http://') || str.startsWith('https://')) return true;

  // 2. File/system paths
  // NOTE(review): base64 text may also begin with '/', so such blobs are skipped
  // by this rule — confirm that trade-off is intended.
  if (
    str.startsWith('/') ||
    str.startsWith('./') ||
    str.startsWith('../') ||
    /^[A-Za-z]:[/\\]/.test(str) // Windows drive letter, e.g. C:\
  ) return true;

  // 3. Known hash formats in lock/checksum contexts
  if (HEX_HASH_PATTERN.test(str)) {
    if (
      LOCK_FILE_PATTERN.test(absPath) ||
      INTEGRITY_KEYWORDS.test(line)
    ) return true;
  }

  // 4. Test/fixture files — intentionally contain example secrets, tokens, etc.
  //    Matched on whole path components only: a bare substring match would also
  //    silence every finding in paths such as ".../latest/..." or "inspector.mjs".
  if (TEST_PATH_PATTERN.test(absPath)) return true;

  // 5. UUID patterns
  if (UUID_PATTERN.test(str)) return true;

  // 6. CSS / SVG / font data URIs embedded in source
  if (/data:image\/|data:font\/|data:application\//i.test(line)) return true;

  // 7. Import / require paths — the string is a module specifier, not a payload
  if (
    /^\s*import\s/i.test(line) ||
    /\brequire\s*\(/i.test(line)
  ) return true;

  // 8. SRI integrity hash values (sha256-..., sha384-..., sha512-...)
  if (SRI_PREFIX.test(str)) return true;

  // 9. Line-level integrity keyword context (catches SRI in HTML <link> / <script> tags)
  if (INTEGRITY_KEYWORDS.test(line)) return true;

  // 10. Base64 media content recognised by its leading characters
  //     (separate from the data-URI line check in rule 6)
  for (const prefix of DATA_URI_PREFIXES) {
    if (str.startsWith(prefix)) return true;
  }

  return false;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Severity classification
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Map an (entropy, length) pair onto a severity tier.
 *
 * Tiers are tried from most to least severe; a tier applies only when both
 * its entropy floor and its minimum length are met.
 *
 * @param {number} H   - Shannon entropy
 * @param {number} len - String length
 * @returns {string|null} The first matching severity, or null when no tier applies
 */
function classifyEntropy(H, len) {
  const tiers = [
    [THRESHOLDS.CRITICAL, SEVERITY.CRITICAL],
    [THRESHOLDS.HIGH, SEVERITY.HIGH],
    [THRESHOLDS.MEDIUM, SEVERITY.MEDIUM],
  ];

  const match = tiers.find(([limit]) => H >= limit.entropy && len >= limit.minLen);
  return match ? match[1] : null;
}
|
||||
|
||||
/**
 * Merge two severities, keeping the higher one.
 *
 * Severities are ranked CRITICAL > HIGH > MEDIUM > LOW > INFO. `null` and any
 * value that is not one of those five rank below INFO, so an unrecognised
 * value can never displace a real severity. When both arguments rank equally,
 * `a` is returned.
 *
 * @param {string|null} a
 * @param {string|null} b
 * @returns {string|null}
 */
function maxSeverity(a, b) {
  const order = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
  const rank = (s) => {
    const idx = order.indexOf(s);
    // indexOf yields -1 for null/unknown values; treat that as "lowest", not "highest".
    return idx === -1 ? Infinity : idx;
  };
  return rank(a) <= rank(b) ? a : b;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-file scanning
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scan a single file's content for high-entropy strings.
 *
 * Each line is examined on its own. Candidates are the string literals returned
 * by extractStringLiterals() plus, for `key: value` lines, an unquoted value of
 * 20+ base64-alphabet characters. A candidate produces a finding when it is not
 * suppressed by isFalsePositive() and either classifyEntropy() assigns it a
 * severity or it is a long base64-like / hex blob.
 *
 * @param {string} content - File text content
 * @param {string} absPath - Absolute file path (for suppression checks)
 * @param {string} relPath - Relative path (for finding output)
 * @returns {object[]}     - Array of finding objects
 */
function scanFileContent(content, absPath, relPath) {
  const findings = [];

  // Accept LF and CRLF line endings. A stray trailing '\r' would stop the
  // `$`-anchored unquoted-YAML pattern below from ever matching.
  const lines = content.split(/\r?\n/);

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const lineNo = lineIdx + 1;

    // Copy so the extractor's return value is never mutated by the push below.
    const candidates = [...extractStringLiterals(line)];

    // Unquoted YAML-style values (key: AQIB3j0...) are not quoted literals, so
    // pick them up separately; a trailing `# comment` is tolerated.
    const unquotedYamlMatch = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})(?:\s*#.*)?$/);
    if (unquotedYamlMatch) {
      candidates.push(unquotedYamlMatch[1]);
    }

    // De-duplicate on the full string per line: the same value can arrive from
    // both extraction paths, but distinct values that merely share a prefix
    // (e.g. two JWTs with the same header) must each be reported.
    const reportedOnLine = new Set();

    for (const str of candidates) {
      if (!str || str.length < 10) continue;

      // False positive suppression
      if (isFalsePositive(str, line, absPath)) continue;

      const H = shannonEntropy(str);
      const base64Like = isBase64Like(str);
      let severity = classifyEntropy(H, str.length);

      // Long base64-like (>100) or hex (>64) blobs get at least MEDIUM even when
      // entropy alone did not trigger; an existing severity is never lowered.
      const isStructuredBlob =
        (base64Like && str.length > 100) ||
        (isHexBlob(str) && str.length > 64);
      if (isStructuredBlob) {
        severity = maxSeverity(severity, SEVERITY.MEDIUM);
      }

      if (severity === null) continue;

      if (reportedOnLine.has(str)) continue;
      reportedOnLine.add(str);

      // OWASP mapping:
      //   - entropy at or above the CRITICAL threshold, or base64-like content
      //     → likely injection payload → LLM01
      //   - anything else (e.g. hex blobs) → supply-chain obfuscation → LLM03
      const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || base64Like;
      const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';

      // Only a redacted preview of the string is placed in the evidence.
      const evidencePreview = redact(str, 8, 4);
      const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${evidencePreview}`;

      findings.push(
        finding({
          scanner: 'ENT',
          severity,
          title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
          description:
            `A string with unusually high Shannon entropy was detected. ` +
            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
          file: relPath,
          line: lineNo,
          evidence,
          owasp,
          recommendation:
            'Inspect this high-entropy string — it may contain an encoded payload, ' +
            'hardcoded secret, or obfuscated code',
        })
      );
    }
  }

  return findings;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public scanner entry point
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scan a target path for high-entropy encoded strings.
 *
 * Files are read one at a time. A file for which readTextFile() yields null is
 * skipped and not counted. Any exception ends the scan and is reported as an
 * 'error' result that still carries the findings collected up to that point.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root);
 *   not read by this function, the file list comes from `discovery`
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   - Pre-computed file discovery result from the orchestrator
 * @returns {Promise<object>} - Scanner result envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      const content = await readTextFile(fileInfo.absPath);

      // readTextFile returns null for binary files or unreadable paths — skip silently
      if (content === null) continue;

      filesScanned++;

      // Append one at a time: `push(...array)` passes every element as a call
      // argument and throws a RangeError once a single file yields enough
      // findings, which would abort the scan of all remaining files.
      for (const fileFinding of scanFileContent(content, fileInfo.absPath, fileInfo.relPath)) {
        allFindings.push(fileFinding);
      }
    }

    const durationMs = Date.now() - startMs;
    return scannerResult('entropy-scanner', 'ok', allFindings, filesScanned, durationMs);
  } catch (err) {
    const durationMs = Date.now() - startMs;
    return scannerResult(
      'entropy-scanner',
      'error',
      allFindings,
      filesScanned,
      durationMs,
      String(err?.message || err)
    );
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue