ktg-plugin-marketplace/plugins/llm-security/scanners/entropy-scanner.mjs

// entropy-scanner.mjs — Detects encoded payloads via Shannon entropy analysis
// Zero dependencies (Node.js builtins only via lib helpers).
//
// Rationale: Malicious skills and MCP servers often hide injected instructions,
// exfiltration endpoints, or obfuscated scripts in high-entropy encoded blobs
// (base64, hex, AES-encrypted payloads). This scanner flags those blobs for review.
//
// References:
//   - OWASP LLM01 (Prompt Injection via encoded payloads)
//   - OWASP LLM03 (Supply Chain — obfuscated dependencies)
//   - ToxicSkills research: evasion via base64-wrapped instructions

import { existsSync } from 'node:fs';
import { join } from 'node:path';
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { shannonEntropy, extractStringLiterals, isBase64Like, isHexBlob, redact } from './lib/string-utils.mjs';
import { loadPolicy } from './lib/policy-loader.mjs';

// ---------------------------------------------------------------------------
// File-extension suppression (context-aware, v7.0.0+)
// ---------------------------------------------------------------------------

/**
 * File extensions the entropy scanner skips without reading the file.
 *
 * Shader sources, stylesheets and SVG markup are dominated by benign
 * high-entropy text; scanning them buried real results in false positives
 * (about 70% FP was observed on the hyperframes renderer codebase).
 *
 * Entries are lowercase and include the leading dot.
 */
const ENTROPY_SKIP_EXTENSIONS = new Set([
  // GPU shader sources
  '.glsl',
  '.frag',
  '.vert',
  '.shader',
  '.wgsl',
  // Stylesheets
  '.css',
  '.scss',
  '.sass',
  '.less',
  // SVG markup
  '.svg',
]);

/**
 * Extension-based skip check. A file is skipped when it is a minified bundle
 * (`.min.js` / `.min.css`), when its extension is in the built-in skip list,
 * or when its extension was added through user policy. All comparisons are
 * case-insensitive.
 *
 * @param {{ relPath: string, ext: string }} fileInfo
 * @returns {boolean} true if the file should be skipped entirely
 */
function shouldSkipByExtension(fileInfo) {
  // Minified bundles are recognised by the compound suffix of the relative
  // path, because their plain extension (.js/.css) is not distinctive enough.
  const relLower = (fileInfo.relPath || '').toLowerCase();
  const isMinifiedBundle = ['.min.js', '.min.css'].some((suffix) => relLower.endsWith(suffix));
  if (isMinifiedBundle) return true;

  const extLower = (fileInfo.ext || '').toLowerCase();
  return ENTROPY_SKIP_EXTENSIONS.has(extLower) || USER_SUPPRESS_EXTENSIONS.has(extLower);
}

/**
 * Path-based skip check driven by user policy. Each policy entry is treated
 * as a plain substring of the file's relative path; entries that are not
 * non-empty strings are ignored.
 *
 * @param {{ relPath: string }} fileInfo
 * @returns {boolean} true if the file's relative path matches any user-policy skip-path substring.
 */
function shouldSkipByPath(fileInfo) {
  if (USER_SUPPRESS_PATHS.length === 0) return false;

  const relPath = fileInfo.relPath || '';
  return USER_SUPPRESS_PATHS.some(
    (fragment) => typeof fragment === 'string' && fragment.length > 0 && relPath.includes(fragment),
  );
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/**
 * Built-in severity tiers. Each tier pairs a minimum Shannon entropy
 * (bits/char) with a minimum string length; both must be met.
 *
 * Calibration data the defaults were tuned against:
 *
 *   Plaintext prose        H ≈ 3.5–4.2   (len 20–50)
 *   Structured code/JSON   H ≈ 3.9–4.4   (len 40–80)
 *   SQL queries            H ≈ 4.2–4.5   (len 50–100)
 *   Base64, len 40         H ≈ 4.4–5.2   (avg 4.8, p90 5.0)
 *   Base64, len 64         H ≈ 4.9–5.4   (avg 5.2, p90 5.3)
 *   Base64, len 80         H ≈ 5.0–5.6   (avg 5.3, p90 5.5)
 *   Base64, len 128        H ≈ 5.4–5.8   (avg 5.6, p90 5.7)
 *
 * Why entropy is paired with length: the base64 alphabet has only 65
 * characters, so the theoretical ceiling is log2(65) ≈ 6.02, and short
 * strings fall well below it (random base64 of length 64 averages H ≈ 5.2).
 * A single entropy cut-off would therefore miss short payloads or over-flag
 * long ones.
 *
 * The defaults are deliberately conservative: they favour a low
 * false-negative rate and accept some false positives for analyst review.
 * The most common benign cases are filtered by the suppression rules in this
 * file (extension/path skips and isFalsePositive).
 */
const DEFAULT_THRESHOLDS = {
  // Large random-looking blob: very likely an encoded/encrypted payload.
  CRITICAL: {
    entropy: 5.4,
    minLen: 128,
  },
  // Medium-sized high-entropy string: likely an encoded secret or payload fragment.
  HIGH: {
    entropy: 5.1,
    minLen: 64,
  },
  // Shorter elevated-entropy string: suspicious, but may be dense data/config.
  MEDIUM: {
    entropy: 4.7,
    minLen: 40,
  },
};

/**
 * Return `value` as a usable numeric threshold, or `fallback` when it is not one.
 * Numeric strings (e.g. "5.2") are converted so hand-edited policies keep working.
 *
 * @param {unknown} value
 * @param {number}  fallback
 * @returns {number}
 */
function coerceThreshold(value, fallback) {
  const numeric = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
  return typeof numeric === 'number' && Number.isFinite(numeric) ? numeric : fallback;
}

/**
 * Merge policy.entropy.thresholds over defaults. Policy keys are lowercase
 * (critical/high/medium) to match other policy sections; defaults use uppercase
 * internally.
 *
 * `entropy` and `minLen` are validated per tier: anything that is not a finite
 * number (null, NaN, non-numeric strings, objects, ...) falls back to the
 * built-in default. Without this, a `null` in policy.json would compare as 0
 * and flag every string, and a non-numeric value would silently disable the
 * tier.
 *
 * @param {object|undefined} policyThresholds
 * @returns {typeof DEFAULT_THRESHOLDS} the defaults object itself when no
 *   policy thresholds are given, otherwise a fresh merged object
 */
function resolveThresholds(policyThresholds) {
  if (!policyThresholds) return DEFAULT_THRESHOLDS;

  const mergeTier = (defaults, override) => {
    const merged = { ...defaults, ...(override || {}) };
    merged.entropy = coerceThreshold(merged.entropy, defaults.entropy);
    merged.minLen = coerceThreshold(merged.minLen, defaults.minLen);
    return merged;
  };

  return {
    CRITICAL: mergeTier(DEFAULT_THRESHOLDS.CRITICAL, policyThresholds.critical),
    HIGH:     mergeTier(DEFAULT_THRESHOLDS.HIGH,     policyThresholds.high),
    MEDIUM:   mergeTier(DEFAULT_THRESHOLDS.MEDIUM,   policyThresholds.medium),
  };
}

// Per-scan settings. These four bindings are module-level mutable state:
// scan() reassigns all of them at entry (from the loaded policy, or back to
// the defaults if loading fails) and the helper functions read them directly.

// Effective thresholds after policy-merge (set at scan() entry, read by classifyEntropy).
// Holds the built-in defaults until the first scan() call replaces it.
let THRESHOLDS = DEFAULT_THRESHOLDS;

/** User-extensible line-level regex patterns compiled from policy. Set per scan; read by isFalsePositive. */
let USER_SUPPRESS_LINE_PATTERNS = [];

/** User-extensible relative-path substrings to skip entirely. Set per scan; read by shouldSkipByPath. */
let USER_SUPPRESS_PATHS = [];

/**
 * User-extensible extension suppress list. Set per scan (entries are
 * lowercased there) and checked by shouldSkipByExtension in addition to the
 * built-in ENTROPY_SKIP_EXTENSIONS; the two sets are kept separate.
 */
let USER_SUPPRESS_EXTENSIONS = new Set();

/** Known hash/checksum filename patterns — false positive suppression. */
const LOCK_FILE_PATTERN = /(?:package-lock\.json|yarn\.lock|pnpm-lock\.yaml|\.lock)$/i;

/** Line-level keywords that suggest integrity hashes rather than encoded payloads. */
const INTEGRITY_KEYWORDS = /\b(?:integrity|checksum|sha256|sha384|sha512|sha1|md5)\b/i;

/** Integrity hash value prefixes (SRI format). */
const SRI_PREFIX = /^(?:sha256-|sha384-|sha512-)/;

/** Known base64 image/font data-URI prefixes. */
const DATA_URI_PREFIXES = [
  'iVBORw0KGgo',  // PNG
  '/9j/',          // JPEG
  'R0lGOD',       // GIF
  'PHN2Zy',       // SVG
  'AAABAA',       // ICO
  'T2dnUw',       // OGG (audio)
  'AAAAFGZ0',     // MP4
  'UklGR',        // WebP/RIFF
  'd09G',         // WOFF font
  'AAEAAAALAAI',  // TTF font
];

/** UUID v4 pattern for false positive suppression. */
const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

/** Pure lowercase hex that could be a hash digest (not obfuscated code). */
const HEX_HASH_PATTERN = /^[a-f0-9]{32,128}$/i;

/** GLSL/WGSL shader keywords — suppress inline shader source (gl_Position, vec3, uniform, ...). */
const GLSL_KEYWORDS = /\b(?:gl_(?:Position|FragColor|FragCoord|PointSize|PointCoord)|vec[234]|mat[234]|uniform|varying|attribute|precision\s+(?:high|medium|low)p|smoothstep|mix|clamp|texture2D|textureCube|sampler[123]D)\b/;

/** CSS-in-JS patterns (styled-components, emotion, vanilla-extract, @keyframes). */
const CSS_IN_JS_PATTERN = /\b(?:styled\.[a-z]+|css)\s*`|@(?:keyframes|media|supports)\s|:\s*(?:hover|focus|active|before|after|visited|root)\b/;

/** Inline HTML/SVG markup in source (tags with attributes on the same line). */
const INLINE_MARKUP = /<(?:svg|path|defs|g\s|rect\s|circle\s|polygon|polyline|ellipse|line\s|use\s|symbol\s|clipPath|linearGradient|radialGradient|div\s+[a-z-]+|span\s+[a-z-]+|style>|script>|template\s)/i;

/** ffmpeg filter-graph syntax (stream selectors + filter chains). */
const FFMPEG_SYNTAX = /\[\d+:[avs]\]|(?:scale|crop|concat|overlay|psnr|drawtext|setpts|atempo|filter_complex|format|pad|trim|setdar|setsar)\s*=/;

/** Browser User-Agent strings (hardcoded in source — long but structured, not encoded). */
const USER_AGENT_PATTERN = /Mozilla\/\d|AppleWebKit|Chrome\/\d+|Safari\/\d+|Firefox\/\d+|Edg\/\d+|OPR\/\d+/;

/** SQL DDL/DML statements (long structured strings, not encoded payloads). */
const SQL_STATEMENT = /^\s*(?:SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|WITH|DROP|TRUNCATE|GRANT|REVOKE)\s+/i;

/** Error-message templates with embedded HTML/markup (throw new Error("<div>...</div>")). */
const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxError)|new\s+Error\s*\()\s*[`'"]/;

/**
 * Markdown image syntax with external URL — `![alt](https://cdn.../hash.ext)`.
 * Common in JSON data indexes / article metadata; CDN URL hash segments
 * produce high Shannon entropy but are not credentials.
 */
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//;

// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------

/**
 * Heuristic filter for benign high-entropy strings. Returns true as soon as
 * any suppression rule applies. The rules are independent of one another and
 * fall into three groups: the shape of the candidate string, the location of
 * the file, and the content of the surrounding source line.
 *
 * @param {string}  str      - The extracted string literal value
 * @param {string}  line     - The full source line it came from
 * @param {string}  absPath  - Absolute file path
 * @returns {boolean}        - true if this string should be skipped
 */
function isFalsePositive(str, line, absPath) {
  // --- Shape of the candidate itself ---------------------------------------

  // URLs: long query strings and embedded JWTs make entropy misleading.
  const isUrl = ['http://', 'https://'].some((scheme) => str.startsWith(scheme));
  // Filesystem paths: POSIX absolute/relative, or a Windows drive letter (C:\).
  const isFsPath =
    ['/', './', '../'].some((lead) => str.startsWith(lead)) || /^[A-Za-z]:[/\\]/.test(str);
  if (isUrl || isFsPath) return true;

  // UUIDs and SRI integrity values (sha256-..., sha384-..., sha512-...).
  if (UUID_PATTERN.test(str) || SRI_PREFIX.test(str)) return true;

  // Base64 that begins like a known image/audio/font payload.
  if (DATA_URI_PREFIXES.some((magic) => str.startsWith(magic))) return true;

  // Bare hex digest inside a lock file or next to an integrity keyword.
  const inHashContext = LOCK_FILE_PATTERN.test(absPath) || INTEGRITY_KEYWORDS.test(line);
  if (HEX_HASH_PATTERN.test(str) && inHashContext) return true;

  // --- Location of the file -------------------------------------------------

  // Test/fixture files intentionally contain example secrets and tokens.
  // NOTE(review): this is a plain substring test on the ABSOLUTE path, so it
  // also fires for names like "latest" or "inspect" and for any parent
  // directory containing "test"/"mock"/... — confirm that breadth is intended.
  if (/(?:test|spec|fixture|mock|__test__|__spec__)/i.test(absPath)) return true;

  // --- Content of the surrounding line --------------------------------------

  return (
    // data: URIs embedded in source (CSS / SVG / fonts)
    /data:image\/|data:font\/|data:application\//i.test(line) ||
    // import / require lines: the string is a module specifier, not a payload
    /^\s*import\s/i.test(line) ||
    /\brequire\s*\(/i.test(line) ||
    // integrity keyword anywhere on the line (SRI in <link>/<script> tags)
    INTEGRITY_KEYWORDS.test(line) ||
    // inline shader source
    GLSL_KEYWORDS.test(line) ||
    // styled-components, emotion, vanilla-extract
    CSS_IN_JS_PATTERN.test(line) ||
    // React/Vue components, email templates
    INLINE_MARKUP.test(line) ||
    // ffmpeg filter graphs: long and structured, not encoded
    FFMPEG_SYNTAX.test(line) ||
    // hardcoded browser User-Agent strings
    USER_AGENT_PATTERN.test(line) ||
    // SQL DDL/DML lines
    SQL_STATEMENT.test(line) ||
    // error-message templates (throw new Error("<html>...</html>"))
    ERROR_TEMPLATE.test(line) ||
    // markdown images pointing at external URLs (CDN hash noise)
    MARKDOWN_IMAGE.test(line) ||
    // user-policy regex patterns from .llm-security/policy.json
    USER_SUPPRESS_LINE_PATTERNS.some((pattern) => pattern.test(line))
  );
}

/**
 * Turn a list of regex source strings into RegExp objects (no flags).
 * Policy handling is best-effort: a non-array input yields an empty list, and
 * entries that are not non-empty strings or that fail to compile are dropped
 * without raising.
 *
 * @param {string[]} sources
 * @returns {RegExp[]}
 */
function compilePatterns(sources) {
  if (!Array.isArray(sources)) return [];

  return sources
    .filter((src) => typeof src === 'string' && src.length > 0)
    .flatMap((src) => {
      try {
        return [new RegExp(src)];
      } catch {
        // Malformed regex in policy: drop it and keep the rest.
        return [];
      }
    });
}

// ---------------------------------------------------------------------------
// Severity classification
// ---------------------------------------------------------------------------

/**
 * Map an (entropy, length) pair onto a severity using the effective
 * thresholds. Tiers are tried from most to least severe and the first tier
 * whose entropy AND length minimums are both met wins.
 *
 * @param {number} H   - Shannon entropy
 * @param {number} len - String length
 * @returns {string|null} the matching severity, or null if below all thresholds
 */
function classifyEntropy(H, len) {
  const meets = (tier) => H >= tier.entropy && len >= tier.minLen;

  if (meets(THRESHOLDS.CRITICAL)) return SEVERITY.CRITICAL;
  if (meets(THRESHOLDS.HIGH)) return SEVERITY.HIGH;
  return meets(THRESHOLDS.MEDIUM) ? SEVERITY.MEDIUM : null;
}

/**
 * Merge two severities, keeping the higher one.
 *
 * `null`, `undefined` and any label that is not a known SEVERITY value rank
 * below every real severity, so a missing or malformed value can never
 * displace a real one. When both arguments rank equally, `a` is returned.
 *
 * @param {string|null} a
 * @param {string|null} b
 * @returns {string|null}
 */
function maxSeverity(a, b) {
  const order = [SEVERITY.CRITICAL, SEVERITY.HIGH, SEVERITY.MEDIUM, SEVERITY.LOW, SEVERITY.INFO];
  const rank = (s) => {
    const position = order.indexOf(s);
    // indexOf yields -1 for unknown values; left as-is, that would outrank CRITICAL.
    return position === -1 ? Infinity : position;
  };
  return rank(a) <= rank(b) ? a : b;
}

// ---------------------------------------------------------------------------
// Per-file scanning
// ---------------------------------------------------------------------------

/**
 * Scan a single file's content for high-entropy strings.
 *
 * Each line is examined on its own. Candidates are the string literals
 * returned by extractStringLiterals() plus, for `key: value` lines, an
 * unquoted value made of base64-alphabet characters (20+ chars). Candidates
 * shorter than 10 characters or matched by isFalsePositive() are dropped; the
 * rest are classified by entropy/length, with long base64-like or hex blobs
 * reported at MEDIUM or above even when entropy alone stays under every
 * threshold.
 *
 * @param {string}   content  - File text content
 * @param {string}   absPath  - Absolute file path (for suppression checks)
 * @param {string}   relPath  - Relative path (for finding output)
 * @returns {object[]}        - Array of finding objects
 */
function scanFileContent(content, absPath, relPath) {
  const findings = [];

  // Accept both LF and CRLF files. Splitting on '\n' alone leaves a trailing
  // '\r' on every line of a CRLF file, which defeats the `$`-anchored
  // unquoted-YAML pattern below and hides those values from the scanner.
  const lines = content.split(/\r?\n/);

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const lineNo = lineIdx + 1;

    // Quoted literals come from the shared extractor; copy its result so the
    // extra candidate below is never pushed into an array we do not own.
    const candidates = [...extractStringLiterals(line)];

    // Unquoted YAML-style values (`key: AQIB3j0...`) are invisible to quote
    // matching. Trailing whitespace and an optional `# comment` are tolerated.
    const unquotedYamlMatch = line.match(/^\s*\w[\w.-]*\s*:\s*([A-Za-z0-9+/=]{20,})\s*(?:#.*)?$/);
    if (unquotedYamlMatch) {
      candidates.push(unquotedYamlMatch[1]);
    }

    // Report each distinct string once per line. The FULL string is the key:
    // keying on a short prefix would merge different blobs that merely share
    // their first characters (e.g. two JWTs with the same header).
    const seenOnLine = new Set();

    for (const str of candidates) {
      if (!str || str.length < 10) continue;
      if (seenOnLine.has(str)) continue;
      seenOnLine.add(str);

      // False positive suppression
      if (isFalsePositive(str, line, absPath)) continue;

      const H = shannonEntropy(str);
      const base64Like = isBase64Like(str);
      let severity = classifyEntropy(H, str.length);

      // Structured encodings can sit slightly below the entropy thresholds
      // yet still be suspicious at length: long base64-like (>100) or hex
      // (>64) blobs are reported at MEDIUM or higher.
      const isStructuredBlob =
        (base64Like && str.length > 100) || (isHexBlob(str) && str.length > 64);
      if (isStructuredBlob) {
        severity = severity === null ? SEVERITY.MEDIUM : maxSeverity(severity, SEVERITY.MEDIUM);
      }

      if (severity === null) continue;

      // OWASP mapping:
      //   - entropy at/above the CRITICAL threshold, or base64-like content,
      //     is treated as a likely injection payload → LLM01
      //   - everything else (e.g. hex blobs, supply-chain obfuscation) → LLM03
      const isLikelyPayload = H >= THRESHOLDS.CRITICAL.entropy || base64Like;
      const owasp = isLikelyPayload ? 'LLM01' : 'LLM03';

      // Only a redacted preview of the string goes into the report.
      const evidencePreview = redact(str, 8, 4);
      const evidence = `H=${H.toFixed(2)}, len=${str.length}: ${evidencePreview}`;

      findings.push(
        finding({
          scanner: 'ENT',
          severity,
          title: `High-entropy string (H=${H.toFixed(2)}, len=${str.length})`,
          description:
            `A string with unusually high Shannon entropy was detected. ` +
            `High entropy (H>=${THRESHOLDS.MEDIUM.entropy}) in strings of this length ` +
            `is characteristic of base64-encoded payloads, AES-encrypted blobs, ` +
            `hardcoded secrets, or obfuscated instructions embedded in code or config.`,
          file: relPath,
          line: lineNo,
          evidence,
          owasp,
          recommendation:
            'Inspect this high-entropy string — it may contain an encoded payload, ' +
            'hardcoded secret, or obfuscated code',
        })
      );
    }
  }

  return findings;
}

// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------

/**
 * Scan a target path for high-entropy encoded strings.
 *
 * Flow: load the target's entropy policy into the module-level settings, walk
 * `discovery.files`, skip files by extension and by user-policy path, scan
 * the remaining text files with scanFileContent(), and return a scanner
 * result envelope that also carries calibration stats.
 *
 * The effective policy lives in module-level variables that are reassigned on
 * every call, and file reads are awaited inside the loop, so overlapping
 * scan() calls would observe each other's settings — run scans one at a time.
 *
 * NOTE(review): the policy appears to be read from inside the scanned target
 * (`<target>/.llm-security/policy.json`); confirm that a hostile target
 * shipping its own suppressions is an accepted risk.
 *
 * @param {string}  targetPath  - Absolute path to scan (file or directory root)
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   - Pre-computed file discovery result from the orchestrator
 * @returns {Promise<object>}   - Scanner result envelope. If the file loop
 *   throws, the envelope has status 'error', the error message, and whatever
 *   findings were collected so far (no calibration block).
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  // Load policy for this target and apply overrides to module-level state.
  // Best-effort — on any error we fall back to built-in defaults. Provenance
  // is tracked via a file-existence check, not by comparing merged values
  // (defaults always include an entropy section, so a value-based check would
  // always report 'policy.json').
  let policySource = 'defaults';
  try {
    if (existsSync(join(targetPath, '.llm-security', 'policy.json'))) {
      policySource = 'policy.json';
    }
    const policy = loadPolicy(targetPath);
    const ent = policy?.entropy || {};
    THRESHOLDS = resolveThresholds(ent.thresholds);
    USER_SUPPRESS_LINE_PATTERNS = compilePatterns(ent.suppress_line_patterns);
    USER_SUPPRESS_PATHS = Array.isArray(ent.suppress_paths) ? ent.suppress_paths.slice() : [];
    USER_SUPPRESS_EXTENSIONS = new Set(
      (Array.isArray(ent.suppress_extensions) ? ent.suppress_extensions : [])
        .filter((e) => typeof e === 'string')
        .map((e) => e.toLowerCase())
        // File extensions are compared with their leading dot (the built-in
        // list uses '.svg', not 'svg'), so accept policy entries written
        // without it instead of silently never matching them.
        .map((e) => (e === '' || e.startsWith('.') ? e : `.${e}`)),
    );
  } catch {
    THRESHOLDS = DEFAULT_THRESHOLDS;
    USER_SUPPRESS_LINE_PATTERNS = [];
    USER_SUPPRESS_PATHS = [];
    USER_SUPPRESS_EXTENSIONS = new Set();
    policySource = 'defaults';
  }

  let filesSkippedByExtension = 0;
  let filesSkippedByPath = 0;

  try {
    for (const fileInfo of discovery.files) {
      // Context-aware skip: GPU shaders, stylesheets, SVG, minified bundles.
      // These file types produce ~70% false-positive rate on real codebases.
      if (shouldSkipByExtension(fileInfo)) {
        filesSkippedByExtension++;
        continue;
      }

      // User-policy path-substring skip (additive, for project-specific noise).
      if (shouldSkipByPath(fileInfo)) {
        filesSkippedByPath++;
        continue;
      }

      const content = await readTextFile(fileInfo.absPath);

      // readTextFile returns null for binary files or unreadable paths — skip silently
      if (content === null) continue;

      filesScanned++;

      const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
      // Append one by one: `push(...fileFindings)` passes every element as a
      // call argument and throws RangeError once a single file yields more
      // findings than the engine's argument limit allows.
      for (const fileFinding of fileFindings) {
        allFindings.push(fileFinding);
      }
    }

    const durationMs = Date.now() - startMs;
    const result = scannerResult('entropy-scanner', 'ok', allFindings, filesScanned, durationMs);
    // Calibration stats for synthesizer — suppression & policy provenance.
    result.calibration = {
      files_skipped_by_extension: filesSkippedByExtension,
      files_skipped_by_path: filesSkippedByPath,
      skip_extensions: [...ENTROPY_SKIP_EXTENSIONS, '.min.js', '.min.css'],
      policy_source: policySource,
      thresholds: {
        critical: { entropy: THRESHOLDS.CRITICAL.entropy, minLen: THRESHOLDS.CRITICAL.minLen },
        high:     { entropy: THRESHOLDS.HIGH.entropy,     minLen: THRESHOLDS.HIGH.minLen     },
        medium:   { entropy: THRESHOLDS.MEDIUM.entropy,   minLen: THRESHOLDS.MEDIUM.minLen   },
      },
    };
    return result;
  } catch (err) {
    const durationMs = Date.now() - startMs;
    return scannerResult(
      'entropy-scanner',
      'error',
      allFindings,
      filesScanned,
      durationMs,
      String(err?.message || err)
    );
  }
}