ktg-plugin-marketplace/plugins/llm-security/scanners/lib/diff-engine.mjs

276 lines
9.5 KiB
JavaScript

// diff-engine.mjs — Baseline storage, finding fingerprinting, and diff categorization.
// Compares scan results against a stored baseline to classify findings as:
// new — present in current scan, absent from baseline
// resolved — present in baseline, absent from current scan
// unchanged — matched between baseline and current (line drift ≤3)
// moved — same finding, different location (line drift >3).
//   Note: fingerprintFinding() hashes the file path, so a renamed file is reported as resolved + new, not moved.
// Zero external dependencies.
import { createHash } from 'node:crypto';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import { join, resolve } from 'node:path';
// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------
// Largest absolute line-number difference at which a same-file, same-fingerprint
// finding still counts as "unchanged"; a larger drift is classified as "moved".
const LINE_FUZZY_THRESHOLD = 3; // ±3 lines = unchanged, >3 = moved
// ---------------------------------------------------------------------------
// Target hashing — deterministic key for baseline storage
// ---------------------------------------------------------------------------
/**
 * Derive the baseline storage key for a scan target.
 * The path is resolved to an absolute path before hashing, so different
 * spellings of the same location (relative, containing `..`, ...) produce
 * the same key.
 * @param {string} targetPath - Path of the scanned target
 * @returns {string} First 12 hex characters of the SHA-256 of the resolved path
 */
export function targetHash(targetPath) {
  const absolutePath = resolve(targetPath);
  const hasher = createHash('sha256');
  hasher.update(absolutePath);
  const hexDigest = hasher.digest('hex');
  return hexDigest.substring(0, 12);
}
// ---------------------------------------------------------------------------
// Finding fingerprinting — identity that survives line drift
// ---------------------------------------------------------------------------
/**
 * Compute the identity fingerprint of a finding.
 * The fingerprint hashes scanner, file, title and evidence (in that order,
 * NUL-separated); the line number is deliberately left out so the identity
 * survives line drift. Missing or falsy fields contribute an empty string.
 * @param {object} finding - A finding object from output.mjs
 * @returns {string} First 16 hex characters of the SHA-256 of the joined fields
 */
export function fingerprintFinding(finding) {
  const { scanner, file, title, evidence } = finding;
  // Evidence is part of the identity: two findings in the same file with the
  // same title but different evidence are distinct findings.
  const identity = [scanner, file, title, evidence]
    .map((part) => part || '')
    .join('\x00');
  const hasher = createHash('sha256');
  hasher.update(identity);
  return hasher.digest('hex').slice(0, 16);
}
// ---------------------------------------------------------------------------
// Baseline I/O
// ---------------------------------------------------------------------------
/**
 * Build the path of the baseline file that belongs to a target.
 * The file name is the target's hash (see targetHash) with a `.json` suffix.
 * @param {string} baselinesDir - Path to reports/baselines/
 * @param {string} targetPath - The scanned target
 * @returns {string} Full path to baseline JSON file
 */
export function baselinePath(baselinesDir, targetPath) {
  const fileName = `${targetHash(targetPath)}.json`;
  return join(baselinesDir, fileName);
}
/**
 * Persist the current scan as the baseline for a target.
 * Creates the baselines directory when it does not exist yet, then writes a
 * compact JSON document (metadata, aggregate, fingerprinted findings) to the
 * file returned by baselinePath().
 * @param {string} baselinesDir - Path to reports/baselines/
 * @param {string} targetPath - The scanned target
 * @param {object} scanEnvelope - Full scan output envelope from scan-orchestrator
 * @returns {string} Path to saved baseline file
 */
export function saveBaseline(baselinesDir, targetPath, scanEnvelope) {
  const directoryMissing = !existsSync(baselinesDir);
  if (directoryMissing) {
    mkdirSync(baselinesDir, { recursive: true });
  }

  const destination = baselinePath(baselinesDir, targetPath);

  // Compact baseline: only the pieces stored below, not the whole envelope.
  const meta = {
    target: scanEnvelope.meta.target,
    timestamp: scanEnvelope.meta.timestamp,
    version: '1', // baseline format version
  };
  const payload = {
    meta,
    aggregate: scanEnvelope.aggregate,
    findings: extractFindings(scanEnvelope),
  };

  const serialized = `${JSON.stringify(payload, null, 2)}\n`;
  writeFileSync(destination, serialized);
  return destination;
}
/**
 * Load a baseline from disk.
 *
 * Best-effort by design: a baseline that is missing, unreadable, not valid
 * JSON, or that lacks the shape written by saveBaseline() (an object holding
 * a `meta` object and a `findings` array) is reported as "no baseline".
 * Callers therefore never receive a value whose `meta` / `findings` they
 * cannot safely read.
 * @param {string} baselinesDir - Path to reports/baselines/
 * @param {string} targetPath - The scanned target
 * @returns {object|null} Baseline object, or null if not found or unusable
 */
export function loadBaseline(baselinesDir, targetPath) {
  const filePath = baselinePath(baselinesDir, targetPath);
  if (!existsSync(filePath)) return null;

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(filePath, 'utf8'));
  } catch {
    // Unreadable or corrupt file: deliberately treated the same as "absent".
    return null;
  }

  // JSON.parse happily returns arrays, numbers, strings or null; only accept
  // the object shape that the diff code reads (`meta.timestamp`, `findings`).
  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);
  if (!isPlainObject(parsed) || !isPlainObject(parsed.meta) || !Array.isArray(parsed.findings)) {
    return null;
  }
  return parsed;
}
// ---------------------------------------------------------------------------
// Finding extraction — flatten all scanner results into fingerprinted list
// ---------------------------------------------------------------------------
/**
 * Extract all findings from a scan envelope, adding fingerprints.
 *
 * Walks every entry of `scanEnvelope.scanners` and flattens their `findings`
 * into one list. Scanner entries that are null/undefined or carry no
 * `findings` are skipped instead of aborting the whole extraction.
 * Optional fields that are missing or falsy are normalised to null.
 * @param {object} scanEnvelope - Envelope with an optional `scanners` map of { findings } results
 * @returns {object[]} Array of { fingerprint, scanner, severity, title, file, line, evidence, owasp, recommendation }
 */
export function extractFindings(scanEnvelope) {
  const extracted = [];
  for (const [scannerName, result] of Object.entries(scanEnvelope.scanners || {})) {
    // Defensive: `result` itself may be nullish; treat that like "no findings".
    const scannerFindings = result?.findings || [];
    // Used only when a finding does not name its scanner: first three
    // characters of the envelope key, upper-cased.
    const fallbackScanner = scannerName.toUpperCase().slice(0, 3);
    for (const f of scannerFindings) {
      extracted.push({
        fingerprint: fingerprintFinding(f),
        scanner: f.scanner || fallbackScanner,
        severity: f.severity,
        title: f.title,
        file: f.file || null,
        line: f.line || null,
        evidence: f.evidence || null,
        owasp: f.owasp || null,
        recommendation: f.recommendation || null,
      });
    }
  }
  return extracted;
}
// ---------------------------------------------------------------------------
// Diff algorithm
// ---------------------------------------------------------------------------
/**
 * Compare current scan findings against a baseline.
 *
 * Matching strategy (priority order; each step is applied to ALL current
 * findings before the next step starts):
 *   1. Exact: same fingerprint + same file + line within ±threshold → unchanged
 *   2. Moved: same fingerprint, paired with a baseline entry that is still
 *      unmatched after step 1 → moved
 *   3. Unmatched current findings → new
 *   4. Unmatched baseline findings → resolved
 *
 * Running step 1 to completion first matters when several findings share a
 * fingerprint: a newly added duplicate that appears earlier in the scan must
 * not consume the baseline entry that a later, untouched finding matches
 * exactly.
 *
 * Candidates are looked up by fingerprint only. Fingerprints produced by
 * fingerprintFinding() include the file path, so with those fingerprints the
 * file comparison in step 1 can only differ for caller-supplied fingerprints.
 *
 * The input arrays and the finding objects in them are not modified.
 *
 * @param {object[]} baselineFindings - From loadBaseline().findings; null/undefined is treated as empty
 * @param {object[]} currentFindings - From extractFindings(); null/undefined is treated as empty
 * @param {object} [options]
 * @param {number} [options.lineThreshold] - Max line drift still counted as
 *   unchanged; defaults to LINE_FUZZY_THRESHOLD
 * @returns {object} { new, resolved, unchanged, moved, summary }
 */
export function diffFindings(baselineFindings, currentFindings, options = {}) {
  const lineThreshold = options?.lineThreshold ?? LINE_FUZZY_THRESHOLD;
  const baselineList = baselineFindings ?? [];
  const currentList = currentFindings ?? [];

  // Index baseline findings by fingerprint for O(n) lookup.
  // Multiple findings can share a fingerprint (same pattern, different
  // locations), so each key maps to a list. The match flag lives on a wrapper
  // so the finding objects themselves stay untouched.
  const baselineByFp = new Map();
  for (const finding of baselineList) {
    const slot = { finding, matched: false };
    const bucket = baselineByFp.get(finding.fingerprint);
    if (bucket) {
      bucket.push(slot);
    } else {
      baselineByFp.set(finding.fingerprint, [slot]);
    }
  }

  const results = {
    new: [],
    resolved: [],
    unchanged: [],
    moved: [],
  };

  // Pass 1: exact matches (same file, line within threshold) for every
  // current finding. Anything without an exact partner waits for pass 2.
  const pending = [];
  for (const current of currentList) {
    const candidates = baselineByFp.get(current.fingerprint) ?? [];
    const exact = candidates.find(
      (slot) =>
        !slot.matched &&
        slot.finding.file === current.file &&
        isLineClose(slot.finding.line, current.line, lineThreshold),
    );
    if (exact) {
      exact.matched = true;
      results.unchanged.push({ ...current, baseline_line: exact.finding.line });
    } else {
      pending.push({ current, candidates });
    }
  }

  // Pass 2: leftovers pair with any still-unmatched baseline entry of the
  // same fingerprint (→ moved); with none left they are new.
  for (const { current, candidates } of pending) {
    const previous = candidates.find((slot) => !slot.matched);
    if (previous) {
      previous.matched = true;
      results.moved.push({
        ...current,
        previous_file: previous.finding.file,
        previous_line: previous.finding.line,
      });
    } else {
      results.new.push(current);
    }
  }

  // Pass 3: baseline entries nobody claimed are resolved (shallow copies).
  for (const bucket of baselineByFp.values()) {
    for (const slot of bucket) {
      if (!slot.matched) {
        results.resolved.push({ ...slot.finding });
      }
    }
  }

  // Summary
  results.summary = {
    new: results.new.length,
    resolved: results.resolved.length,
    unchanged: results.unchanged.length,
    moved: results.moved.length,
    total_current: currentList.length,
    total_baseline: baselineList.length,
    baseline_timestamp: null, // caller fills in
  };
  return results;
}

/**
 * Check if two line numbers are within the fuzzy threshold.
 * Null/undefined lines always match (some findings are file-level, not line-level).
 * @param {number|null} a
 * @param {number|null} b
 * @param {number} [threshold] - Max allowed absolute difference; defaults to LINE_FUZZY_THRESHOLD
 * @returns {boolean}
 */
function isLineClose(a, b, threshold = LINE_FUZZY_THRESHOLD) {
  if (a == null || b == null) return true;
  return Math.abs(a - b) <= threshold;
}
// ---------------------------------------------------------------------------
// High-level API — used by scan-orchestrator
// ---------------------------------------------------------------------------
/**
 * Run one complete diff cycle for a target: load its stored baseline,
 * extract the findings of the current scan, diff the two, and stamp the
 * baseline's timestamp into the diff summary.
 * @param {string} baselinesDir - Directory holding the baseline files
 * @param {string} targetPath - The scanned target
 * @param {object} scanEnvelope - Current scan results
 * @returns {object|null} Diff results with summary, or null when loadBaseline() yields no baseline
 */
export function diffAgainstBaseline(baselinesDir, targetPath, scanEnvelope) {
  const stored = loadBaseline(baselinesDir, targetPath);
  if (!stored) {
    return null;
  }

  const current = extractFindings(scanEnvelope);
  const result = diffFindings(stored.findings, current);
  // diffFindings leaves baseline_timestamp as null; fill it in from the baseline metadata.
  result.summary.baseline_timestamp = stored.meta.timestamp;
  return result;
}