// lib/review/plan-review-dedup.mjs
// Phase-9 dedup helper for /trekplan adversarial review:
// merges plan-critic + scope-guardian findings into a single deduplicated
// stream, preserving provenance (which agent originally raised each finding).
//
// Two dedup signals:
//   1. Exact match — identical computeFindingId(file:line:rule_key) → merge.
//   2. Jaccard ≥ 0.7 on text-token sets → merge (catches near-duplicates).
//
// Provenance is preserved on the surviving finding's `raised_by` array.
//
// CLI shim:
//   node lib/review/plan-review-dedup.mjs \
//     --plan-critic /tmp/x.json --scope-guardian /tmp/y.json
//   → stdout: deduped JSON, exit 0 on success.
//
// Empty / missing inputs are tolerated (single-agent review still works).

import { readFileSync, realpathSync } from 'node:fs';
import { pathToFileURL } from 'node:url';
import { jaccardSimilarity, meetsThreshold } from '../parsers/jaccard.mjs';
import { computeFindingId } from '../parsers/finding-id.mjs';

export const DEFAULT_THRESHOLD = 0.7;

/**
 * Tokenize a finding's text for Jaccard comparison: lowercase, split on
 * runs of non-word characters, drop empty tokens. Deterministic.
 *
 * @param {unknown} text - Finding text; anything that is not a non-empty
 *   string yields an empty token list.
 * @returns {string[]} Lowercased tokens in order of appearance (duplicates kept).
 */
export function tokenize(text) {
  if (typeof text !== 'string' || text.length === 0) return [];
  return text
    .toLowerCase()
    .split(/\W+/)
    .filter((token) => token.length > 0);
}

/**
 * Normalize a single agent payload into an array of {agent, finding} pairs.
 * Tolerates a missing or non-object payload (returns []).
 *
 * @param {object | null | undefined} payload - Parsed agent output; expected
 *   to carry an optional `agent` string and a `findings` array.
 * @param {string} fallbackAgent - Agent name used when the payload does not
 *   name itself with a non-empty string.
 * @returns {Array<{agent: string, finding: object}>}
 */
function normalizeAgentPayload(payload, fallbackAgent) {
  if (!payload || typeof payload !== 'object') return [];
  const hasOwnName = typeof payload.agent === 'string' && payload.agent.length > 0;
  const agent = hasOwnName ? payload.agent : fallbackAgent;
  if (!Array.isArray(payload.findings)) return [];
  // Drop null / primitive entries: annotate() reads properties off each
  // finding and would otherwise throw on a malformed element.
  return payload.findings
    .filter((finding) => finding !== null && typeof finding === 'object')
    .map((finding) => ({ agent, finding }));
}

/**
 * Project a raw finding onto the output shape and attach its dedup id and
 * initial provenance.
 *
 * The id is computed from stringified file / rule_key and the line, with
 * 'unknown' / 0 substituted for missing parts, while the emitted fields keep
 * `null` for anything the agent did not supply.
 *
 * @param {object} finding - Raw finding from an agent payload.
 * @param {string} agent - Name of the agent that raised it.
 * @returns {{id: *, file: *, line: *, rule_key: *, text: string, raised_by: string[]}}
 */
function annotate(finding, agent) {
  const id = computeFindingId(
    String(finding.file ?? 'unknown'),
    finding.line ?? 0,
    String(finding.rule_key ?? 'unknown'),
  );
  return {
    id,
    file: finding.file ?? null,
    line: finding.line ?? null,
    rule_key: finding.rule_key ?? null,
    text: typeof finding.text === 'string' ? finding.text : '',
    raised_by: [agent],
  };
}

/**
 * Fold the provenance of a dropped duplicate into the finding that survives,
 * keeping each agent name at most once and preserving first-seen order.
 *
 * @param {{raised_by: string[]}} survivor - Finding that is kept (mutated).
 * @param {{raised_by: string[]}} duplicate - Finding being merged away.
 */
function mergeProvenance(survivor, duplicate) {
  for (const agent of duplicate.raised_by) {
    if (!survivor.raised_by.includes(agent)) survivor.raised_by.push(agent);
  }
}

/**
 * Dedup an arbitrary collection of agent payloads.
 *
 * Pass 1 merges findings whose computed ids are identical. Pass 2 walks the
 * remaining findings in order and merges each one into the first earlier
 * survivor whose token similarity satisfies `meetsThreshold`. The first
 * occurrence always survives; later duplicates only contribute provenance.
 *
 * @param {Array<{agent: string, payload: object | null | undefined}>} sources
 * @param {{ threshold?: number }} [opts] - `threshold` overrides
 *   DEFAULT_THRESHOLD; a non-number or NaN falls back to the default.
 * @returns {{
 *   findings: Array,
 *   dedup_stats: { total_in: number, total_out: number,
 *                  exact_id_dups: number, jaccard_dups: number }
 * }}
 */
export function dedupFindings(sources, opts = {}) {
  const requested = opts?.threshold;
  // NaN is typeof 'number' but can never be met by any similarity, which
  // would silently disable near-duplicate merging — treat it as "not given".
  const threshold =
    typeof requested === 'number' && !Number.isNaN(requested)
      ? requested
      : DEFAULT_THRESHOLD;

  const incoming = [];
  for (const source of Array.isArray(sources) ? sources : []) {
    for (const { finding, agent } of normalizeAgentPayload(source?.payload, source?.agent)) {
      incoming.push(annotate(finding, agent));
    }
  }
  const total_in = incoming.length;

  // Pass 1 — exact id dedup
  const byId = new Map();
  let exact_id_dups = 0;
  for (const candidate of incoming) {
    const existing = byId.get(candidate.id);
    if (existing) {
      mergeProvenance(existing, candidate);
      exact_id_dups += 1;
    } else {
      byId.set(candidate.id, candidate);
    }
  }

  // Pass 2 — jaccard on text tokens; merge near-duplicates
  const survivors = [];
  // Token lists of `survivors`, index-aligned, computed once per survivor
  // instead of once per comparison.
  const survivorTokens = [];
  let jaccard_dups = 0;
  for (const candidate of byId.values()) {
    const tokens = tokenize(candidate.text);
    // A finding without text carries no evidence of textual similarity, so it
    // is never compared: otherwise two text-less findings with different ids
    // could be merged if the similarity helper scores two empty sets as equal.
    const matchIndex =
      tokens.length === 0
        ? -1
        : survivorTokens.findIndex(
            (other) =>
              other.length > 0 &&
              meetsThreshold(jaccardSimilarity(tokens, other), threshold),
          );
    if (matchIndex === -1) {
      survivors.push(candidate);
      survivorTokens.push(tokens);
    } else {
      mergeProvenance(survivors[matchIndex], candidate);
      jaccard_dups += 1;
    }
  }

  return {
    findings: survivors,
    dedup_stats: {
      total_in,
      total_out: survivors.length,
      exact_id_dups,
      jaccard_dups,
    },
  };
}

// ---- CLI shim ----------------------------------------------------------------

/**
 * Parse the CLI flags understood by the shim. Unknown arguments are ignored.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{planCritic?: string, scopeGuardian?: string, threshold?: number}}
 *   `threshold` is NaN when the flag's value is missing or not numeric; the
 *   caller filters that out with Number.isFinite.
 */
function parseArgs(argv) {
  const out = {};
  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--plan-critic':
        out.planCritic = argv[++i];
        break;
      case '--scope-guardian':
        out.scopeGuardian = argv[++i];
        break;
      case '--threshold':
        out.threshold = Number(argv[++i]);
        break;
      default:
        break;
    }
  }
  return out;
}

/**
 * Read and parse a JSON file, returning null when there is nothing usable.
 *
 * Missing paths, missing files and empty files are tolerated silently (a
 * single-agent review is a supported mode). Any other failure — unreadable
 * file, malformed JSON — still yields null so the run continues, but is
 * reported on stderr so a corrupted agent output is not mistaken for "agent
 * raised nothing". Stdout is never touched here.
 *
 * @param {string | undefined} path - File to read.
 * @returns {*} Parsed JSON value, or null.
 */
function readJsonOrNull(path) {
  if (!path) return null;
  try {
    const raw = readFileSync(path, 'utf-8');
    if (raw.trim().length === 0) return null;
    return JSON.parse(raw);
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      process.stderr.write(
        `plan-review-dedup: ignoring unusable input ${path}: ${err?.message ?? err}\n`,
      );
    }
    return null;
  }
}

/**
 * Tell whether this module is the script Node was launched with.
 *
 * Compares properly encoded file URLs rather than gluing 'file://' onto the
 * raw path, which breaks for paths needing percent-encoding and on Windows.
 * Both the path as given and its symlink-resolved form are tried.
 *
 * @returns {boolean}
 */
function isMainModule() {
  const entry = process.argv[1];
  if (!entry) return false;
  const candidates = [entry];
  try {
    candidates.push(realpathSync(entry));
  } catch {
    // Entry path cannot be resolved; fall back to the path as given.
  }
  return candidates.some((path) => pathToFileURL(path).href === import.meta.url);
}

if (isMainModule()) {
  const args = parseArgs(process.argv.slice(2));
  const sources = [
    { agent: 'plan-critic', payload: readJsonOrNull(args.planCritic) },
    { agent: 'scope-guardian', payload: readJsonOrNull(args.scopeGuardian) },
  ];
  const opts = {};
  if (Number.isFinite(args.threshold)) opts.threshold = args.threshold;
  const result = dedupFindings(sources, opts);
  process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
  // Set the exit code instead of calling process.exit(): exiting immediately
  // can cut off stdout that has not been flushed yet when it is asynchronous.
  process.exitCode = 0;
}