feat(ultraplan-local): add plan-review-dedup helper for Phase 9 finding dedup
Step 5 of plan-v2 (ultra-pipeline-speedup).
lib/review/plan-review-dedup.mjs (NEW)
Two-pass dedup:
1. Exact match — identical computeFindingId(file:line:rule_key) → merge.
2. Jaccard ≥ 0.7 on text-token sets → merge near-duplicates.
Provenance preserved in surviving finding's raised_by[] (which agents
raised it). Reuses lib/parsers/jaccard.mjs + lib/parsers/finding-id.mjs.
CLI shim:
node lib/review/plan-review-dedup.mjs \
--plan-critic /tmp/x.json --scope-guardian /tmp/y.json
Missing inputs tolerated (single-agent review still works).
Tests: 10 (tokenize + threshold + 6 dedup-logic cases + 2 CLI shim).
[skip-docs]
This commit is contained in:
parent
645f01625b
commit
bed14eae4a
2 changed files with 299 additions and 0 deletions
165
plugins/ultraplan-local/lib/review/plan-review-dedup.mjs
Normal file
165
plugins/ultraplan-local/lib/review/plan-review-dedup.mjs
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
// lib/review/plan-review-dedup.mjs
|
||||
// Phase-9 dedup helper for /ultraplan-local adversarial review:
|
||||
// merges plan-critic + scope-guardian findings into a single deduplicated
|
||||
// stream, preserving provenance (which agent originally raised each finding).
|
||||
//
|
||||
// Two dedup signals:
|
||||
// 1. Exact match — identical computeFindingId(file:line:rule_key) → merge.
|
||||
// 2. Jaccard ≥ 0.7 on text-token sets → merge (catches near-duplicates).
|
||||
//
|
||||
// Provenance is preserved on the surviving finding's `raised_by` array.
|
||||
//
|
||||
// CLI shim:
|
||||
// node lib/review/plan-review-dedup.mjs \
|
||||
// --plan-critic /tmp/x.json --scope-guardian /tmp/y.json
|
||||
// → stdout: deduped JSON, exit 0 on success.
|
||||
//
|
||||
// Empty / missing inputs are tolerated (single-agent review still works).
|
||||
|
||||
import { readFileSync } from 'node:fs';
import { pathToFileURL } from 'node:url';
import { jaccardSimilarity, meetsThreshold } from '../parsers/jaccard.mjs';
import { computeFindingId } from '../parsers/finding-id.mjs';
|
||||
|
||||
export const DEFAULT_THRESHOLD = 0.7;
|
||||
|
||||
/**
 * Tokenize a finding's text for Jaccard comparison.
 *
 * The text is lowercased and split on runs of non-word characters; empty
 * fragments are dropped. "Word" characters are Unicode letters, digits,
 * combining marks and underscore, so non-ASCII text ("café", "ошибка") is
 * kept as whole tokens instead of being shredded or discarded. For pure-ASCII
 * input the result is identical to splitting on /\W+/.
 *
 * @param {unknown} text - Finding text; anything that is not a non-empty
 *   string yields an empty token list.
 * @returns {string[]} Lowercased tokens in order of appearance (duplicates
 *   are kept).
 */
export function tokenize(text) {
  if (typeof text !== 'string' || text.length === 0) return [];
  return text
    .toLowerCase()
    // \W without the `u` flag only knows [A-Za-z0-9_]; use Unicode property
    // escapes so letters outside ASCII are not treated as separators.
    .split(/[^\p{L}\p{N}\p{M}_]+/u)
    .filter((token) => token.length > 0);
}
|
||||
|
||||
/**
 * Flatten one agent payload into a list of {agent, finding} pairs.
 *
 * A payload that is not an object (missing file, null, primitive) or whose
 * `findings` is not an array contributes no pairs. The agent name comes from
 * a non-empty string `payload.agent`, otherwise from `fallbackAgent`.
 *
 * @param {unknown} payload - Parsed agent output, or null/undefined.
 * @param {string} fallbackAgent - Name used when the payload does not name
 *   its own agent.
 * @returns {Array<{agent: string, finding: unknown}>}
 */
function normalizeAgentPayload(payload, fallbackAgent) {
  if (payload === null || typeof payload !== 'object') {
    return [];
  }

  const { agent: declaredAgent, findings: rawFindings } = payload;
  if (!Array.isArray(rawFindings)) {
    return [];
  }

  const hasDeclaredAgent = typeof declaredAgent === 'string' && declaredAgent !== '';
  const owner = hasDeclaredAgent ? declaredAgent : fallbackAgent;

  return rawFindings.map((entry) => ({ agent: owner, finding: entry }));
}
|
||||
|
||||
/**
 * Project a raw finding onto the record shape used by the dedup passes.
 *
 * Missing `file` / `line` / `rule_key` are stored as null on the record, while
 * the id is computed from the placeholders 'unknown' / 0 / 'unknown' instead.
 * A non-string `text` is stored as ''. Only these fields are carried over;
 * any other properties on the raw finding are not copied.
 *
 * @param {object} finding - Raw finding as emitted by an agent.
 * @param {string} agent - Agent that raised the finding.
 * @returns {{id: *, file: *, line: *, rule_key: *, text: string, raised_by: string[]}}
 */
function annotate(finding, agent) {
  const file = finding.file ?? null;
  const line = finding.line ?? null;
  const ruleKey = finding.rule_key ?? null;
  const text = typeof finding.text === 'string' ? finding.text : '';

  const id = computeFindingId(
    String(file ?? 'unknown'),
    line ?? 0,
    String(ruleKey ?? 'unknown'),
  );

  // Key order is kept stable because the record is serialized by the CLI.
  return { id, file, line, rule_key: ruleKey, text, raised_by: [agent] };
}
|
||||
|
||||
/**
 * Dedup an arbitrary collection of agent payloads.
 *
 * Pass 1 merges findings whose computed id is identical. Pass 2 walks the
 * remaining findings in insertion order and merges each one into the first
 * earlier survivor whose text-token Jaccard similarity meets the threshold.
 * In both passes the first-seen finding survives and absorbs the other's
 * agents into its `raised_by` array (no duplicates).
 *
 * Two findings that both have no text tokens are never merged by pass 2:
 * comparing two empty token sets says nothing about whether they describe
 * the same problem, and they already differ in id.
 *
 * @param {Array<{agent: string, payload: object | null | undefined}>} sources
 *   Missing sources, missing entries and missing payloads contribute nothing.
 * @param {{ threshold?: number }} [opts]
 *   `threshold` overrides DEFAULT_THRESHOLD; a non-number or NaN is ignored.
 * @returns {{
 *   findings: Array<object>,
 *   dedup_stats: { total_in: number, total_out: number,
 *                  exact_id_dups: number, jaccard_dups: number }
 * }}
 */
export function dedupFindings(sources, opts = {}) {
  const requested = opts?.threshold;
  const threshold = typeof requested === 'number' && !Number.isNaN(requested)
    ? requested
    : DEFAULT_THRESHOLD;

  // Copy every agent into `target.raised_by` that is not already listed.
  const absorbAgents = (target, agents) => {
    for (const agent of agents) {
      if (!target.raised_by.includes(agent)) target.raised_by.push(agent);
    }
  };

  const incoming = [];
  for (const source of sources ?? []) {
    for (const pair of normalizeAgentPayload(source?.payload, source?.agent)) {
      incoming.push(annotate(pair.finding, pair.agent));
    }
  }

  const total_in = incoming.length;

  // Pass 1 — exact id dedup (Map keeps first-seen order for pass 2).
  const byId = new Map();
  let exact_id_dups = 0;
  for (const finding of incoming) {
    const existing = byId.get(finding.id);
    if (existing) {
      absorbAgents(existing, finding.raised_by);
      exact_id_dups += 1;
    } else {
      byId.set(finding.id, finding);
    }
  }

  // Pass 2 — jaccard on text tokens; merge near-duplicates.
  // Each survivor keeps its token list so its text is tokenized once rather
  // than once per later candidate.
  const kept = [];
  let jaccard_dups = 0;
  for (const candidate of byId.values()) {
    const tokens = tokenize(candidate.text);
    const match = kept.find((entry) => {
      // Empty-vs-empty carries no evidence of duplication; skip it.
      if (tokens.length === 0 && entry.tokens.length === 0) return false;
      return meetsThreshold(jaccardSimilarity(tokens, entry.tokens), threshold);
    });
    if (match) {
      absorbAgents(match.finding, candidate.raised_by);
      jaccard_dups += 1;
    } else {
      kept.push({ finding: candidate, tokens });
    }
  }

  const survivors = kept.map((entry) => entry.finding);

  return {
    findings: survivors,
    dedup_stats: {
      total_in,
      total_out: survivors.length,
      exact_id_dups,
      jaccard_dups,
    },
  };
}
|
||||
|
||||
// ---- CLI shim ----------------------------------------------------------------
|
||||
|
||||
/**
 * Parse the CLI shim's arguments.
 *
 * Recognized flags: --plan-critic <path>, --scope-guardian <path>,
 * --threshold <number>. Both `--flag value` and `--flag=value` are accepted.
 * Unknown arguments are ignored.
 *
 * A flag whose value is missing (end of argv, or immediately followed by
 * another `--flag`) is recorded as undefined (NaN for --threshold) and does
 * NOT consume the following flag, so
 * `--plan-critic --scope-guardian y.json` still picks up scope-guardian.
 * A blank --threshold value is NaN rather than 0.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{ planCritic?: string, scopeGuardian?: string, threshold?: number }}
 */
function parseArgs(argv) {
  const flagToKey = new Map([
    ['--plan-critic', 'planCritic'],
    ['--scope-guardian', 'scopeGuardian'],
    ['--threshold', 'threshold'],
  ]);

  const out = {};
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    const eq = arg.startsWith('--') ? arg.indexOf('=') : -1;
    const flag = eq === -1 ? arg : arg.slice(0, eq);
    const key = flagToKey.get(flag);
    if (key === undefined) continue;

    let value;
    if (eq !== -1) {
      value = arg.slice(eq + 1);
    } else if (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
      // Only consume the next argument when it is not itself a flag.
      value = argv[++i];
    }

    if (key === 'threshold') {
      const isBlank = value === undefined || value.trim() === '';
      out.threshold = isBlank ? Number.NaN : Number(value);
    } else {
      out[key] = value;
    }
  }
  return out;
}
|
||||
|
||||
/**
 * Read and parse a JSON file, tolerating absence.
 *
 * Returns null when no path is given, when the file does not exist, or when
 * it cannot be read or parsed. A missing file is the expected single-agent
 * case and stays silent. Any other failure (malformed JSON, permission
 * error, ...) still returns null so the run continues, but a one-line
 * warning goes to stderr so a dropped agent payload is not invisible.
 * stdout is never touched, keeping the CLI's JSON output clean.
 *
 * @param {string | undefined} path - File to read.
 * @returns {unknown | null} Parsed JSON value, or null.
 */
function readJsonOrNull(path) {
  if (!path) return null;
  try {
    return JSON.parse(readFileSync(path, 'utf-8'));
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      process.stderr.write(
        `plan-review-dedup: ignoring ${path}: ${err?.message ?? String(err)}\n`,
      );
    }
    return null;
  }
}
|
||||
|
||||
// Run the CLI only when this module is the process entry point (not when it
// is imported). The script path is converted with pathToFileURL so the
// comparison also holds for paths that need URL-encoding (spaces, '#', ...)
// and for Windows drive paths; a missing argv[1] simply means "not the CLI".
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  const args = parseArgs(process.argv.slice(2));
  const sources = [
    { agent: 'plan-critic', payload: readJsonOrNull(args.planCritic) },
    { agent: 'scope-guardian', payload: readJsonOrNull(args.scopeGuardian) },
  ];

  // Only forward a usable threshold; otherwise dedupFindings uses its default.
  const opts = {};
  if (Number.isFinite(args.threshold)) opts.threshold = args.threshold;

  const result = dedupFindings(sources, opts);
  process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);

  // Set the exit code instead of calling process.exit(): exiting immediately
  // can cut off stdout writes that are still pending when stdout is
  // asynchronous (e.g. a pipe on Windows).
  process.exitCode = 0;
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue