feat(ultraplan-local): add plan-review-dedup helper for Phase 9 finding dedup

Step 5 of plan-v2 (ultra-pipeline-speedup).

lib/review/plan-review-dedup.mjs (NEW)
  Two-pass dedup:
    1. Exact match  — identical computeFindingId(file:line:rule_key) → merge.
    2. Jaccard ≥ 0.7 on text-token sets → merge near-duplicates.
  Provenance preserved in surviving finding's raised_by[] (which agents
  raised it). Reuses lib/parsers/jaccard.mjs + lib/parsers/finding-id.mjs.

  CLI shim:
    node lib/review/plan-review-dedup.mjs \
         --plan-critic /tmp/x.json --scope-guardian /tmp/y.json
  Missing inputs tolerated (single-agent review still works).

Tests: 10 (tokenize + threshold + 6 dedup-logic cases + 2 CLI shim).

[skip-docs]
This commit is contained in:
Kjell Tore Guttormsen 2026-05-04 06:30:28 +02:00
commit bed14eae4a
2 changed files with 299 additions and 0 deletions

View file

@@ -0,0 +1,165 @@
// lib/review/plan-review-dedup.mjs
// Phase-9 dedup helper for /ultraplan-local adversarial review:
// merges plan-critic + scope-guardian findings into a single deduplicated
// stream, preserving provenance (which agent originally raised each finding).
//
// Two dedup signals:
// 1. Exact match — identical computeFindingId(file:line:rule_key) → merge.
// 2. Jaccard ≥ 0.7 on text-token sets → merge (catches near-duplicates).
//
// Provenance is preserved on the surviving finding's `raised_by` array.
//
// CLI shim:
// node lib/review/plan-review-dedup.mjs \
// --plan-critic /tmp/x.json --scope-guardian /tmp/y.json
// → stdout: deduped JSON, exit 0 on success.
//
// Empty / missing inputs are tolerated (single-agent review still works).
import { readFileSync } from 'node:fs';
import { jaccardSimilarity, meetsThreshold } from '../parsers/jaccard.mjs';
import { computeFindingId } from '../parsers/finding-id.mjs';
// Jaccard similarity cutoff for pass-2 near-duplicate merging (0.7 per plan-v2 spec).
export const DEFAULT_THRESHOLD = 0.7;
/**
* Tokenize a finding's text for Jaccard comparison: lowercase, split on
* non-word, drop empties. Stable + deterministic.
*/
export function tokenize(text) {
if (typeof text !== 'string' || text.length === 0) return [];
return text.toLowerCase().split(/\W+/).filter(t => t.length > 0);
}
/**
 * Flatten one agent payload into an array of {agent, finding} pairs.
 * A null/undefined/non-object payload contributes nothing. A payload whose
 * `agent` field is not a non-empty string falls back to `fallbackAgent`.
 */
function normalizeAgentPayload(payload, fallbackAgent) {
  if (!payload || typeof payload !== 'object') return [];
  let agent = fallbackAgent;
  if (typeof payload.agent === 'string' && payload.agent.length > 0) {
    agent = payload.agent;
  }
  const pairs = [];
  if (Array.isArray(payload.findings)) {
    for (const finding of payload.findings) {
      pairs.push({ agent, finding });
    }
  }
  return pairs;
}
/**
 * Wrap a raw finding in the internal dedup shape: a stable id (via
 * computeFindingId over file:line:rule_key, with 'unknown'/0 fallbacks),
 * normalized fields, and a provenance list seeded with the raising agent.
 */
function annotate(finding, agent) {
  const file = finding.file ?? null;
  const line = finding.line ?? null;
  const ruleKey = finding.rule_key ?? null;
  const id = computeFindingId(
    String(file ?? 'unknown'),
    line ?? 0,
    String(ruleKey ?? 'unknown'),
  );
  return {
    id,
    file,
    line,
    rule_key: ruleKey,
    text: typeof finding.text === 'string' ? finding.text : '',
    raised_by: [agent],
  };
}
/**
 * Dedup an arbitrary collection of agent payloads.
 *
 * Pass 1 merges findings with an identical computeFindingId (file:line:rule_key).
 * Pass 2 merges survivors whose tokenized text meets the Jaccard threshold.
 * Provenance accumulates on the surviving finding's `raised_by` array.
 *
 * @param {Array<{agent: string, payload: object | null | undefined}>} sources
 * @param {{ threshold?: number }} [opts] - Jaccard cutoff; defaults to DEFAULT_THRESHOLD.
 * @returns {{
 *   findings: Array<object>,
 *   dedup_stats: { total_in: number, total_out: number,
 *                  exact_id_dups: number, jaccard_dups: number }
 * }}
 */
export function dedupFindings(sources, opts = {}) {
  const threshold = typeof opts.threshold === 'number' ? opts.threshold : DEFAULT_THRESHOLD;

  const incoming = [];
  for (const s of sources) {
    for (const pair of normalizeAgentPayload(s.payload, s.agent)) {
      incoming.push(annotate(pair.finding, pair.agent));
    }
  }
  const total_in = incoming.length;

  // Fold `src`'s provenance into `dst`, keeping each agent at most once.
  const mergeProvenance = (dst, src) => {
    for (const a of src.raised_by) {
      if (!dst.raised_by.includes(a)) dst.raised_by.push(a);
    }
  };

  // Pass 1 — exact id dedup.
  const byId = new Map();
  let exact_id_dups = 0;
  for (const f of incoming) {
    const existing = byId.get(f.id);
    if (existing) {
      mergeProvenance(existing, f);
      exact_id_dups += 1;
    } else {
      byId.set(f.id, f);
    }
  }

  // Pass 2 — Jaccard on text tokens; merge near-duplicates.
  // Survivor token sets are cached in a parallel array: the previous version
  // re-tokenized every survivor's text on every comparison (O(n²) tokenize
  // calls); caching makes tokenization O(n) without changing results.
  const survivors = [];
  const survivorTokens = [];
  let jaccard_dups = 0;
  for (const f of byId.values()) {
    const tokens = tokenize(f.text);
    let merged = false;
    for (let i = 0; i < survivors.length; i++) {
      const sim = jaccardSimilarity(tokens, survivorTokens[i]);
      if (meetsThreshold(sim, threshold)) {
        mergeProvenance(survivors[i], f);
        jaccard_dups += 1;
        merged = true;
        break;
      }
    }
    if (!merged) {
      survivors.push(f);
      survivorTokens.push(tokens);
    }
  }

  return {
    findings: survivors,
    dedup_stats: {
      total_in,
      total_out: survivors.length,
      exact_id_dups,
      jaccard_dups,
    },
  };
}
// ---- CLI shim ----------------------------------------------------------------
/**
 * Minimal flag parser for the CLI shim. Each recognized flag consumes the
 * next argv entry as its value; unrecognized tokens are skipped.
 */
function parseArgs(argv) {
  const parsed = {};
  let i = 0;
  while (i < argv.length) {
    switch (argv[i]) {
      case '--plan-critic':
        parsed.planCritic = argv[i + 1];
        i += 2;
        break;
      case '--scope-guardian':
        parsed.scopeGuardian = argv[i + 1];
        i += 2;
        break;
      case '--threshold':
        parsed.threshold = Number(argv[i + 1]);
        i += 2;
        break;
      default:
        i += 1;
    }
  }
  return parsed;
}
/**
 * Best-effort JSON loader: returns the parsed value, or null when the path
 * is falsy, the file is unreadable, or its contents are not valid JSON.
 */
function readJsonOrNull(path) {
  if (!path) return null;
  try {
    const raw = readFileSync(path, 'utf-8');
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
// Entry-point guard: run the shim only when this module is executed directly,
// never when it is imported by tests or other code.
if (import.meta.url === `file://${process.argv[1]}`) {
const args = parseArgs(process.argv.slice(2));
// Missing/unreadable files become null payloads; dedupFindings tolerates them,
// so single-agent (or zero-agent) invocations still succeed.
const sources = [
{ agent: 'plan-critic', payload: readJsonOrNull(args.planCritic) },
{ agent: 'scope-guardian', payload: readJsonOrNull(args.scopeGuardian) },
];
const opts = {};
// Only forward a finite --threshold; NaN (missing/garbled value) falls back
// to DEFAULT_THRESHOLD inside dedupFindings.
if (Number.isFinite(args.threshold)) opts.threshold = args.threshold;
const result = dedupFindings(sources, opts);
// Deduped JSON on stdout, exit 0 on this path.
process.stdout.write(JSON.stringify(result, null, 2) + '\n');
process.exit(0);
}

View file

@@ -0,0 +1,134 @@
// tests/lib/plan-review-dedup.test.mjs
// Cover lib/review/plan-review-dedup.mjs:
// - identical findings dedupe to 1 (exact-id path)
// - distinct findings stay separate
// - jaccard threshold 0.7 catches near-duplicates
// - empty / missing payloads tolerated
// - CLI shim emits parseable JSON on stdout
import { test } from 'node:test';
import { strict as assert } from 'node:assert';
import { execFileSync } from 'node:child_process';
import { writeFileSync, mkdtempSync, rmSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { tmpdir } from 'node:os';
import { fileURLToPath } from 'node:url';
import { dedupFindings, tokenize, DEFAULT_THRESHOLD } from '../../lib/review/plan-review-dedup.mjs';
// Directory of this test file, and the absolute path of the CLI shim under test
// (resolved relative to the repo layout: tests/lib/ → lib/review/).
const HERE = dirname(fileURLToPath(import.meta.url));
const SHIM = join(HERE, '..', '..', 'lib', 'review', 'plan-review-dedup.mjs');
/** Create a fresh, uniquely-named temp directory for CLI-shim fixtures. */
function tmp(prefix = 'plan-review-dedup-') {
  const template = join(tmpdir(), prefix);
  return mkdtempSync(template);
}
// tokenize: word-boundary split, lowercasing, and empty-input tolerance.
test('tokenize splits on non-word and lowercases', () => {
  const got = tokenize('Step 4 LACKS verifiable acceptance!');
  assert.deepEqual(got, ['step', '4', 'lacks', 'verifiable', 'acceptance']);
  for (const blank of ['', undefined]) {
    assert.deepEqual(tokenize(blank), []);
  }
});
// Pin the exported threshold constant to the value the plan-v2 spec mandates.
test('DEFAULT_THRESHOLD is 0.7 per plan-v2 spec', () => {
  const specValue = 0.7;
  assert.equal(DEFAULT_THRESHOLD, specValue);
});
// Exact-id path: both agents raise the byte-identical finding → one survivor
// carrying merged provenance.
test('identical findings (same file/line/rule_key) dedupe to 1, raised_by merged', () => {
  const shared = {
    file: 'plan.md',
    line: 42,
    rule_key: 'PC1',
    text: 'Step 4 lacks verifiable acceptance criteria',
  };
  const r = dedupFindings([
    { agent: 'plan-critic', payload: { agent: 'plan-critic', findings: [{ ...shared }] } },
    { agent: 'scope-guardian', payload: { agent: 'scope-guardian', findings: [{ ...shared }] } },
  ]);
  assert.equal(r.findings.length, 1);
  assert.deepEqual(r.findings[0].raised_by.sort(), ['plan-critic', 'scope-guardian']);
  assert.equal(r.dedup_stats.total_in, 2);
  assert.equal(r.dedup_stats.total_out, 1);
  assert.equal(r.dedup_stats.exact_id_dups, 1);
});
// Negative case: different ids AND dissimilar text → nothing merges.
test('distinct findings (different file/line/rule_key) stay separate', () => {
  const payload = {
    findings: [
      { file: 'plan.md', line: 10, rule_key: 'PC1', text: 'thing one' },
      { file: 'plan.md', line: 20, rule_key: 'PC2', text: 'thing two unrelated entirely' },
    ],
  };
  const r = dedupFindings([{ agent: 'plan-critic', payload }]);
  assert.equal(r.findings.length, 2);
  assert.equal(r.dedup_stats.exact_id_dups, 0);
  assert.equal(r.dedup_stats.jaccard_dups, 0);
});
// Jaccard path: same text but different line/rule_key, so the exact-id pass
// cannot catch it — the token-similarity pass must.
test('jaccard ≥ 0.7 on near-duplicate text merges (different file/line so id differs)', () => {
  const text = 'step lacks verifiable acceptance criteria for path A';
  const r = dedupFindings([
    { agent: 'plan-critic', payload: { findings: [{ file: 'plan.md', line: 10, rule_key: 'PC1', text }] } },
    { agent: 'scope-guardian', payload: { findings: [{ file: 'plan.md', line: 11, rule_key: 'SG1', text }] } },
  ]);
  assert.equal(r.findings.length, 1, 'jaccard merge should collapse near-duplicates');
  assert.deepEqual(r.findings[0].raised_by.sort(), ['plan-critic', 'scope-guardian']);
  assert.equal(r.dedup_stats.jaccard_dups, 1);
});
// Dissimilar text stays below the threshold → both findings survive.
test('jaccard below threshold keeps both findings separate', () => {
  const dbRisk = { file: 'a.md', line: 1, rule_key: 'PC1', text: 'database migration risk' };
  const uiPolish = { file: 'b.md', line: 2, rule_key: 'SG1', text: 'unrelated frontend hover state polish' };
  const r = dedupFindings([
    { agent: 'plan-critic', payload: { findings: [dbRisk] } },
    { agent: 'scope-guardian', payload: { findings: [uiPolish] } },
  ]);
  assert.equal(r.findings.length, 2);
  assert.equal(r.dedup_stats.jaccard_dups, 0);
});
// Single-agent review: a null payload from the other agent is simply skipped.
test('empty / missing payloads tolerated (single-agent input)', () => {
  const onlyFinding = { file: 'a.md', line: 1, rule_key: 'PC1', text: 'one' };
  const { findings } = dedupFindings([
    { agent: 'plan-critic', payload: { findings: [onlyFinding] } },
    { agent: 'scope-guardian', payload: null },
  ]);
  assert.equal(findings.length, 1);
  assert.deepEqual(findings[0].raised_by, ['plan-critic']);
});
// Degenerate input: no findings anywhere → empty output, zeroed stats.
test('all sources empty → empty result, dedup_stats zeros', () => {
  const sources = [
    { agent: 'plan-critic', payload: null },
    { agent: 'scope-guardian', payload: { findings: [] } },
  ];
  const { findings, dedup_stats } = dedupFindings(sources);
  assert.equal(findings.length, 0);
  assert.equal(dedup_stats.total_in, 0);
  assert.equal(dedup_stats.total_out, 0);
});
// End-to-end: spawn the shim as a child process with two fixture files whose
// findings share the same file/line/rule_key, then check the deduped JSON it
// prints to stdout.
test('CLI shim parses input files and emits valid deduped JSON', () => {
const dir = tmp();
try {
const planCritic = join(dir, 'pc.json');
const scopeGuardian = join(dir, 'sg.json');
// Both agents raise the byte-identical finding → exact-id merge to one.
writeFileSync(planCritic, JSON.stringify({
agent: 'plan-critic',
findings: [{ file: 'plan.md', line: 5, rule_key: 'PC1', text: 'duplicate finding shared by both' }],
}));
writeFileSync(scopeGuardian, JSON.stringify({
agent: 'scope-guardian',
findings: [{ file: 'plan.md', line: 5, rule_key: 'PC1', text: 'duplicate finding shared by both' }],
}));
// process.execPath is the node binary running this test — keeps the
// subprocess independent of PATH/version skew.
const out = execFileSync(process.execPath, [
SHIM, '--plan-critic', planCritic, '--scope-guardian', scopeGuardian,
], { encoding: 'utf-8' });
const parsed = JSON.parse(out);
assert.equal(parsed.findings.length, 1);
// Provenance from both agents must survive the merge.
assert.deepEqual(parsed.findings[0].raised_by.sort(), ['plan-critic', 'scope-guardian']);
assert.equal(parsed.dedup_stats.total_out, 1);
} finally {
// Clean up the fixture directory even when an assertion throws.
rmSync(dir, { recursive: true, force: true });
}
});
// Shim invoked with no flags at all: both payloads resolve to null, output is
// an empty-but-valid deduped document.
test('CLI shim tolerates missing input files (returns empty deduped JSON)', () => {
  const stdout = execFileSync(process.execPath, [SHIM], { encoding: 'utf-8' });
  const { findings, dedup_stats } = JSON.parse(stdout);
  assert.equal(findings.length, 0);
  assert.equal(dedup_stats.total_in, 0);
});