test(ultraplan-local): add review determinism integration test

3 integration tests using the run-A/run-B fixtures:
- Jaccard(A, B) ≥ 0.70 (SC4 brief threshold)
- IDs match 40-char hex shape (lib/parsers/finding-id.mjs format)
- no duplicate IDs within a single run

Tests the Jaccard PIPELINE; real-LLM determinism deferred to v1.1.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Kjell Tore Guttormsen 2026-05-01 17:21:42 +02:00
commit b69fdea883

View file

@@ -0,0 +1,69 @@
// tests/lib/review-determinism.test.mjs
// SC4 determinism floor — Jaccard pipeline test.
//
// Reads two synthetic review-run fixtures (A ⊂ B), parses their findings
// arrays from frontmatter, and asserts:
// 1. Jaccard(A, B) ≥ 0.70 (the SC4 brief threshold)
// 2. every finding-ID is 40-char hex (matches lib/parsers/finding-id.mjs format)
// 3. no duplicate IDs within either run
//
// This test exercises the Jaccard PIPELINE on a known input. It does NOT
// measure real-LLM determinism — that is deferred to v1.1, see
// tests/fixtures/ultrareview/README.md.
import { test } from 'node:test';
import { strict as assert } from 'node:assert';
import { readFileSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
import { parseDocument } from '../../lib/util/frontmatter.mjs';
// --- Expectations checked by the tests below ---

// Minimum Jaccard similarity the two fixture runs must reach (SC4 brief threshold).
const SC4_THRESHOLD = 0.70;

// A finding-ID must be exactly 40 lowercase hexadecimal characters.
const HEX_ID_RE = /^[0-9a-f]{40}$/;

// --- Filesystem anchors ---

// Directory that contains this test module.
const HERE = dirname(fileURLToPath(import.meta.url));

// Two directory levels above HERE; fixture paths are joined onto this base.
const ROOT = join(HERE, '..', '..');
/**
 * Read a fixture file and return the `findings` value from its frontmatter.
 *
 * @param {string} rel - Fixture path, resolved relative to ROOT.
 * @returns {Array} The `frontmatter.findings` array of the parsed document.
 * @throws {AssertionError} When the parsed document is not flagged valid, or
 *   when `frontmatter.findings` is not an array.
 */
function loadFindings(rel) {
  const doc = parseDocument(readFileSync(join(ROOT, rel), 'utf-8'));

  // Collect parser error messages up front so the failure text is self-explanatory.
  const parseErrors = (doc.errors || []).map((err) => err.message).join(', ');
  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${parseErrors}`);

  // A missing frontmatter object falls through to the array check below.
  const findings = doc.parsed.frontmatter?.findings;
  assert.ok(Array.isArray(findings), `frontmatter.findings of ${rel} is not an array`);

  return findings;
}
// Pipeline check: the similarity score of the two fixture runs must not fall
// below SC4_THRESHOLD.
test('review determinism — Jaccard of fixture run-A vs run-B meets SC4 threshold (0.70)', () => {
  const [runA, runB] = [
    'tests/fixtures/ultrareview/review-run-A.md',
    'tests/fixtures/ultrareview/review-run-B.md',
  ].map((rel) => loadFindings(rel));

  const score = jaccardSimilarity(runA, runB);

  // Two-part message: the observed numbers, then a hint on how to repair the fixtures.
  const failureMessage = [
    `Jaccard(A, B) = ${score} < ${SC4_THRESHOLD} (SC4 threshold).`,
    `Fixtures may have drifted — recompute IDs via lib/parsers/finding-id.mjs.`,
  ].join(' ');

  assert.ok(score >= SC4_THRESHOLD, failureMessage);
});
// Shape check: every entry in both fixtures must be a string matching HEX_ID_RE.
test('review determinism — finding IDs are 40-char hex', () => {
  const fixtures = [
    'tests/fixtures/ultrareview/review-run-A.md',
    'tests/fixtures/ultrareview/review-run-B.md',
  ];

  fixtures.forEach((rel) => {
    for (const id of loadFindings(rel)) {
      // Non-string entries are rejected before the regex is consulted.
      const isHexId = typeof id === 'string' && HEX_ID_RE.test(id);
      assert.ok(isHexId, `${rel}: ID ${JSON.stringify(id)} is not a 40-char lowercase hex string`);
    }
  });
});
// Uniqueness check: within each fixture, the number of distinct IDs must equal
// the number of entries.
test('review determinism — no duplicate IDs within run', () => {
  const fixtures = [
    'tests/fixtures/ultrareview/review-run-A.md',
    'tests/fixtures/ultrareview/review-run-B.md',
  ];

  for (const rel of fixtures) {
    const findings = loadFindings(rel);
    const total = findings.length;
    // A Set collapses repeated entries, so its size is the distinct-ID count.
    const uniqueCount = new Set(findings).size;

    assert.strictEqual(
      uniqueCount,
      total,
      `${rel}: contains duplicate finding-IDs (${total} entries vs ${uniqueCount} unique)`,
    );
  }
});