From b69fdea88392449364eab31bf6255191c30390f1 Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Fri, 1 May 2026 17:21:42 +0200 Subject: [PATCH] test(ultraplan-local): add review determinism integration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3 integration tests using the run-A/run-B fixtures: - Jaccard(A, B) ≥ 0.70 (SC4 brief threshold) - IDs match 40-char hex shape (lib/parsers/finding-id.mjs format) - no duplicate IDs within a single run Tests the Jaccard PIPELINE; real-LLM determinism deferred to v1.1. Co-Authored-By: Claude Opus 4.7 --- .../tests/lib/review-determinism.test.mjs | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 plugins/ultraplan-local/tests/lib/review-determinism.test.mjs diff --git a/plugins/ultraplan-local/tests/lib/review-determinism.test.mjs b/plugins/ultraplan-local/tests/lib/review-determinism.test.mjs new file mode 100644 index 0000000..5d96921 --- /dev/null +++ b/plugins/ultraplan-local/tests/lib/review-determinism.test.mjs @@ -0,0 +1,69 @@ +// tests/lib/review-determinism.test.mjs +// SC4 determinism floor — Jaccard pipeline test. +// +// Reads two synthetic review-run fixtures (A ⊂ B), parses their findings +// arrays from frontmatter, and asserts: +// 1. Jaccard(A, B) ≥ 0.70 (the SC4 brief threshold) +// 2. every finding-ID is 40-char hex (matches lib/parsers/finding-id.mjs format) +// 3. no duplicate IDs within either run +// +// This test exercises the Jaccard PIPELINE on a known input. It does NOT +// measure real-LLM determinism — that is deferred to v1.1, see +// tests/fixtures/ultrareview/README.md. 
import { test } from 'node:test';
import { strict as assert } from 'node:assert';
import { readFileSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
import { parseDocument } from '../../lib/util/frontmatter.mjs';

// Directory containing this test file, and the directory two levels above it
// against which all fixture paths below are resolved.
const HERE = dirname(fileURLToPath(import.meta.url));
const ROOT = join(HERE, '..', '..');

// A finding-ID must be exactly 40 lowercase hexadecimal characters.
const HEX_ID_RE = /^[0-9a-f]{40}$/;
// Minimum acceptable Jaccard similarity between the two fixture runs.
const SC4_THRESHOLD = 0.70;

// The two review-run fixtures, relative to ROOT. Declared once so the three
// tests cannot drift apart on which files they check.
const RUN_A = 'tests/fixtures/ultrareview/review-run-A.md';
const RUN_B = 'tests/fixtures/ultrareview/review-run-B.md';
const RUNS = [RUN_A, RUN_B];

/**
 * Read a fixture file and return the `findings` array from its frontmatter.
 *
 * @param {string} rel - Fixture path relative to ROOT.
 * @returns {Array} The non-empty `findings` array found in the frontmatter.
 * @throws {assert.AssertionError} If the parsed document is not flagged valid,
 *   if `frontmatter.findings` is not an array, or if that array is empty.
 */
function loadFindings(rel) {
  const text = readFileSync(join(ROOT, rel), 'utf-8');
  const doc = parseDocument(text);
  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${(doc.errors || []).map(e => e.message).join(', ')}`);
  const findings = doc.parsed.frontmatter?.findings;
  assert.ok(Array.isArray(findings), `frontmatter.findings of ${rel} is not an array`);
  // Without this guard an emptied fixture would let the per-ID shape check and
  // the duplicate check pass without inspecting a single ID.
  assert.ok(findings.length > 0, `frontmatter.findings of ${rel} is empty`);
  return findings;
}

test('review determinism — Jaccard of fixture run-A vs run-B meets SC4 threshold (0.70)', () => {
  const a = loadFindings(RUN_A);
  const b = loadFindings(RUN_B);
  const jaccard = jaccardSimilarity(a, b);
  assert.ok(
    jaccard >= SC4_THRESHOLD,
    `Jaccard(A, B) = ${jaccard} < ${SC4_THRESHOLD} (SC4 threshold). ` +
      `Fixtures may have drifted — recompute IDs via lib/parsers/finding-id.mjs.`,
  );
});

test('review determinism — finding IDs are 40-char hex', () => {
  for (const rel of RUNS) {
    for (const id of loadFindings(rel)) {
      assert.ok(
        typeof id === 'string' && HEX_ID_RE.test(id),
        `${rel}: ID ${JSON.stringify(id)} is not a 40-char lowercase hex string`,
      );
    }
  }
});

test('review determinism — no duplicate IDs within run', () => {
  for (const rel of RUNS) {
    const findings = loadFindings(rel);
    // Build the set once; it is needed for both the comparison and the message.
    const uniqueCount = new Set(findings).size;
    assert.strictEqual(
      uniqueCount,
      findings.length,
      `${rel}: contains duplicate finding-IDs (${findings.length} entries vs ${uniqueCount} unique)`,
    );
  }
});