ktg-plugin-marketplace/plugins/ultraplan-local/tests/synthetic/plan-determinism.test.mjs
Kjell Tore Guttormsen 0c0a87e709 test(ultraplan-local): add plan-determinism + review-determinism synthetic fixtures (SC7 floor)
Adds 6 files in tests/synthetic/ exercising the determinism pipeline at the
SC7 brief floor (Jaccard >= 0.833). Plan fixture pair: 40 step titles each
with 38 shared (Jaccard 0.905). Review fixture pair: 30 finding-IDs each
with 28 shared (Jaccard 0.875). Reuses lib/parsers/jaccard.mjs +
lib/parsers/finding-id.mjs.

The new pair coexists with tests/lib/review-determinism.test.mjs which
holds the older SC4 (0.70) floor against tests/fixtures/ultrareview/.
The lower floor protects pipeline regressions; the higher floor anchors
the speedup brief's determinism aspiration.

[skip-docs]
2026-05-04 08:46:39 +02:00

63 lines
2.6 KiB
JavaScript

// tests/synthetic/plan-determinism.test.mjs
// SC7 plan-determinism floor — Jaccard pipeline test.
//
// Reads two synthetic plan-run fixtures and asserts that
// jaccardSimilarity(stepsTokens(planA), stepsTokens(planB)) >= 0.833.
//
// This exercises the determinism pipeline (parser + jaccard) on a known
// input pair. It does NOT measure real-LLM determinism — that is deferred
// to a future run of the pipeline against examples/01-add-verbose-flag/.
import { test } from 'node:test';
import { strict as assert } from 'node:assert';
import { readFileSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
import { parseDocument } from '../../lib/util/frontmatter.mjs';
// Directory that contains this test file, derived from the module URL.
const HERE = dirname(fileURLToPath(import.meta.url));
// Two directory levels above HERE; fixture paths below are joined onto it.
const ROOT = join(HERE, '../..');
// Minimum Jaccard similarity the synthetic plan pair must reach (SC7 floor).
const SC7_THRESHOLD = 0.833;
/**
 * Read a fixture file and return the `steps` array from its frontmatter.
 *
 * Throws an assertion error when the parsed document is not flagged valid
 * or when `frontmatter.steps` is not an array.
 *
 * @param {string} rel - Fixture path, joined onto ROOT.
 * @returns {Array} The `frontmatter.steps` value (presumably step-title
 *   strings — TODO confirm against the fixtures).
 */
function loadSteps(rel) {
  const fixturePath = join(ROOT, rel);
  const doc = parseDocument(readFileSync(fixturePath, 'utf-8'));

  // Collect parser error messages up front so they can be embedded in the
  // assertion text below.
  const parseErrors = (doc.errors || []).map((err) => err.message).join(', ');
  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${parseErrors}`);

  // Only touch doc.parsed after the validity check has passed.
  const { frontmatter } = doc.parsed;
  const steps = frontmatter?.steps;
  assert.ok(Array.isArray(steps), `frontmatter.steps of ${rel} is not an array`);
  return steps;
}
// SC7 floor check: the step lists of the two synthetic plan runs must have a
// Jaccard similarity of at least SC7_THRESHOLD.
test('plan determinism — Jaccard of synthetic plan-run-A vs plan-run-B meets SC7 threshold (0.833)', () => {
  const stepsOfRunA = loadSteps('tests/synthetic/plan-run-A.md');
  const stepsOfRunB = loadSteps('tests/synthetic/plan-run-B.md');

  const similarity = jaccardSimilarity(stepsOfRunA, stepsOfRunB);
  const meetsFloor = similarity >= SC7_THRESHOLD;

  // The message doubles as a hint for whoever edits the fixtures next.
  const failureMessage =
    `jaccardSimilarity(stepsTokens(planA), stepsTokens(planB)) = ${similarity} < ${SC7_THRESHOLD} (SC7 floor). ` +
    `Fixtures may have drifted — re-tune step titles to restore the overlap.`;

  assert.ok(meetsFloor, failureMessage);
});
// Size guard: each synthetic fixture must carry at least 30 distinct steps.
// Distinctness is decided by Set membership, so this assumes the steps are
// primitive values such as title strings — TODO confirm against the fixtures.
test('plan determinism — both fixtures contain at least 30 unique step titles', () => {
  const fixtures = ['tests/synthetic/plan-run-A.md', 'tests/synthetic/plan-run-B.md'];

  fixtures.forEach((fixture) => {
    // Count once and reuse the number for both the check and the message.
    const uniqueCount = new Set(loadSteps(fixture)).size;
    assert.ok(
      uniqueCount >= 30,
      `${fixture}: < 30 unique step titles (got ${uniqueCount}). Synthetic fixtures must reflect a substantial plan.`,
    );
  });
});
// Duplicate guard: within a single fixture, the number of distinct steps must
// equal the number of entries. Distinctness is decided by Set membership, so
// this assumes the steps are primitive values — TODO confirm.
test('plan determinism — no duplicate step titles within run', () => {
  const fixtures = ['tests/synthetic/plan-run-A.md', 'tests/synthetic/plan-run-B.md'];

  fixtures.forEach((fixture) => {
    const titles = loadSteps(fixture);
    const uniqueCount = new Set(titles).size;
    assert.strictEqual(
      uniqueCount,
      titles.length,
      `${fixture}: contains duplicate step titles (${titles.length} entries vs ${uniqueCount} unique)`,
    );
  });
});