test(ultraplan-local): add plan-determinism + review-determinism synthetic fixtures (SC7 floor)

Adds 6 files in tests/synthetic/ exercising the determinism pipeline at the
SC7 brief floor (Jaccard >= 0.833). Plan fixture pair: 40 step titles each
with 38 shared (Jaccard 0.905). Review fixture pair: 30 finding-IDs each
with 28 shared (Jaccard 0.875). Reuses lib/parsers/jaccard.mjs +
lib/parsers/finding-id.mjs.

The new pair coexists with tests/lib/review-determinism.test.mjs which
holds the older SC4 (0.70) floor against tests/fixtures/ultrareview/.
The lower floor protects pipeline regressions; the higher floor anchors
the speedup brief's determinism aspiration.

[skip-docs]
This commit is contained in:
Kjell Tore Guttormsen 2026-05-04 08:46:39 +02:00
commit 0c0a87e709
6 changed files with 425 additions and 0 deletions

View file

@ -0,0 +1,63 @@
// tests/synthetic/plan-determinism.test.mjs
// SC7 plan-determinism floor — Jaccard pipeline test.
//
// Reads two synthetic plan-run fixtures and asserts that
// jaccardSimilarity(stepsTokens(planA), stepsTokens(planB)) >= 0.833.
//
// This exercises the determinism pipeline (parser + jaccard) on a known
// input pair. It does NOT measure real-LLM determinism — that is deferred
// to a future run of the pipeline against examples/01-add-verbose-flag/.
import { test } from 'node:test';
import { strict as assert } from 'node:assert';
import { readFileSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
import { parseDocument } from '../../lib/util/frontmatter.mjs';
// Directory that contains this test file, derived from the module URL.
const HERE = dirname(fileURLToPath(import.meta.url));
// Two directory levels above HERE (presumably the repository root — confirm);
// fixture paths given to loadSteps are joined onto this.
const ROOT = join(HERE, '..', '..');
// Minimum Jaccard similarity the fixture pair must reach (the SC7 floor).
const SC7_THRESHOLD = 0.833;
/**
 * Read a fixture file and return the `steps` array from its frontmatter.
 *
 * @param {string} rel - Fixture path, relative to ROOT.
 * @returns {Array} The value stored under `frontmatter.steps`.
 * @throws {AssertionError} When the document is reported as invalid, or when
 *   `frontmatter.steps` is missing or not an array.
 */
function loadSteps(rel) {
  const source = readFileSync(join(ROOT, rel), 'utf-8');
  const doc = parseDocument(source);

  // Collapse any parser errors into one comma-separated summary for the message.
  const problems = (doc.errors || []).map(e => e.message).join(', ');
  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${problems}`);

  const steps = doc.parsed.frontmatter?.steps;
  assert.ok(Array.isArray(steps), `frontmatter.steps of ${rel} is not an array`);
  return steps;
}
// Core SC7 check: the step lists of run A and run B must be at least
// SC7_THRESHOLD similar according to jaccardSimilarity.
test('plan determinism — Jaccard of synthetic plan-run-A vs plan-run-B meets SC7 threshold (0.833)', () => {
  const stepsA = loadSteps('tests/synthetic/plan-run-A.md');
  const stepsB = loadSteps('tests/synthetic/plan-run-B.md');
  const similarity = jaccardSimilarity(stepsA, stepsB);

  const failureMessage = [
    `jaccardSimilarity(stepsTokens(planA), stepsTokens(planB)) = ${similarity} < ${SC7_THRESHOLD} (SC7 floor). `,
    `Fixtures may have drifted — re-tune step titles to restore the overlap.`,
  ].join('');

  assert.ok(similarity >= SC7_THRESHOLD, failureMessage);
});
// Size guard: each plan fixture must carry at least 30 distinct step titles.
test('plan determinism — both fixtures contain at least 30 unique step titles', () => {
  const fixtures = ['tests/synthetic/plan-run-A.md', 'tests/synthetic/plan-run-B.md'];

  fixtures.forEach((rel) => {
    const uniqueCount = new Set(loadSteps(rel)).size;
    assert.ok(
      uniqueCount >= 30,
      `${rel}: < 30 unique step titles (got ${uniqueCount}). Synthetic fixtures must reflect a substantial plan.`,
    );
  });
});
// Uniqueness guard: a fixture's step list must not repeat a title, i.e. the
// number of entries must equal the number of distinct entries.
test('plan determinism — no duplicate step titles within run', () => {
  const fixtures = ['tests/synthetic/plan-run-A.md', 'tests/synthetic/plan-run-B.md'];

  fixtures.forEach((rel) => {
    const steps = loadSteps(rel);
    const uniqueCount = new Set(steps).size;
    assert.strictEqual(
      uniqueCount,
      steps.length,
      `${rel}: contains duplicate step titles (${steps.length} entries vs ${uniqueCount} unique)`,
    );
  });
});

View file

@ -0,0 +1,74 @@
---
type: ultraplan-synthetic
plan_version: "1.7"
created: 2026-05-04
task: "Add --verbose flag to CLI"
slug: verbose-flag
run_id: A
steps:
- "Add config entry for verbose flag in package.json"
- "Define types for verbose mode in types.ts"
- "Update parseArgs to recognize --verbose flag"
- "Pass verbose context through main entry point"
- "Add log level enum (silent, normal, verbose)"
- "Wire log level into logger module"
- "Replace console.log with logger.info in handler.ts"
- "Add tests for parseArgs --verbose recognition"
- "Add tests for log level enum mapping"
- "Update README with --verbose flag documentation"
- "Add CHANGELOG entry for verbose flag"
- "Bump package.json minor version"
- "Add lint rule blocking direct console usage"
- "Run lint and fix new violations"
- "Add CLI integration test for --verbose end-to-end"
- "Add fixture file for verbose log capture"
- "Document verbose output format in docs/cli.md"
- "Add jsdoc for new logger API"
- "Verify all existing tests pass with verbose disabled"
- "Add backward-compat test for legacy quiet behavior"
- "Add edge-case test for repeated --verbose flags"
- "Add edge-case test for --verbose with --silent collision"
- "Update help text to list --verbose flag"
- "Add usage example to docs/quickstart.md"
- "Verify CI matrix runs on Node 18 and 20"
- "Add npm script for verbose mode debugging"
- "Run security audit on logger dependency tree"
- "Verify no PII leaks in verbose log output"
- "Add manual test checklist to CONTRIBUTING.md"
- "Update .gitignore for verbose log dump files"
- "Add cleanup logic for stale verbose logs"
- "Add unit test for cleanup logic"
- "Verify exit code on verbose mode error"
- "Add stderr routing for warnings in verbose"
- "Add timestamp prefix in verbose log lines"
- "Add test for timestamp format"
- "Update troubleshooting guide with verbose flag"
- "Verify version sync across all docs"
- "Add benchmark for verbose log emission cost"
- "Document benchmark methodology in PERF.md"
---
# Synthetic plan run A — Add --verbose flag to CLI
This fixture represents one synthesized run of `/ultraplan-local` against a
hand-calibrated brief. It is paired with `plan-run-B.md` for the
`plan-determinism.test.mjs` Jaccard floor (≥ 0.833).
## How this fixture is used
`tests/synthetic/plan-determinism.test.mjs` reads the `steps` array from this
file's frontmatter and computes `jaccardSimilarity(stepsA, stepsB)`. The test
asserts the similarity is at or above the SC7 brief threshold (0.833).
This is a SYNTHETIC fixture — it is NOT the output of a real LLM run. The
purpose is to exercise the determinism pipeline (parser + jaccard) on a known
input pair so regressions in the pipeline are caught even when LLM
determinism cannot be cheaply re-measured.
## Fixture math
- A has 40 unique step titles
- B has 40 unique step titles
- Intersection (shared titles): 38
- Union: 42
- Jaccard: 38/42 ≈ 0.9048 (well above 0.833 floor)

View file

@ -0,0 +1,77 @@
---
type: ultraplan-synthetic
plan_version: "1.7"
created: 2026-05-04
task: "Add --verbose flag to CLI"
slug: verbose-flag
run_id: B
steps:
- "Add config entry for verbose flag in package.json"
- "Define types for verbose mode in types.ts"
- "Update parseArgs to recognize --verbose flag"
- "Pass verbose context through main entry point"
- "Add log level enum (silent, normal, verbose)"
- "Wire log level into logger module"
- "Replace console.log with logger.info in handler.ts"
- "Add tests for parseArgs --verbose recognition"
- "Add tests for log level enum mapping"
- "Update README with --verbose flag documentation"
- "Add CHANGELOG entry for verbose flag"
- "Bump package.json minor version"
- "Add lint rule blocking direct console usage"
- "Run lint and fix new violations"
- "Add CLI integration test for --verbose end-to-end"
- "Add fixture file for verbose log capture"
- "Document verbose output format in docs/cli.md"
- "Add jsdoc for new logger API"
- "Verify all existing tests pass with verbose disabled"
- "Add backward-compat test for legacy quiet behavior"
- "Add edge-case test for repeated --verbose flags"
- "Add edge-case test for --verbose with --silent collision"
- "Update help text to list --verbose flag"
- "Add usage example to docs/quickstart.md"
- "Verify CI matrix runs on Node 18 and 20"
- "Add npm script for verbose mode debugging"
- "Run security audit on logger dependency tree"
- "Verify no PII leaks in verbose log output"
- "Add manual test checklist to CONTRIBUTING.md"
- "Update .gitignore for verbose log dump files"
- "Add cleanup logic for stale verbose logs"
- "Add unit test for cleanup logic"
- "Verify exit code on verbose mode error"
- "Add stderr routing for warnings in verbose"
- "Add timestamp prefix in verbose log lines"
- "Add test for timestamp format"
- "Update troubleshooting guide with verbose flag"
- "Verify version sync across all docs"
- "Add benchmark for verbose log capture overhead"
- "Document overhead methodology in PERF.md"
---
# Synthetic plan run B — Add --verbose flag to CLI
This fixture represents a second synthesized run of `/ultraplan-local` against
the same hand-calibrated brief used for `plan-run-A.md`. The two runs differ
on 2 step titles (modeling realistic LLM variation).
## How this fixture is used
See `plan-run-A.md` for the determinism contract.
## Fixture math
- A has 40 unique step titles
- B has 40 unique step titles
- Intersection (shared titles): 38
- Union: 42
- Jaccard: 38/42 ≈ 0.9048 (well above 0.833 floor)
## Differences from run A
- A includes "Add benchmark for verbose log emission cost" → B replaces with
"Add benchmark for verbose log capture overhead"
- A includes "Document benchmark methodology in PERF.md" → B replaces with
"Document overhead methodology in PERF.md"
These represent the kind of paraphrase variation a stochastic planner may
produce on consecutive runs against an identical brief.

View file

@ -0,0 +1,79 @@
// tests/synthetic/review-determinism.test.mjs
// SC7 review-determinism floor — Jaccard pipeline test.
//
// Reads two synthetic review-run fixtures and asserts that
// jaccardSimilarity(findingTokens(reviewA), findingTokens(reviewB)) >= 0.833.
//
// This is the SC7 (higher) floor. The companion
// tests/lib/review-determinism.test.mjs holds the SC4 (0.70) floor against
// tests/fixtures/ultrareview/. Both pairs coexist on purpose: the lower
// floor protects against pipeline regressions, the higher one anchors the
// determinism aspiration set in the speedup brief.
import { test } from 'node:test';
import { strict as assert } from 'node:assert';
import { readFileSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
import { parseFindingId } from '../../lib/parsers/finding-id.mjs';
import { parseDocument } from '../../lib/util/frontmatter.mjs';
// Directory that contains this test file, derived from the module URL.
const HERE = dirname(fileURLToPath(import.meta.url));
// Two directory levels above HERE (presumably the repository root — confirm);
// fixture paths given to loadFindings are joined onto this.
const ROOT = join(HERE, '..', '..');
// Minimum Jaccard similarity the fixture pair must reach (the SC7 floor).
const SC7_THRESHOLD = 0.833;
/**
 * Read a fixture file and return the `findings` array from its frontmatter.
 *
 * @param {string} rel - Fixture path, relative to ROOT.
 * @returns {Array} The value stored under `frontmatter.findings`.
 * @throws {AssertionError} When the document is reported as invalid, or when
 *   `frontmatter.findings` is missing or not an array.
 */
function loadFindings(rel) {
  const source = readFileSync(join(ROOT, rel), 'utf-8');
  const doc = parseDocument(source);

  // Collapse any parser errors into one comma-separated summary for the message.
  const problems = (doc.errors || []).map(e => e.message).join(', ');
  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${problems}`);

  const findings = doc.parsed.frontmatter?.findings;
  assert.ok(Array.isArray(findings), `frontmatter.findings of ${rel} is not an array`);
  return findings;
}
// Core SC7 check: the finding-ID lists of run A and run B must be at least
// SC7_THRESHOLD similar according to jaccardSimilarity.
test('review determinism — Jaccard of synthetic review-run-A vs review-run-B meets SC7 threshold (0.833)', () => {
  const findingsA = loadFindings('tests/synthetic/review-run-A.md');
  const findingsB = loadFindings('tests/synthetic/review-run-B.md');
  const similarity = jaccardSimilarity(findingsA, findingsB);

  const failureMessage = [
    `jaccardSimilarity(findingTokens(reviewA), findingTokens(reviewB)) = ${similarity} < ${SC7_THRESHOLD} (SC7 floor). `,
    `Fixtures may have drifted — recompute IDs via lib/parsers/finding-id.mjs.`,
  ].join('');

  assert.ok(similarity >= SC7_THRESHOLD, failureMessage);
});
// Format guard: every ID in both fixtures must be accepted by parseFindingId.
test('review determinism — finding IDs are 40-char hex (parseFindingId valid)', () => {
  const fixtures = ['tests/synthetic/review-run-A.md', 'tests/synthetic/review-run-B.md'];

  fixtures.forEach((rel) => {
    loadFindings(rel).forEach((id) => {
      const outcome = parseFindingId(id);
      const complaint =
        `${rel}: ID ${JSON.stringify(id)} is not a 40-char lowercase hex string (parseFindingId rejected it)`;
      assert.ok(outcome.valid, complaint);
    });
  });
});
// Size guard: each review fixture must carry at least 25 distinct finding IDs.
test('review determinism — both fixtures contain at least 25 unique finding-IDs', () => {
  const fixtures = ['tests/synthetic/review-run-A.md', 'tests/synthetic/review-run-B.md'];

  fixtures.forEach((rel) => {
    const uniqueCount = new Set(loadFindings(rel)).size;
    assert.ok(
      uniqueCount >= 25,
      `${rel}: < 25 unique finding-IDs (got ${uniqueCount}). Synthetic fixtures must reflect a substantial review.`,
    );
  });
});
// Uniqueness guard: a fixture's finding list must not repeat an ID, i.e. the
// number of entries must equal the number of distinct entries.
test('review determinism — no duplicate IDs within run', () => {
  const fixtures = ['tests/synthetic/review-run-A.md', 'tests/synthetic/review-run-B.md'];

  fixtures.forEach((rel) => {
    const findings = loadFindings(rel);
    const uniqueCount = new Set(findings).size;
    assert.strictEqual(
      uniqueCount,
      findings.length,
      `${rel}: contains duplicate finding-IDs (${findings.length} entries vs ${uniqueCount} unique)`,
    );
  });
});

View file

@ -0,0 +1,69 @@
---
type: ultrareview-synthetic
review_version: "1.0"
created: 2026-05-04
task: "Add JWT authentication with refresh-token rotation"
slug: jwt-auth-synthetic
run_id: A
verdict: WARN
findings:
- 44b18cf6b84fcb23ef1d52682504c2edeed24f66
- f7e307a427154c2c15df4c63eaff6fd846e075a7
- 31fa81fa5bf9b84c70864ee09aa8d087870c473a
- bfc0e3a7c1a5b13dbdc6ed8325140100b02db45d
- be76c6dba12bfd9073b1737de5813e316a158dc6
- f0928545e7c1dc48796fe857138fab7f100ce8c7
- 4189ba4236119184017fd26735bfb582706994e9
- 46f07246ff17c013740c0726b7be9a65fff10c67
- 5501c54bda4a39df17d66938f4a7fe872e365a0f
- 0173116735f75aabab36ecec863cb429d2f30528
- 8f7fc683dc78d3adea8d35221915839702869af0
- ee986665d695ca46c9a7f0d5c38bab73e73450a9
- d863b17426ddec54bf7624405f3b64e206a73ed7
- 64ea0bbf43c44dbf0da53f25755e0112ce2eb08b
- 6971113644b777a8c164dfd8473739b03d1796be
- 65f6edb11fed982b921ff018bd0fb1dcd10a1703
- 9133851cf557f5955301803479936733b296f125
- ffb170a0d19e4afac6379e64d26485883267bea8
- 89f990535da373f5e97a091e5bbbf47a777c13d6
- 664d4ec53e90ef6d24525a85b8d4071bfb037da8
- 137db625a1ee639698c9e095e25845ef25879599
- 6e586f167fac4cd57dc8178ceb4ca265a37404dc
- 24671775282593381af4a8fa77eb3f7a36f9f84e
- 71dbed32baf440d94f0ccaa6a997a6922cee7679
- 5de9b2b26d03590845183d42387fcb22007b3f5d
- c9aca8c3a265e2f083d75ac6da3e6d67909091b9
- 75f32c9d304b742af2a7bafc354ec3666e53c054
- 6547dfd19035bc012a50c19f4321fcfc9535fec8
- 7554bc48226406e85282c7daeaba75cc732f4b35
- 4f48547385c2d343ee0994d825321e6e6b90c89d
---
# Synthetic review run A — JWT authentication with refresh-token rotation
This fixture represents one synthesized run of `/ultrareview-local` on a
hand-calibrated brief. It is paired with `review-run-B.md` for the
`review-determinism.test.mjs` Jaccard floor (≥ 0.833).
## How this fixture is used
`tests/synthetic/review-determinism.test.mjs` reads the `findings` array from
this file's frontmatter and computes
`jaccardSimilarity(findingsA, findingsB)`. The test asserts the similarity is
at or above the SC7 brief threshold (0.833).
This fixture is distinct from `tests/fixtures/ultrareview/review-run-A.md`,
which feeds the existing `tests/lib/review-determinism.test.mjs` against the
v1.0 SC4 floor (0.70). The synthetic pair pushes the floor higher per SC7.
## Fixture math
- A has 30 unique finding-IDs
- B has 30 unique finding-IDs
- Intersection (shared IDs): 28
- Union: 32
- Jaccard: 28/32 = 0.875 (above 0.833 floor)
Each ID is the SHA-1 of a synthetic `file:line:rule_key` triple per
`lib/parsers/finding-id.mjs`. The shared 28 represent stable findings; the
2 IDs unique to each side represent findings whose `file:line` anchor drifted between runs.

View file

@ -0,0 +1,63 @@
---
type: ultrareview-synthetic
review_version: "1.0"
created: 2026-05-04
task: "Add JWT authentication with refresh-token rotation"
slug: jwt-auth-synthetic
run_id: B
verdict: WARN
findings:
- 44b18cf6b84fcb23ef1d52682504c2edeed24f66
- f7e307a427154c2c15df4c63eaff6fd846e075a7
- 31fa81fa5bf9b84c70864ee09aa8d087870c473a
- bfc0e3a7c1a5b13dbdc6ed8325140100b02db45d
- be76c6dba12bfd9073b1737de5813e316a158dc6
- f0928545e7c1dc48796fe857138fab7f100ce8c7
- 4189ba4236119184017fd26735bfb582706994e9
- 46f07246ff17c013740c0726b7be9a65fff10c67
- 5501c54bda4a39df17d66938f4a7fe872e365a0f
- 0173116735f75aabab36ecec863cb429d2f30528
- 8f7fc683dc78d3adea8d35221915839702869af0
- ee986665d695ca46c9a7f0d5c38bab73e73450a9
- d863b17426ddec54bf7624405f3b64e206a73ed7
- 64ea0bbf43c44dbf0da53f25755e0112ce2eb08b
- 6971113644b777a8c164dfd8473739b03d1796be
- 65f6edb11fed982b921ff018bd0fb1dcd10a1703
- 9133851cf557f5955301803479936733b296f125
- ffb170a0d19e4afac6379e64d26485883267bea8
- 89f990535da373f5e97a091e5bbbf47a777c13d6
- 664d4ec53e90ef6d24525a85b8d4071bfb037da8
- 137db625a1ee639698c9e095e25845ef25879599
- 6e586f167fac4cd57dc8178ceb4ca265a37404dc
- 24671775282593381af4a8fa77eb3f7a36f9f84e
- 71dbed32baf440d94f0ccaa6a997a6922cee7679
- 5de9b2b26d03590845183d42387fcb22007b3f5d
- c9aca8c3a265e2f083d75ac6da3e6d67909091b9
- 75f32c9d304b742af2a7bafc354ec3666e53c054
- 6547dfd19035bc012a50c19f4321fcfc9535fec8
- a5fbe85476128bb67796ecf97a42065b6a0bf9c4
- 19ec9d34e1d6560b56f885a5a12ce491354c4b40
---
# Synthetic review run B — JWT authentication with refresh-token rotation
Companion to `review-run-A.md`. See run A's body for the determinism
contract.
## Fixture math
- A has 30 unique finding-IDs
- B has 30 unique finding-IDs
- Intersection (shared IDs): 28
- Union: 32
- Jaccard: 28/32 = 0.875 (above 0.833 floor)
## Differences from run A
- A's last 2 IDs come from `src/auth/jwt.ts:201:rule-1` and
`src/auth/refresh.ts:55:rule-3`
- B's last 2 IDs come from `src/auth/jwt.ts:202:rule-1` and
`src/auth/refresh.ts:56:rule-3`
The off-by-one line anchoring models realistic post-edit drift between two
review runs against subtly different working trees.