test(ultraplan-local): add plan-determinism + review-determinism synthetic fixtures (SC7 floor)

Adds 6 files in tests/synthetic/ exercising the determinism pipeline at the SC7 brief floor (Jaccard >= 0.833). Plan fixture pair: 40 step titles each with 38 shared (Jaccard 0.905). Review fixture pair: 30 finding-IDs each with 28 shared (Jaccard 0.875). Reuses lib/parsers/jaccard.mjs + lib/parsers/finding-id.mjs. The new pair coexists with tests/lib/review-determinism.test.mjs which holds the older SC4 (0.70) floor against tests/fixtures/ultrareview/. The lower floor protects pipeline regressions; the higher floor anchors the speedup brief's determinism aspiration. [skip-docs]
2026-05-04 08:46:39 +02:00 · 2026-05-04 08:46:39 +02:00 · 0c0a87e709
commit 0c0a87e709
parent b1738b419c
6 changed files with 425 additions and 0 deletions
--- a/plugins/ultraplan-local/tests/synthetic/plan-determinism.test.mjs
+++ b/plugins/ultraplan-local/tests/synthetic/plan-determinism.test.mjs
@ -0,0 +1,63 @@
+// tests/synthetic/plan-determinism.test.mjs
+// SC7 plan-determinism floor — Jaccard pipeline test.
+//
+// Reads the frontmatter `steps` arrays of two synthetic plan-run fixtures and
+// asserts that jaccardSimilarity(stepsA, stepsB) >= 0.833.
+//
+// This exercises the determinism pipeline (parser + jaccard) on a known
+// input pair. It does NOT measure real-LLM determinism — that is deferred
+// to a future run of the pipeline against examples/01-add-verbose-flag/.
+
+import { test } from 'node:test';
+import { strict as assert } from 'node:assert';
+import { readFileSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
+import { parseDocument } from '../../lib/util/frontmatter.mjs';
+
+const HERE = dirname(fileURLToPath(import.meta.url)); // directory that contains this test file
+const ROOT = join(HERE, '..', '..'); // two levels above HERE; fixture paths used below are resolved against it
+
+const SC7_THRESHOLD = 0.833; // SC7 floor: minimum Jaccard similarity the fixture pair must reach
+
+function loadSteps(rel) { // Returns the frontmatter `steps` array of the fixture at `rel` (a path relative to ROOT); assertion failures abort the calling test.
+  const text = readFileSync(join(ROOT, rel), 'utf-8');
+  const doc = parseDocument(text);
+  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${(doc.errors || []).map(e => e.message).join(', ')}`); // surfaces the parser's own error messages; a missing `errors` list yields an empty detail string
+  const steps = doc.parsed.frontmatter && doc.parsed.frontmatter.steps; // a falsy frontmatter short-circuits here and is rejected by the array check below
+  assert.ok(Array.isArray(steps), `frontmatter.steps of ${rel} is not an array`); // callers build Sets from it and read .length
+  return steps; // handed back exactly as parsed; no trimming or de-duplication happens here
+}
+
+test('plan determinism — Jaccard of synthetic plan-run-A vs plan-run-B meets SC7 threshold (0.833)', () => { // Core check: similarity of the two fixtures' step lists must reach the SC7 floor.
+  const a = loadSteps('tests/synthetic/plan-run-A.md');
+  const b = loadSteps('tests/synthetic/plan-run-B.md');
+  const sim = jaccardSimilarity(a, b); // the parsed `steps` arrays are passed to the helper unchanged
+  assert.ok(
+    sim >= SC7_THRESHOLD, // inclusive comparison: a score exactly at the floor passes
+    `jaccardSimilarity(stepsTokens(planA), stepsTokens(planB)) = ${sim} < ${SC7_THRESHOLD} (SC7 floor). ` +
+    `Fixtures may have drifted — re-tune step titles to restore the overlap.`,
+  );
+});
+
+test('plan determinism — both fixtures contain at least 30 unique step titles', () => { // Size guard, applied to each fixture independently.
+  for (const rel of ['tests/synthetic/plan-run-A.md', 'tests/synthetic/plan-run-B.md']) {
+    const steps = loadSteps(rel);
+    assert.ok(
+      new Set(steps).size >= 30, // the Set collapses repeated titles, so only distinct ones are counted
+      `${rel}: < 30 unique step titles (got ${new Set(steps).size}). Synthetic fixtures must reflect a substantial plan.`,
+    );
+  }
+});
+
+test('plan determinism — no duplicate step titles within run', () => { // Uniqueness guard, applied to each fixture independently.
+  for (const rel of ['tests/synthetic/plan-run-A.md', 'tests/synthetic/plan-run-B.md']) {
+    const steps = loadSteps(rel);
+    assert.strictEqual(
+      new Set(steps).size, // number of distinct titles
+      steps.length, // number of listed titles; any difference means a title is repeated
+      `${rel}: contains duplicate step titles (${steps.length} entries vs ${new Set(steps).size} unique)`,
+    );
+  }
+});
--- a/plugins/ultraplan-local/tests/synthetic/plan-run-A.md
+++ b/plugins/ultraplan-local/tests/synthetic/plan-run-A.md
@ -0,0 +1,74 @@
+---
+type: ultraplan-synthetic
+plan_version: "1.7"
+created: 2026-05-04
+task: "Add --verbose flag to CLI"
+slug: verbose-flag
+run_id: A
+steps:
+  - "Add config entry for verbose flag in package.json"
+  - "Define types for verbose mode in types.ts"
+  - "Update parseArgs to recognize --verbose flag"
+  - "Pass verbose context through main entry point"
+  - "Add log level enum (silent, normal, verbose)"
+  - "Wire log level into logger module"
+  - "Replace console.log with logger.info in handler.ts"
+  - "Add tests for parseArgs --verbose recognition"
+  - "Add tests for log level enum mapping"
+  - "Update README with --verbose flag documentation"
+  - "Add CHANGELOG entry for verbose flag"
+  - "Bump package.json minor version"
+  - "Add lint rule blocking direct console usage"
+  - "Run lint and fix new violations"
+  - "Add CLI integration test for --verbose end-to-end"
+  - "Add fixture file for verbose log capture"
+  - "Document verbose output format in docs/cli.md"
+  - "Add jsdoc for new logger API"
+  - "Verify all existing tests pass with verbose disabled"
+  - "Add backward-compat test for legacy quiet behavior"
+  - "Add edge-case test for repeated --verbose flags"
+  - "Add edge-case test for --verbose with --silent collision"
+  - "Update help text to list --verbose flag"
+  - "Add usage example to docs/quickstart.md"
+  - "Verify CI matrix runs on Node 18 and 20"
+  - "Add npm script for verbose mode debugging"
+  - "Run security audit on logger dependency tree"
+  - "Verify no PII leaks in verbose log output"
+  - "Add manual test checklist to CONTRIBUTING.md"
+  - "Update .gitignore for verbose log dump files"
+  - "Add cleanup logic for stale verbose logs"
+  - "Add unit test for cleanup logic"
+  - "Verify exit code on verbose mode error"
+  - "Add stderr routing for warnings in verbose"
+  - "Add timestamp prefix in verbose log lines"
+  - "Add test for timestamp format"
+  - "Update troubleshooting guide with verbose flag"
+  - "Verify version sync across all docs"
+  - "Add benchmark for verbose log emission cost"
+  - "Document benchmark methodology in PERF.md"
+---
+
+# Synthetic plan run A — Add --verbose flag to CLI
+
+This fixture represents one synthesized run of `/ultraplan-local` against a
+hand-calibrated brief. It is paired with `plan-run-B.md` for the
+`plan-determinism.test.mjs` Jaccard floor (≥ 0.833).
+
+## How this fixture is used
+
+`tests/synthetic/plan-determinism.test.mjs` reads the `steps` array from this
+file's frontmatter and computes `jaccardSimilarity(stepsA, stepsB)`. The test
+asserts the similarity is at or above the SC7 brief threshold (0.833).
+
+This is a SYNTHETIC fixture — it is NOT the output of a real LLM run. The
+purpose is to exercise the determinism pipeline (parser + jaccard) on a known
+input pair so regressions in the pipeline are caught even when LLM
+determinism cannot be cheaply re-measured.
+
+## Fixture math
+
+- A has 40 unique step titles
+- B has 40 unique step titles
+- Intersection (shared titles): 38
+- Union: 42
+- Jaccard: 38/42 ≈ 0.9048 (well above 0.833 floor)
--- a/plugins/ultraplan-local/tests/synthetic/plan-run-B.md
+++ b/plugins/ultraplan-local/tests/synthetic/plan-run-B.md
@ -0,0 +1,77 @@
+---
+type: ultraplan-synthetic
+plan_version: "1.7"
+created: 2026-05-04
+task: "Add --verbose flag to CLI"
+slug: verbose-flag
+run_id: B
+steps:
+  - "Add config entry for verbose flag in package.json"
+  - "Define types for verbose mode in types.ts"
+  - "Update parseArgs to recognize --verbose flag"
+  - "Pass verbose context through main entry point"
+  - "Add log level enum (silent, normal, verbose)"
+  - "Wire log level into logger module"
+  - "Replace console.log with logger.info in handler.ts"
+  - "Add tests for parseArgs --verbose recognition"
+  - "Add tests for log level enum mapping"
+  - "Update README with --verbose flag documentation"
+  - "Add CHANGELOG entry for verbose flag"
+  - "Bump package.json minor version"
+  - "Add lint rule blocking direct console usage"
+  - "Run lint and fix new violations"
+  - "Add CLI integration test for --verbose end-to-end"
+  - "Add fixture file for verbose log capture"
+  - "Document verbose output format in docs/cli.md"
+  - "Add jsdoc for new logger API"
+  - "Verify all existing tests pass with verbose disabled"
+  - "Add backward-compat test for legacy quiet behavior"
+  - "Add edge-case test for repeated --verbose flags"
+  - "Add edge-case test for --verbose with --silent collision"
+  - "Update help text to list --verbose flag"
+  - "Add usage example to docs/quickstart.md"
+  - "Verify CI matrix runs on Node 18 and 20"
+  - "Add npm script for verbose mode debugging"
+  - "Run security audit on logger dependency tree"
+  - "Verify no PII leaks in verbose log output"
+  - "Add manual test checklist to CONTRIBUTING.md"
+  - "Update .gitignore for verbose log dump files"
+  - "Add cleanup logic for stale verbose logs"
+  - "Add unit test for cleanup logic"
+  - "Verify exit code on verbose mode error"
+  - "Add stderr routing for warnings in verbose"
+  - "Add timestamp prefix in verbose log lines"
+  - "Add test for timestamp format"
+  - "Update troubleshooting guide with verbose flag"
+  - "Verify version sync across all docs"
+  - "Add benchmark for verbose log capture overhead"
+  - "Document overhead methodology in PERF.md"
+---
+
+# Synthetic plan run B — Add --verbose flag to CLI
+
+This fixture represents a second synthesized run of `/ultraplan-local` against
+the same hand-calibrated brief used for `plan-run-A.md`. The two runs differ
+on 2 step titles (modeling realistic LLM variation).
+
+## How this fixture is used
+
+See `plan-run-A.md` for the determinism contract.
+
+## Fixture math
+
+- A has 40 unique step titles
+- B has 40 unique step titles
+- Intersection (shared titles): 38
+- Union: 42
+- Jaccard: 38/42 ≈ 0.9048 (well above 0.833 floor)
+
+## Differences from run A
+
+- A includes "Add benchmark for verbose log emission cost" → B replaces with
+  "Add benchmark for verbose log capture overhead"
+- A includes "Document benchmark methodology in PERF.md" → B replaces with
+  "Document overhead methodology in PERF.md"
+
+These represent the kind of paraphrase variation a stochastic planner may
+produce on consecutive runs against an identical brief.
--- a/plugins/ultraplan-local/tests/synthetic/review-determinism.test.mjs
+++ b/plugins/ultraplan-local/tests/synthetic/review-determinism.test.mjs
@ -0,0 +1,79 @@
+// tests/synthetic/review-determinism.test.mjs
+// SC7 review-determinism floor — Jaccard pipeline test.
+//
+// Reads the frontmatter `findings` arrays of two synthetic review-run fixtures
+// and asserts that jaccardSimilarity(findingsA, findingsB) >= 0.833.
+//
+// This is the SC7 (higher) floor. The companion
+// tests/lib/review-determinism.test.mjs holds the SC4 (0.70) floor against
+// tests/fixtures/ultrareview/. Both pairs coexist on purpose: the lower
+// floor protects against pipeline regressions, the higher one anchors the
+// determinism aspiration set in the speedup brief.
+
+import { test } from 'node:test';
+import { strict as assert } from 'node:assert';
+import { readFileSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
+import { parseFindingId } from '../../lib/parsers/finding-id.mjs';
+import { parseDocument } from '../../lib/util/frontmatter.mjs';
+
+const HERE = dirname(fileURLToPath(import.meta.url)); // directory that contains this test file
+const ROOT = join(HERE, '..', '..'); // two levels above HERE; fixture paths used below are resolved against it
+
+const SC7_THRESHOLD = 0.833; // SC7 floor: minimum Jaccard similarity the fixture pair must reach
+
+function loadFindings(rel) { // Returns the frontmatter `findings` array of the fixture at `rel` (a path relative to ROOT); assertion failures abort the calling test.
+  const text = readFileSync(join(ROOT, rel), 'utf-8');
+  const doc = parseDocument(text);
+  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${(doc.errors || []).map(e => e.message).join(', ')}`); // surfaces the parser's own error messages; a missing `errors` list yields an empty detail string
+  const findings = doc.parsed.frontmatter && doc.parsed.frontmatter.findings; // a falsy frontmatter short-circuits here and is rejected by the array check below
+  assert.ok(Array.isArray(findings), `frontmatter.findings of ${rel} is not an array`); // callers iterate it, build Sets from it and read .length
+  return findings; // handed back exactly as parsed; IDs are validated by a separate test, not here
+}
+
+test('review determinism — Jaccard of synthetic review-run-A vs review-run-B meets SC7 threshold (0.833)', () => { // Core check: similarity of the two fixtures' finding-ID lists must reach the SC7 floor.
+  const a = loadFindings('tests/synthetic/review-run-A.md');
+  const b = loadFindings('tests/synthetic/review-run-B.md');
+  const sim = jaccardSimilarity(a, b); // the parsed `findings` arrays are passed to the helper unchanged
+  assert.ok(
+    sim >= SC7_THRESHOLD, // inclusive comparison: a score exactly at the floor passes
+    `jaccardSimilarity(findingTokens(reviewA), findingTokens(reviewB)) = ${sim} < ${SC7_THRESHOLD} (SC7 floor). ` +
+    `Fixtures may have drifted — recompute IDs via lib/parsers/finding-id.mjs.`,
+  );
+});
+
+test('review determinism — finding IDs are 40-char hex (parseFindingId valid)', () => { // Format guard: every ID in both fixtures must be accepted by parseFindingId.
+  for (const rel of ['tests/synthetic/review-run-A.md', 'tests/synthetic/review-run-B.md']) {
+    const findings = loadFindings(rel);
+    for (const id of findings) {
+      const parsed = parseFindingId(id);
+      assert.ok(
+        parsed.valid, // only the `valid` flag of the parse result is inspected
+        `${rel}: ID ${JSON.stringify(id)} is not a 40-char lowercase hex string (parseFindingId rejected it)`, // JSON.stringify keeps empty or non-string values visible in the failure text
+      );
+    }
+  }
+});
+
+test('review determinism — both fixtures contain at least 25 unique finding-IDs', () => { // Size guard, applied to each fixture independently.
+  for (const rel of ['tests/synthetic/review-run-A.md', 'tests/synthetic/review-run-B.md']) {
+    const findings = loadFindings(rel);
+    assert.ok(
+      new Set(findings).size >= 25, // the Set collapses repeated IDs, so only distinct ones are counted
+      `${rel}: < 25 unique finding-IDs (got ${new Set(findings).size}). Synthetic fixtures must reflect a substantial review.`,
+    );
+  }
+});
+
+test('review determinism — no duplicate IDs within run', () => { // Uniqueness guard, applied to each fixture independently.
+  for (const rel of ['tests/synthetic/review-run-A.md', 'tests/synthetic/review-run-B.md']) {
+    const findings = loadFindings(rel);
+    assert.strictEqual(
+      new Set(findings).size, // number of distinct IDs
+      findings.length, // number of listed IDs; any difference means an ID is repeated
+      `${rel}: contains duplicate finding-IDs (${findings.length} entries vs ${new Set(findings).size} unique)`,
+    );
+  }
+});
--- a/plugins/ultraplan-local/tests/synthetic/review-run-A.md
+++ b/plugins/ultraplan-local/tests/synthetic/review-run-A.md
@ -0,0 +1,69 @@
+---
+type: ultrareview-synthetic
+review_version: "1.0"
+created: 2026-05-04
+task: "Add JWT authentication with refresh-token rotation"
+slug: jwt-auth-synthetic
+run_id: A
+verdict: WARN
+findings:
+  - 44b18cf6b84fcb23ef1d52682504c2edeed24f66
+  - f7e307a427154c2c15df4c63eaff6fd846e075a7
+  - 31fa81fa5bf9b84c70864ee09aa8d087870c473a
+  - bfc0e3a7c1a5b13dbdc6ed8325140100b02db45d
+  - be76c6dba12bfd9073b1737de5813e316a158dc6
+  - f0928545e7c1dc48796fe857138fab7f100ce8c7
+  - 4189ba4236119184017fd26735bfb582706994e9
+  - 46f07246ff17c013740c0726b7be9a65fff10c67
+  - 5501c54bda4a39df17d66938f4a7fe872e365a0f
+  - 0173116735f75aabab36ecec863cb429d2f30528
+  - 8f7fc683dc78d3adea8d35221915839702869af0
+  - ee986665d695ca46c9a7f0d5c38bab73e73450a9
+  - d863b17426ddec54bf7624405f3b64e206a73ed7
+  - 64ea0bbf43c44dbf0da53f25755e0112ce2eb08b
+  - 6971113644b777a8c164dfd8473739b03d1796be
+  - 65f6edb11fed982b921ff018bd0fb1dcd10a1703
+  - 9133851cf557f5955301803479936733b296f125
+  - ffb170a0d19e4afac6379e64d26485883267bea8
+  - 89f990535da373f5e97a091e5bbbf47a777c13d6
+  - 664d4ec53e90ef6d24525a85b8d4071bfb037da8
+  - 137db625a1ee639698c9e095e25845ef25879599
+  - 6e586f167fac4cd57dc8178ceb4ca265a37404dc
+  - 24671775282593381af4a8fa77eb3f7a36f9f84e
+  - 71dbed32baf440d94f0ccaa6a997a6922cee7679
+  - 5de9b2b26d03590845183d42387fcb22007b3f5d
+  - c9aca8c3a265e2f083d75ac6da3e6d67909091b9
+  - 75f32c9d304b742af2a7bafc354ec3666e53c054
+  - 6547dfd19035bc012a50c19f4321fcfc9535fec8
+  - 7554bc48226406e85282c7daeaba75cc732f4b35
+  - 4f48547385c2d343ee0994d825321e6e6b90c89d
+---
+
+# Synthetic review run A — JWT authentication with refresh-token rotation
+
+This fixture represents one synthesized run of `/ultrareview-local` on a
+hand-calibrated brief. It is paired with `review-run-B.md` for the
+`review-determinism.test.mjs` Jaccard floor (≥ 0.833).
+
+## How this fixture is used
+
+`tests/synthetic/review-determinism.test.mjs` reads the `findings` array from
+this file's frontmatter and computes
+`jaccardSimilarity(findingsA, findingsB)`. The test asserts the similarity is
+at or above the SC7 brief threshold (0.833).
+
+This fixture is distinct from `tests/fixtures/ultrareview/review-run-A.md`,
+which feeds the existing `tests/lib/review-determinism.test.mjs` against the
+v1.0 SC4 floor (0.70). The synthetic pair pushes the floor higher per SC7.
+
+## Fixture math
+
+- A has 30 unique finding-IDs
+- B has 30 unique finding-IDs
+- Intersection (shared IDs): 28
+- Union: 32
+- Jaccard: 28/32 = 0.875 (above 0.833 floor)
+
+Each ID is the SHA-1 of a synthetic `file:line:rule_key` triple per
+`lib/parsers/finding-id.mjs`. The shared 28 represent stable findings; the
+2 IDs unique to each side represent off-by-one drift in the `line` anchor.
--- a/plugins/ultraplan-local/tests/synthetic/review-run-B.md
+++ b/plugins/ultraplan-local/tests/synthetic/review-run-B.md
@ -0,0 +1,63 @@
+---
+type: ultrareview-synthetic
+review_version: "1.0"
+created: 2026-05-04
+task: "Add JWT authentication with refresh-token rotation"
+slug: jwt-auth-synthetic
+run_id: B
+verdict: WARN
+findings:
+  - 44b18cf6b84fcb23ef1d52682504c2edeed24f66
+  - f7e307a427154c2c15df4c63eaff6fd846e075a7
+  - 31fa81fa5bf9b84c70864ee09aa8d087870c473a
+  - bfc0e3a7c1a5b13dbdc6ed8325140100b02db45d
+  - be76c6dba12bfd9073b1737de5813e316a158dc6
+  - f0928545e7c1dc48796fe857138fab7f100ce8c7
+  - 4189ba4236119184017fd26735bfb582706994e9
+  - 46f07246ff17c013740c0726b7be9a65fff10c67
+  - 5501c54bda4a39df17d66938f4a7fe872e365a0f
+  - 0173116735f75aabab36ecec863cb429d2f30528
+  - 8f7fc683dc78d3adea8d35221915839702869af0
+  - ee986665d695ca46c9a7f0d5c38bab73e73450a9
+  - d863b17426ddec54bf7624405f3b64e206a73ed7
+  - 64ea0bbf43c44dbf0da53f25755e0112ce2eb08b
+  - 6971113644b777a8c164dfd8473739b03d1796be
+  - 65f6edb11fed982b921ff018bd0fb1dcd10a1703
+  - 9133851cf557f5955301803479936733b296f125
+  - ffb170a0d19e4afac6379e64d26485883267bea8
+  - 89f990535da373f5e97a091e5bbbf47a777c13d6
+  - 664d4ec53e90ef6d24525a85b8d4071bfb037da8
+  - 137db625a1ee639698c9e095e25845ef25879599
+  - 6e586f167fac4cd57dc8178ceb4ca265a37404dc
+  - 24671775282593381af4a8fa77eb3f7a36f9f84e
+  - 71dbed32baf440d94f0ccaa6a997a6922cee7679
+  - 5de9b2b26d03590845183d42387fcb22007b3f5d
+  - c9aca8c3a265e2f083d75ac6da3e6d67909091b9
+  - 75f32c9d304b742af2a7bafc354ec3666e53c054
+  - 6547dfd19035bc012a50c19f4321fcfc9535fec8
+  - a5fbe85476128bb67796ecf97a42065b6a0bf9c4
+  - 19ec9d34e1d6560b56f885a5a12ce491354c4b40
+---
+
+# Synthetic review run B — JWT authentication with refresh-token rotation
+
+Companion to `review-run-A.md`. See run A's body for the determinism
+contract.
+
+## Fixture math
+
+- A has 30 unique finding-IDs
+- B has 30 unique finding-IDs
+- Intersection (shared IDs): 28
+- Union: 32
+- Jaccard: 28/32 = 0.875 (above 0.833 floor)
+
+## Differences from run A
+
+- A's last 2 IDs come from `src/auth/jwt.ts:201:rule-1` and
+  `src/auth/refresh.ts:55:rule-3`
+- B's last 2 IDs come from `src/auth/jwt.ts:202:rule-1` and
+  `src/auth/refresh.ts:56:rule-3`
+
+The off-by-one line anchoring models realistic post-edit drift between two
+review runs against subtly different working trees.