diff --git a/plugins/ultraplan-local/tests/synthetic/plan-determinism.test.mjs b/plugins/ultraplan-local/tests/synthetic/plan-determinism.test.mjs
new file mode 100644
index 0000000..009d579
--- /dev/null
+++ b/plugins/ultraplan-local/tests/synthetic/plan-determinism.test.mjs
@@ -0,0 +1,82 @@
+// tests/synthetic/plan-determinism.test.mjs
+// SC7 plan-determinism floor — Jaccard pipeline test.
+//
+// Reads the `steps` arrays from the frontmatter of two synthetic plan-run
+// fixtures and asserts that jaccardSimilarity(stepsA, stepsB) >= 0.833.
+//
+// This exercises the determinism pipeline (parser + jaccard) on a known
+// input pair. It does NOT measure real-LLM determinism — that is deferred
+// to a future run of the pipeline against examples/01-add-verbose-flag/.
+
+import { test } from 'node:test';
+import { strict as assert } from 'node:assert';
+import { readFileSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
+import { parseDocument } from '../../lib/util/frontmatter.mjs';
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+const ROOT = join(HERE, '..', '..');
+
+// SC7 floor: the measured similarity must be at or above this value.
+const SC7_THRESHOLD = 0.833;
+
+// Minimum number of distinct step titles each fixture must carry.
+const MIN_UNIQUE_STEPS = 30;
+
+// Fixture paths, relative to ROOT. Shared by every test below.
+const PLAN_RUN_A = 'tests/synthetic/plan-run-A.md';
+const PLAN_RUN_B = 'tests/synthetic/plan-run-B.md';
+const PLAN_RUNS = [PLAN_RUN_A, PLAN_RUN_B];
+
+/**
+ * Read a fixture and return the `steps` array from its frontmatter.
+ *
+ * Fails the calling test (via assert) when parseDocument reports the
+ * document as invalid or when `steps` is not an array.
+ *
+ * @param {string} rel - Fixture path relative to ROOT.
+ * @returns {Array} The `steps` entries exactly as parsed.
+ */
+function loadSteps(rel) {
+  const text = readFileSync(join(ROOT, rel), 'utf-8');
+  const doc = parseDocument(text);
+  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${(doc.errors || []).map(e => e.message).join(', ')}`);
+  const steps = doc.parsed.frontmatter?.steps;
+  assert.ok(Array.isArray(steps), `frontmatter.steps of ${rel} is not an array`);
+  return steps;
+}
+
+test('plan determinism — Jaccard of synthetic plan-run-A vs plan-run-B meets SC7 threshold (0.833)', () => {
+  const stepsA = loadSteps(PLAN_RUN_A);
+  const stepsB = loadSteps(PLAN_RUN_B);
+  const sim = jaccardSimilarity(stepsA, stepsB);
+  assert.ok(
+    sim >= SC7_THRESHOLD,
+    `jaccardSimilarity(stepsA, stepsB) = ${sim} < ${SC7_THRESHOLD} (SC7 floor). ` +
+      `Fixtures may have drifted — re-tune step titles to restore the overlap.`,
+  );
+});
+
+test('plan determinism — both fixtures contain at least 30 unique step titles', () => {
+  for (const rel of PLAN_RUNS) {
+    const uniqueCount = new Set(loadSteps(rel)).size;
+    assert.ok(
+      uniqueCount >= MIN_UNIQUE_STEPS,
+      `${rel}: < ${MIN_UNIQUE_STEPS} unique step titles (got ${uniqueCount}). Synthetic fixtures must reflect a substantial plan.`,
+    );
+  }
+});
+
+test('plan determinism — no duplicate step titles within run', () => {
+  for (const rel of PLAN_RUNS) {
+    const steps = loadSteps(rel);
+    const uniqueCount = new Set(steps).size;
+    assert.strictEqual(
+      uniqueCount,
+      steps.length,
+      `${rel}: contains duplicate step titles (${steps.length} entries vs ${uniqueCount} unique)`,
+    );
+  }
+});
diff --git a/plugins/ultraplan-local/tests/synthetic/plan-run-A.md b/plugins/ultraplan-local/tests/synthetic/plan-run-A.md new file mode 100644 index 0000000..09bf4c9 --- /dev/null +++ b/plugins/ultraplan-local/tests/synthetic/plan-run-A.md @@ -0,0 +1,74 @@ +--- +type: ultraplan-synthetic +plan_version: "1.7" +created: 2026-05-04 +task: "Add --verbose flag to CLI" +slug: verbose-flag +run_id: A +steps: + - "Add config entry for verbose flag in package.json" + - "Define types for verbose mode in types.ts" + - "Update parseArgs to recognize --verbose flag" + - "Pass verbose context through main entry point" + - "Add log level enum (silent, normal, verbose)" + - "Wire log level into logger module" + - "Replace console.log with logger.info in handler.ts" + - "Add tests for parseArgs --verbose recognition" + - "Add tests for log level enum mapping" + - "Update README with --verbose flag documentation" + - "Add CHANGELOG entry for verbose flag" + - "Bump package.json minor version" + - "Add lint rule blocking direct console usage" + - "Run lint and fix new violations" + - 
"Add CLI integration test for --verbose end-to-end" + - "Add fixture file for verbose log capture" + - "Document verbose output format in docs/cli.md" + - "Add jsdoc for new logger API" + - "Verify all existing tests pass with verbose disabled" + - "Add backward-compat test for legacy quiet behavior" + - "Add edge-case test for repeated --verbose flags" + - "Add edge-case test for --verbose with --silent collision" + - "Update help text to list --verbose flag" + - "Add usage example to docs/quickstart.md" + - "Verify CI matrix runs on Node 18 and 20" + - "Add npm script for verbose mode debugging" + - "Run security audit on logger dependency tree" + - "Verify no PII leaks in verbose log output" + - "Add manual test checklist to CONTRIBUTING.md" + - "Update .gitignore for verbose log dump files" + - "Add cleanup logic for stale verbose logs" + - "Add unit test for cleanup logic" + - "Verify exit code on verbose mode error" + - "Add stderr routing for warnings in verbose" + - "Add timestamp prefix in verbose log lines" + - "Add test for timestamp format" + - "Update troubleshooting guide with verbose flag" + - "Verify version sync across all docs" + - "Add benchmark for verbose log emission cost" + - "Document benchmark methodology in PERF.md" +--- + +# Synthetic plan run A — Add --verbose flag to CLI + +This fixture represents one synthesized run of `/ultraplan-local` against a +hand-calibrated brief. It is paired with `plan-run-B.md` for the +`plan-determinism.test.mjs` Jaccard floor (≥ 0.833). + +## How this fixture is used + +`tests/synthetic/plan-determinism.test.mjs` reads the `steps` array from this +file's frontmatter and computes `jaccardSimilarity(stepsA, stepsB)`. The test +asserts the similarity is at or above the SC7 brief threshold (0.833). + +This is a SYNTHETIC fixture — it is NOT the output of a real LLM run. 
The +purpose is to exercise the determinism pipeline (parser + jaccard) on a known +input pair so regressions in the pipeline are caught even when LLM +determinism cannot be cheaply re-measured. + +## Fixture math + +- A has 40 unique step titles +- B has 40 unique step titles +- Intersection (shared titles): 38 +- Union: 42 +- Jaccard: 38/42 ≈ 0.9048 (well above 0.833 floor) diff --git a/plugins/ultraplan-local/tests/synthetic/plan-run-B.md b/plugins/ultraplan-local/tests/synthetic/plan-run-B.md new file mode 100644 index 0000000..ba3698d --- /dev/null +++ b/plugins/ultraplan-local/tests/synthetic/plan-run-B.md @@ -0,0 +1,77 @@ +--- +type: ultraplan-synthetic +plan_version: "1.7" +created: 2026-05-04 +task: "Add --verbose flag to CLI" +slug: verbose-flag +run_id: B +steps: + - "Add config entry for verbose flag in package.json" + - "Define types for verbose mode in types.ts" + - "Update parseArgs to recognize --verbose flag" + - "Pass verbose context through main entry point" + - "Add log level enum (silent, normal, verbose)" + - "Wire log level into logger module" + - "Replace console.log with logger.info in handler.ts" + - "Add tests for parseArgs --verbose recognition" + - "Add tests for log level enum mapping" + - "Update README with --verbose flag documentation" + - "Add CHANGELOG entry for verbose flag" + - "Bump package.json minor version" + - "Add lint rule blocking direct console usage" + - "Run lint and fix new violations" + - "Add CLI integration test for --verbose end-to-end" + - "Add fixture file for verbose log capture" + - "Document verbose output format in docs/cli.md" + - "Add jsdoc for new logger API" + - "Verify all existing tests pass with verbose disabled" + - "Add backward-compat test for legacy quiet behavior" + - "Add edge-case test for repeated --verbose flags" + - "Add edge-case test for --verbose with --silent collision" + - "Update help text to list --verbose flag" + - "Add usage example to docs/quickstart.md" + - "Verify CI matrix runs 
on Node 18 and 20" + - "Add npm script for verbose mode debugging" + - "Run security audit on logger dependency tree" + - "Verify no PII leaks in verbose log output" + - "Add manual test checklist to CONTRIBUTING.md" + - "Update .gitignore for verbose log dump files" + - "Add cleanup logic for stale verbose logs" + - "Add unit test for cleanup logic" + - "Verify exit code on verbose mode error" + - "Add stderr routing for warnings in verbose" + - "Add timestamp prefix in verbose log lines" + - "Add test for timestamp format" + - "Update troubleshooting guide with verbose flag" + - "Verify version sync across all docs" + - "Add benchmark for verbose log capture overhead" + - "Document overhead methodology in PERF.md" +--- + +# Synthetic plan run B — Add --verbose flag to CLI + +This fixture represents a second synthesized run of `/ultraplan-local` against +the same hand-calibrated brief used for `plan-run-A.md`. The two runs differ +on 2 step titles (modeling realistic LLM variation). + +## How this fixture is used + +See `plan-run-A.md` for the determinism contract. + +## Fixture math + +- A has 40 unique step titles +- B has 40 unique step titles +- Intersection (shared titles): 38 +- Union: 42 +- Jaccard: 38/42 ≈ 0.9048 (well above 0.833 floor) + +## Differences from run A + +- A includes "Add benchmark for verbose log emission cost" → B replaces with + "Add benchmark for verbose log capture overhead" +- A includes "Document benchmark methodology in PERF.md" → B replaces with + "Document overhead methodology in PERF.md" + +These represent the kind of paraphrase variation a stochastic planner may +produce on consecutive runs against an identical brief. 
diff --git a/plugins/ultraplan-local/tests/synthetic/review-determinism.test.mjs b/plugins/ultraplan-local/tests/synthetic/review-determinism.test.mjs
new file mode 100644
index 0000000..05b9155
--- /dev/null
+++ b/plugins/ultraplan-local/tests/synthetic/review-determinism.test.mjs
@@ -0,0 +1,97 @@
+// tests/synthetic/review-determinism.test.mjs
+// SC7 review-determinism floor — Jaccard pipeline test.
+//
+// Reads the `findings` arrays from the frontmatter of two synthetic
+// review-run fixtures and asserts that
+// jaccardSimilarity(findingsA, findingsB) >= 0.833.
+//
+// This is the SC7 (higher) floor. The companion
+// tests/lib/review-determinism.test.mjs holds the SC4 (0.70) floor against
+// tests/fixtures/ultrareview/. Both pairs coexist on purpose: the lower
+// floor protects against pipeline regressions, the higher one anchors the
+// determinism aspiration set in the speedup brief.
+
+import { test } from 'node:test';
+import { strict as assert } from 'node:assert';
+import { readFileSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { jaccardSimilarity } from '../../lib/parsers/jaccard.mjs';
+import { parseFindingId } from '../../lib/parsers/finding-id.mjs';
+import { parseDocument } from '../../lib/util/frontmatter.mjs';
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+const ROOT = join(HERE, '..', '..');
+
+// SC7 floor: the measured similarity must be at or above this value.
+const SC7_THRESHOLD = 0.833;
+
+// Minimum number of distinct finding-IDs each fixture must carry.
+const MIN_UNIQUE_FINDINGS = 25;
+
+// Fixture paths, relative to ROOT. Shared by every test below.
+const REVIEW_RUN_A = 'tests/synthetic/review-run-A.md';
+const REVIEW_RUN_B = 'tests/synthetic/review-run-B.md';
+const REVIEW_RUNS = [REVIEW_RUN_A, REVIEW_RUN_B];
+
+/**
+ * Read a fixture and return the `findings` array from its frontmatter.
+ *
+ * Fails the calling test (via assert) when parseDocument reports the
+ * document as invalid or when `findings` is not an array.
+ *
+ * @param {string} rel - Fixture path relative to ROOT.
+ * @returns {Array} The `findings` entries exactly as parsed.
+ */
+function loadFindings(rel) {
+  const text = readFileSync(join(ROOT, rel), 'utf-8');
+  const doc = parseDocument(text);
+  assert.ok(doc.valid, `frontmatter of ${rel} did not parse: ${(doc.errors || []).map(e => e.message).join(', ')}`);
+  const findings = doc.parsed.frontmatter?.findings;
+  assert.ok(Array.isArray(findings), `frontmatter.findings of ${rel} is not an array`);
+  return findings;
+}
+
+test('review determinism — Jaccard of synthetic review-run-A vs review-run-B meets SC7 threshold (0.833)', () => {
+  const findingsA = loadFindings(REVIEW_RUN_A);
+  const findingsB = loadFindings(REVIEW_RUN_B);
+  const sim = jaccardSimilarity(findingsA, findingsB);
+  assert.ok(
+    sim >= SC7_THRESHOLD,
+    `jaccardSimilarity(findingsA, findingsB) = ${sim} < ${SC7_THRESHOLD} (SC7 floor). ` +
+      `Fixtures may have drifted — recompute IDs via lib/parsers/finding-id.mjs.`,
+  );
+});
+
+test('review determinism — finding IDs are 40-char hex (parseFindingId valid)', () => {
+  for (const rel of REVIEW_RUNS) {
+    for (const id of loadFindings(rel)) {
+      assert.ok(
+        parseFindingId(id).valid,
+        `${rel}: ID ${JSON.stringify(id)} is not a 40-char lowercase hex string (parseFindingId rejected it)`,
+      );
+    }
+  }
+});
+
+test('review determinism — both fixtures contain at least 25 unique finding-IDs', () => {
+  for (const rel of REVIEW_RUNS) {
+    const uniqueCount = new Set(loadFindings(rel)).size;
+    assert.ok(
+      uniqueCount >= MIN_UNIQUE_FINDINGS,
+      `${rel}: < ${MIN_UNIQUE_FINDINGS} unique finding-IDs (got ${uniqueCount}). Synthetic fixtures must reflect a substantial review.`,
+    );
+  }
+});
+
+test('review determinism — no duplicate IDs within run', () => {
+  for (const rel of REVIEW_RUNS) {
+    const findings = loadFindings(rel);
+    const uniqueCount = new Set(findings).size;
+    assert.strictEqual(
+      uniqueCount,
+      findings.length,
+      `${rel}: contains duplicate finding-IDs (${findings.length} entries vs ${uniqueCount} unique)`,
+    );
+  }
+});
diff --git a/plugins/ultraplan-local/tests/synthetic/review-run-A.md b/plugins/ultraplan-local/tests/synthetic/review-run-A.md new file mode 100644 index 0000000..a7b6856 --- /dev/null +++ b/plugins/ultraplan-local/tests/synthetic/review-run-A.md @@ -0,0 +1,69 @@ +--- +type: ultrareview-synthetic +review_version: "1.0" +created: 2026-05-04 +task: "Add JWT authentication with refresh-token rotation" +slug: jwt-auth-synthetic +run_id: A +verdict: WARN +findings: + - 44b18cf6b84fcb23ef1d52682504c2edeed24f66 + - f7e307a427154c2c15df4c63eaff6fd846e075a7 + - 31fa81fa5bf9b84c70864ee09aa8d087870c473a + - bfc0e3a7c1a5b13dbdc6ed8325140100b02db45d + - be76c6dba12bfd9073b1737de5813e316a158dc6 + - f0928545e7c1dc48796fe857138fab7f100ce8c7 + - 4189ba4236119184017fd26735bfb582706994e9 + - 46f07246ff17c013740c0726b7be9a65fff10c67 + - 5501c54bda4a39df17d66938f4a7fe872e365a0f + - 0173116735f75aabab36ecec863cb429d2f30528 + - 8f7fc683dc78d3adea8d35221915839702869af0 + - ee986665d695ca46c9a7f0d5c38bab73e73450a9 + - d863b17426ddec54bf7624405f3b64e206a73ed7 + - 64ea0bbf43c44dbf0da53f25755e0112ce2eb08b + - 6971113644b777a8c164dfd8473739b03d1796be + - 65f6edb11fed982b921ff018bd0fb1dcd10a1703 + - 9133851cf557f5955301803479936733b296f125 + - ffb170a0d19e4afac6379e64d26485883267bea8 + - 89f990535da373f5e97a091e5bbbf47a777c13d6 + - 664d4ec53e90ef6d24525a85b8d4071bfb037da8 + - 137db625a1ee639698c9e095e25845ef25879599 + - 6e586f167fac4cd57dc8178ceb4ca265a37404dc + - 24671775282593381af4a8fa77eb3f7a36f9f84e + - 
71dbed32baf440d94f0ccaa6a997a6922cee7679 + - 5de9b2b26d03590845183d42387fcb22007b3f5d + - c9aca8c3a265e2f083d75ac6da3e6d67909091b9 + - 75f32c9d304b742af2a7bafc354ec3666e53c054 + - 6547dfd19035bc012a50c19f4321fcfc9535fec8 + - 7554bc48226406e85282c7daeaba75cc732f4b35 + - 4f48547385c2d343ee0994d825321e6e6b90c89d +--- + +# Synthetic review run A — JWT authentication with refresh-token rotation + +This fixture represents one synthesized run of `/ultrareview-local` on a +hand-calibrated brief. It is paired with `review-run-B.md` for the +`review-determinism.test.mjs` Jaccard floor (≥ 0.833). + +## How this fixture is used + +`tests/synthetic/review-determinism.test.mjs` reads the `findings` array from +this file's frontmatter and computes +`jaccardSimilarity(findingsA, findingsB)`. The test asserts the similarity is +at or above the SC7 brief threshold (0.833). + +This fixture is distinct from `tests/fixtures/ultrareview/review-run-A.md`, +which feeds the existing `tests/lib/review-determinism.test.mjs` against the +v1.0 SC4 floor (0.70). The synthetic pair pushes the floor higher per SC7. + +## Fixture math + +- A has 30 unique finding-IDs +- B has 30 unique finding-IDs +- Intersection (shared IDs): 28 +- Union: 32 +- Jaccard: 28/32 = 0.875 (above 0.833 floor) + +Each ID is the SHA-1 of a synthetic `file:line:rule_key` triple per +`lib/parsers/finding-id.mjs`. The shared 28 represent stable findings; the +2 unique-per-side represent paraphrase variation in `file:line` anchoring. 
diff --git a/plugins/ultraplan-local/tests/synthetic/review-run-B.md b/plugins/ultraplan-local/tests/synthetic/review-run-B.md new file mode 100644 index 0000000..78950a1 --- /dev/null +++ b/plugins/ultraplan-local/tests/synthetic/review-run-B.md @@ -0,0 +1,63 @@ +--- +type: ultrareview-synthetic +review_version: "1.0" +created: 2026-05-04 +task: "Add JWT authentication with refresh-token rotation" +slug: jwt-auth-synthetic +run_id: B +verdict: WARN +findings: + - 44b18cf6b84fcb23ef1d52682504c2edeed24f66 + - f7e307a427154c2c15df4c63eaff6fd846e075a7 + - 31fa81fa5bf9b84c70864ee09aa8d087870c473a + - bfc0e3a7c1a5b13dbdc6ed8325140100b02db45d + - be76c6dba12bfd9073b1737de5813e316a158dc6 + - f0928545e7c1dc48796fe857138fab7f100ce8c7 + - 4189ba4236119184017fd26735bfb582706994e9 + - 46f07246ff17c013740c0726b7be9a65fff10c67 + - 5501c54bda4a39df17d66938f4a7fe872e365a0f + - 0173116735f75aabab36ecec863cb429d2f30528 + - 8f7fc683dc78d3adea8d35221915839702869af0 + - ee986665d695ca46c9a7f0d5c38bab73e73450a9 + - d863b17426ddec54bf7624405f3b64e206a73ed7 + - 64ea0bbf43c44dbf0da53f25755e0112ce2eb08b + - 6971113644b777a8c164dfd8473739b03d1796be + - 65f6edb11fed982b921ff018bd0fb1dcd10a1703 + - 9133851cf557f5955301803479936733b296f125 + - ffb170a0d19e4afac6379e64d26485883267bea8 + - 89f990535da373f5e97a091e5bbbf47a777c13d6 + - 664d4ec53e90ef6d24525a85b8d4071bfb037da8 + - 137db625a1ee639698c9e095e25845ef25879599 + - 6e586f167fac4cd57dc8178ceb4ca265a37404dc + - 24671775282593381af4a8fa77eb3f7a36f9f84e + - 71dbed32baf440d94f0ccaa6a997a6922cee7679 + - 5de9b2b26d03590845183d42387fcb22007b3f5d + - c9aca8c3a265e2f083d75ac6da3e6d67909091b9 + - 75f32c9d304b742af2a7bafc354ec3666e53c054 + - 6547dfd19035bc012a50c19f4321fcfc9535fec8 + - a5fbe85476128bb67796ecf97a42065b6a0bf9c4 + - 19ec9d34e1d6560b56f885a5a12ce491354c4b40 +--- + +# Synthetic review run B — JWT authentication with refresh-token rotation + +Companion to `review-run-A.md`. See run A's body for the determinism +contract. 
+ +## Fixture math + +- A has 30 unique finding-IDs +- B has 30 unique finding-IDs +- Intersection (shared IDs): 28 +- Union: 32 +- Jaccard: 28/32 = 0.875 (above 0.833 floor) + +## Differences from run A + +- A's last 2 IDs come from `src/auth/jwt.ts:201:rule-1` and + `src/auth/refresh.ts:55:rule-3` +- B's last 2 IDs come from `src/auth/jwt.ts:202:rule-1` and + `src/auth/refresh.ts:56:rule-3` + +The off-by-one line anchoring models realistic post-edit drift between two +review runs against subtly different working trees.