From 491711119ab446c58a8d9a30bb12a4fadfed84b4 Mon Sep 17 00:00:00 2001
From: Kjell Tore Guttormsen
Date: Sat, 18 Apr 2026 15:07:25 +0200
Subject: [PATCH] test(ultraplan-local): add ngram-overlap node:test suite

---
Review note: line structure restored and line counts updated by hand. The blob
id on the `index` line predates these edits; regenerate the patch with
`git format-patch` before relying on a three-way apply.

 .../scripts/ngram-overlap.test.mjs            | 304 ++++++++++++++++++
 1 file changed, 304 insertions(+)
 create mode 100644 plugins/ultraplan-local/scripts/ngram-overlap.test.mjs

diff --git a/plugins/ultraplan-local/scripts/ngram-overlap.test.mjs b/plugins/ultraplan-local/scripts/ngram-overlap.test.mjs
new file mode 100644
index 0000000..b4bf37a
--- /dev/null
+++ b/plugins/ultraplan-local/scripts/ngram-overlap.test.mjs
@@ -0,0 +1,304 @@
+// node:test suite for scripts/ngram-overlap.mjs
+//
+// Run: node --test scripts/ngram-overlap.test.mjs
+//
+// Covers: identical text, disjoint text, partial overlap bands,
+// longest-run override, fenced-code stripping, short-source fallback,
+// markdown-emphasis stripping, fixture integration.
+
+import { test } from 'node:test';
+import assert from 'node:assert/strict';
+import { execFileSync } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { analyze, tokenize, shingles, overlap, verdict, stripMarkdown } from './ngram-overlap.mjs';
+
+// === Fixtures (inline prose to control word counts and overlap) ===
+
+// 600+ word source on a generic technical topic (Claude Code hooks).
+// Reused across multiple tests with different drafts.
+const SOURCE_LONG = (() => {
+  const sentences = [
+    'Hooks in Claude Code allow you to intercept events emitted by the agent runtime',
+    'These events fire at specific lifecycle points such as before a tool call runs',
+    'or after the agent completes a turn or when a session starts up for the first time',
+    'A hook is configured by adding an entry to the settings file under the hooks key',
+    'Each hook entry binds a matcher pattern to a shell command that the runtime executes',
+    'The matcher uses simple glob syntax to select which tool calls trigger the hook',
+    'When a tool call matches the pattern the hook runs synchronously before the call proceeds',
+    'A non-zero exit code from a hook script blocks the underlying tool call entirely',
+    'This blocking behavior makes hooks useful for security policy enforcement and audit logging',
+    'For example a pre-bash-executor hook can scan command strings against a denylist',
+    'Hooks receive structured JSON input on standard input describing the event payload',
+    'The schema includes the tool name the parameters and the working directory among other fields',
+    'Hooks can emit JSON output on standard output to add additional context for the model',
+    'Output is appended to the conversation as a system message before the next turn begins',
+    'Plugin hooks live inside the plugin directory and apply only when the plugin is enabled',
+    'User hooks live in the home directory under dot claude and apply across every project',
+    'Project hooks live in the project root and apply only when working in that project',
+    'Conflicts between hook layers resolve in a documented precedence order favoring user settings',
+    'Hooks are written as plain executable scripts in any language that the system can run',
+    'Common languages include shell python and node although any executable will work fine',
+    'Best practice is to keep hooks fast and deterministic so they do not slow down the agent',
+    'Slow hooks add latency to every tool call which compounds across long agent turns',
+    'Hook scripts should also avoid making destructive changes during their execution',
+    'Read-only checks fail safely while write operations from hooks are very hard to debug',
+    'Testing hooks is straightforward by invoking them directly with the same input json',
+    'Capture the output and exit code and verify they match the expected values',
+    'Document hook behavior in the project readme so other contributors understand the constraints',
+    'Hook misconfigurations often manifest as mysterious blocked tool calls during normal use',
+    'Always include a clear error message in stderr when a hook intentionally blocks a call',
+    'This makes debugging easier when the user wonders why their command did not run',
+    'When designing a hook you should think first about what event you actually need to intercept',
+    'Pre-tool-use events fire before any tool runs and can block dangerous operations early',
+    'Post-tool-use events fire after a tool returns and can log results or trigger follow-up actions',
+    'Session-start events fire when the agent begins a new conversation in a fresh context window',
+    'Session-end events fire when the user closes the session and are useful for cleanup tasks',
+    'Stop events fire whenever the agent finishes generating a response and yields back to the user',
+    'Compaction events fire when the conversation history grows too large and must be summarized',
+    'Each event type passes a different payload shape so you must read the schema documentation carefully',
+    'A common pattern is to write a small dispatcher hook that routes events to language-specific handlers',
+    'The dispatcher pattern keeps individual handlers simple and lets you add new ones without rewriting glue code',
+    'Avoid putting business logic directly in the dispatcher because it becomes a bottleneck for testing',
+    'Instead keep the dispatcher pure and delegate all real work to small focused single-purpose handler scripts',
+    'Hook timeouts matter because slow handlers block the agent indefinitely until they return or error out',
+    'Set a strict timeout in your handler implementation rather than relying on the runtime to kill it',
+    'Use exit code two for hard errors and exit code zero for normal pass-through with no policy violation',
+    'Reserve exit code one for soft warnings that should appear in the conversation but not block execution',
+  ];
+  return sentences.join('. ') + '.';
+})();
+
+// Counts runs of Unicode letters/digits. A local approximation of a word
+// count, used only to sanity-check fixture sizes.
+const wordCount = (s) => (s.match(/[\p{L}\p{N}]+/gu) || []).length;
+
+// Formats the overlap metrics appended to assertion messages. A result with
+// no numeric containment is printed as-is, so building the message can never
+// throw and mask the assertion it belongs to.
+const metrics = (result) => {
+  const { containment, longestRun } = result;
+  const shown = typeof containment === 'number' ? containment.toFixed(3) : String(containment);
+  return `(containment=${shown}, longestRun=${longestRun})`;
+};
+
+test('fixture: SOURCE_LONG is at least 500 words', () => {
+  // The partial-overlap test states it needs a source of 500+ tokens to stay on 5-gram shingles.
+  const words = wordCount(SOURCE_LONG);
+  assert.ok(words >= 500, `SOURCE_LONG has ${words} words, expected at least 500`);
+});
+
+// === Unit tests on pure functions ===
+
+test('tokenize: lowercases and splits on word boundaries', () => {
+  const tokens = tokenize('Hello, World! Foo-bar.');
+  assert.deepEqual(tokens, ['hello', 'world', 'foo', 'bar']);
+});
+
+test('tokenize: NFKC normalizes', () => {
+  // Full-width digits (U+FF11..U+FF13) normalize to ASCII. They are written as
+  // escapes so that a tool which normalizes this file cannot silently turn the
+  // input into plain ASCII and leave the test checking nothing.
+  const tokens = tokenize('café \uFF11\uFF12\uFF13');
+  assert.deepEqual(tokens, ['café', '123']);
+});
+
+test('shingles: returns empty when input shorter than n', () => {
+  assert.deepEqual(shingles(['a', 'b', 'c'], 5), []);
+});
+
+test('shingles: returns sliding window of size n', () => {
+  const result = shingles(['a', 'b', 'c', 'd', 'e'], 3);
+  assert.deepEqual(result, ['a b c', 'b c d', 'c d e']);
+});
+
+test('stripMarkdown: removes fenced code blocks', () => {
+  const input = 'Before\n```js\nconst x = 1;\n```\nAfter';
+  const stripped = stripMarkdown(input);
+  assert.ok(!stripped.includes('const x'));
+  assert.ok(stripped.includes('Before'));
+  assert.ok(stripped.includes('After'));
+});
+
+test('stripMarkdown: removes inline code', () => {
+  const stripped = stripMarkdown('Use `npm install` to set up.');
+  assert.ok(!stripped.includes('npm install'));
+});
+
+test('stripMarkdown: removes heading markers but keeps text', () => {
+  const stripped = stripMarkdown('# Title\nBody');
+  assert.ok(!stripped.includes('#'));
+  assert.ok(stripped.includes('Title'));
+});
+
+test('stripMarkdown: removes emphasis markers', () => {
+  const stripped = stripMarkdown('This **is bold** and *italic* and ~~strike~~');
+  assert.ok(!stripped.includes('**'));
+  assert.ok(!stripped.includes('~~'));
+  assert.ok(stripped.includes('is bold'));
+  assert.ok(stripped.includes('italic'));
+});
+
+test('stripMarkdown: links keep text only', () => {
+  const stripped = stripMarkdown('See [docs](https://example.com) for info.');
+  assert.ok(!stripped.includes('https'));
+  assert.ok(stripped.includes('docs'));
+});
+
+test('stripMarkdown: removes YAML frontmatter at start', () => {
+  const input = '---\nname: foo\n---\nBody text here';
+  const stripped = stripMarkdown(input);
+  assert.ok(!stripped.includes('name: foo'));
+  assert.ok(stripped.includes('Body text here'));
+});
+
+// === Overlap behavior ===
+
+test('overlap: identical token streams give containment 1.0', () => {
+  const tokens = tokenize(SOURCE_LONG);
+  const m = overlap(tokens, tokens, 5);
+  assert.equal(m.containment, 1);
+  assert.ok(m.longestRun > 15);
+});
+
+test('overlap: completely disjoint streams give containment 0', () => {
+  const a = ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa'];
+  const b = ['xray', 'yankee', 'zulu', 'whiskey', 'victor', 'uniform', 'tango', 'sierra', 'romeo', 'quebec'];
+  const m = overlap(a, b, 5);
+  assert.equal(m.containment, 0);
+  assert.equal(m.longestRun, 0);
+});
+
+// === Verdict bands ===
+
+test('verdict 1: identical text → rejected (containment 1.0)', () => {
+  const result = analyze(SOURCE_LONG, SOURCE_LONG);
+  assert.equal(result.verdict, 'rejected');
+  assert.equal(result.containment, 1);
+});
+
+test('verdict 2: completely disjoint text → accepted (low containment, low run)', () => {
+  // Build a draft of unrelated words ≥300 to skip too-short fallback
+  const draftWords = [];
+  for (let i = 0; i < 350; i++) {
+    draftWords.push(`uniqueword${i}`);
+  }
+  const draft = draftWords.join(' ');
+  const result = analyze(draft, SOURCE_LONG);
+  assert.equal(result.verdict, 'accepted');
+  assert.equal(result.containment, 0);
+  assert.equal(result.longestRun, 0);
+});
+
+test('verdict 3: partial overlap (mid-band) → needs-review', () => {
+  // Construct draft where ~25% of 5-grams match source but no run is long.
+  // Strategy: alternate 6-token source chunks with 2-token padding. Each
+  // chunk yields exactly 2 source 5-grams (longestRun = 2). Need both
+  // draft and source ≥500 tokens to keep shingleSize=5 (no fallback).
+  // 65 chunks × 8 = 520 draft tokens; SOURCE_LONG is ~600 tokens.
+  const sourceTokens = tokenize(SOURCE_LONG);
+  const draftWords = [];
+  let pad = 0;
+  for (let i = 0; i < 65; i++) {
+    draftWords.push(...sourceTokens.slice(i * 6, i * 6 + 6));
+    draftWords.push(`padword${pad++}`, `padword${pad++}`);
+  }
+  const draft = draftWords.join(' ');
+  const result = analyze(draft, SOURCE_LONG);
+  assert.equal(result.shingleSize, 5,
+    `precondition: expected shingleSize=5 (no fallback), got ${result.shingleSize}`);
+  assert.equal(result.verdict, 'needs-review',
+    `expected needs-review, got ${result.verdict} ${metrics(result)}`);
+});
+
+test('verdict 4: high overlap → rejected (containment ≥0.35)', () => {
+  // Draft is 60% source + 40% padding
+  const sourceTokens = tokenize(SOURCE_LONG);
+  const sourcePart = sourceTokens.slice(0, 200);
+  const padding = [];
+  for (let i = 0; i < 130; i++) padding.push(`pad${i}`);
+  const draft = sourcePart.concat(padding).join(' ');
+  const result = analyze(draft, SOURCE_LONG);
+  assert.equal(result.verdict, 'rejected',
+    `expected rejected, got ${result.verdict} ${metrics(result)}`);
+});
+
+test('verdict 5: long verbatim run triggers rejection even with low containment', () => {
+  // Mostly unique words (low containment) but one 25-word verbatim sentence
+  // from source — longestRun ≥15 should reject.
+  const verbatim = tokenize(SOURCE_LONG).slice(50, 75).join(' ');
+  const padding = [];
+  for (let i = 0; i < 500; i++) padding.push(`unique${i}`);
+  const draft = padding.slice(0, 250).join(' ') + ' ' + verbatim + ' ' + padding.slice(250).join(' ');
+  const result = analyze(draft, SOURCE_LONG);
+  assert.equal(result.verdict, 'rejected',
+    `expected rejected, got ${result.verdict} ${metrics(result)}`);
+  assert.ok(result.longestRun >= 15, `longestRun ${result.longestRun} should be ≥15`);
+});
+
+test('verdict 6: fenced code block in source → stripped → not counted as match', () => {
+  const draftBody = [];
+  for (let i = 0; i < 350; i++) draftBody.push(`uniq${i}`);
+  const draft = draftBody.join(' ');
+  // Source with a fenced code block containing some of the draft's words
+  const sourceWithCode = SOURCE_LONG + '\n```\n' + draftBody.slice(0, 100).join(' ') + '\n```\n';
+  const result = analyze(draft, sourceWithCode);
+  // The code-block words should be stripped from source, so the draft remains disjoint
+  assert.equal(result.containment, 0,
+    `code-block words should be stripped (got containment ${result.containment})`);
+});
+
+test('verdict 7: short draft (<300 words) → needs-review with too-short reason', () => {
+  const draft = 'This is a short note. It has fewer than three hundred words. Just a quick sketch.';
+  const result = analyze(draft, SOURCE_LONG);
+  assert.equal(result.verdict, 'needs-review');
+  assert.equal(result.reason, 'too-short-to-score');
+});
+
+test('verdict 8: markdown emphasis is stripped before tokenization', () => {
+  // Build a draft of unique tokens then wrap parts in **bold** and *italic*
+  const baseWords = [];
+  for (let i = 0; i < 350; i++) baseWords.push(`tok${i}`);
+  const plain = baseWords.join(' ');
+  const wrapped = baseWords
+    .map((w, i) => (i % 5 === 0 ? `**${w}**` : i % 7 === 0 ? `*${w}*` : w))
+    .join(' ');
+  const plainResult = analyze(plain, SOURCE_LONG);
+  const wrappedResult = analyze(wrapped, SOURCE_LONG);
+  // After stripping, both should yield the same containment / longestRun
+  assert.equal(plainResult.containment, wrappedResult.containment,
+    'markdown emphasis should not change containment after stripping');
+  assert.equal(plainResult.longestRun, wrappedResult.longestRun,
+    'markdown emphasis should not change longestRun after stripping');
+});
+
+// === Integration: fixtures (Step 5 will create these; skip if missing) ===
+
+// Both locations are resolved against this file instead of the working
+// directory. The cwd-relative paths used before only resolved when the suite
+// was launched as shown in the Run line above; from anywhere else the fixtures
+// looked missing and the integration tests were skipped without notice.
+const FIXTURE_DIR = new URL('../tests/fixtures/skill-factory/', import.meta.url);
+const SCRIPT = fileURLToPath(new URL('./ngram-overlap.mjs', import.meta.url));
+
+// Absolute filesystem path of one file inside FIXTURE_DIR.
+const fixturePath = (name) => fileURLToPath(new URL(name, FIXTURE_DIR));
+
+// Runs the CLI on a draft/source pair and returns its parsed JSON output.
+// Uses the interpreter running this suite instead of whichever `node` is on PATH.
+function runCli(draft, source) {
+  const out = execFileSync(process.execPath, [SCRIPT, draft, source], { encoding: 'utf8' });
+  return JSON.parse(out);
+}
+
+// One test per verdict band; each is skipped until its draft fixture exists.
+for (const expected of ['accepted', 'needs-review', 'rejected']) {
+  const draft = fixturePath(`draft-${expected}.md`);
+  const source = fixturePath(`source-${expected}.md`);
+  const skip = existsSync(draft) ? false : 'draft fixture not created yet';
+  test(`integration: ${expected} fixture pair → verdict ${expected}`, { skip }, () => {
+    const result = runCli(draft, source);
+    assert.equal(result.verdict, expected,
+      `expected ${expected}, got ${result.verdict} ${metrics(result)}`);
+  });
+}