test(ultraplan-local): add ngram-overlap node:test suite
This commit is contained in:
parent
4d541418ba
commit
491711119a
1 changed files with 281 additions and 0 deletions
281
plugins/ultraplan-local/scripts/ngram-overlap.test.mjs
Normal file
281
plugins/ultraplan-local/scripts/ngram-overlap.test.mjs
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
// node:test suite for scripts/ngram-overlap.mjs
|
||||
//
|
||||
// Run: node --test scripts/ngram-overlap.test.mjs
|
||||
//
|
||||
// Covers: identical text, disjoint text, partial overlap bands,
|
||||
// longest-run override, fenced-code stripping, short-source fallback,
|
||||
// markdown-emphasis stripping, fixture integration.
|
||||
|
||||
import { test } from 'node:test';
|
||||
import assert from 'node:assert/strict';
|
||||
import { execFileSync } from 'node:child_process';
|
||||
import { existsSync } from 'node:fs';
|
||||
import { analyze, tokenize, shingles, overlap, verdict, stripMarkdown } from './ngram-overlap.mjs';
|
||||
|
||||
// === Fixtures (inline prose to control word counts and overlap) ===
|
||||
|
||||
// 600+ word source on a generic technical topic (Claude Code hooks).
|
||||
// Reused across multiple tests with different drafts.
|
||||
// ~46-sentence fixture paragraph (600+ words). Built as a plain array
// expression rather than an IIFE — the intermediate binding was never
// referenced elsewhere, so the wrapper added nothing.
const SOURCE_LONG = [
  'Hooks in Claude Code allow you to intercept events emitted by the agent runtime',
  'These events fire at specific lifecycle points such as before a tool call runs',
  'or after the agent completes a turn or when a session starts up for the first time',
  'A hook is configured by adding an entry to the settings file under the hooks key',
  'Each hook entry binds a matcher pattern to a shell command that the runtime executes',
  'The matcher uses simple glob syntax to select which tool calls trigger the hook',
  'When a tool call matches the pattern the hook runs synchronously before the call proceeds',
  'A non-zero exit code from a hook script blocks the underlying tool call entirely',
  'This blocking behavior makes hooks useful for security policy enforcement and audit logging',
  'For example a pre-bash-executor hook can scan command strings against a denylist',
  'Hooks receive structured JSON input on standard input describing the event payload',
  'The schema includes the tool name the parameters and the working directory among other fields',
  'Hooks can emit JSON output on standard output to add additional context for the model',
  'Output is appended to the conversation as a system message before the next turn begins',
  'Plugin hooks live inside the plugin directory and apply only when the plugin is enabled',
  'User hooks live in the home directory under dot claude and apply across every project',
  'Project hooks live in the project root and apply only when working in that project',
  'Conflicts between hook layers resolve in a documented precedence order favoring user settings',
  'Hooks are written as plain executable scripts in any language that the system can run',
  'Common languages include shell python and node although any executable will work fine',
  'Best practice is to keep hooks fast and deterministic so they do not slow down the agent',
  'Slow hooks add latency to every tool call which compounds across long agent turns',
  'Hook scripts should also avoid making destructive changes during their execution',
  'Read-only checks fail safely while write operations from hooks are very hard to debug',
  'Testing hooks is straightforward by invoking them directly with the same input json',
  'Capture the output and exit code and verify they match the expected values',
  'Document hook behavior in the project readme so other contributors understand the constraints',
  'Hook misconfigurations often manifest as mysterious blocked tool calls during normal use',
  'Always include a clear error message in stderr when a hook intentionally blocks a call',
  'This makes debugging easier when the user wonders why their command did not run',
  'When designing a hook you should think first about what event you actually need to intercept',
  'Pre-tool-use events fire before any tool runs and can block dangerous operations early',
  'Post-tool-use events fire after a tool returns and can log results or trigger follow-up actions',
  'Session-start events fire when the agent begins a new conversation in a fresh context window',
  'Session-end events fire when the user closes the session and are useful for cleanup tasks',
  'Stop events fire whenever the agent finishes generating a response and yields back to the user',
  'Compaction events fire when the conversation history grows too large and must be summarized',
  'Each event type passes a different payload shape so you must read the schema documentation carefully',
  'A common pattern is to write a small dispatcher hook that routes events to language-specific handlers',
  'The dispatcher pattern keeps individual handlers simple and lets you add new ones without rewriting glue code',
  'Avoid putting business logic directly in the dispatcher because it becomes a bottleneck for testing',
  'Instead keep the dispatcher pure and delegate all real work to small focused single-purpose handler scripts',
  'Hook timeouts matter because slow handlers block the agent indefinitely until they return or error out',
  'Set a strict timeout in your handler implementation rather than relying on the runtime to kill it',
  'Use exit code two for hard errors and exit code zero for normal pass-through with no policy violation',
  'Reserve exit code one for soft warnings that should appear in the conversation but not block execution',
].join('. ') + '.';
|
||||
|
||||
// Count runs of Unicode letters/digits ("words") in a string.
// NOTE(review): this helper appears unused within this file — confirm
// before removing.
const wordCount = (s) => {
  const matches = s.match(/[\p{L}\p{N}]+/gu);
  return matches === null ? 0 : matches.length;
};
|
||||
|
||||
// === Unit tests on pure functions ===
|
||||
|
||||
// Punctuation and hyphens act as separators; case is folded.
test('tokenize: lowercases and splits on word boundaries', () => {
  assert.deepEqual(
    tokenize('Hello, World! Foo-bar.'),
    ['hello', 'world', 'foo', 'bar'],
  );
});
|
||||
|
||||
// Fixed: the original input 'café 123' contained only plain ASCII digits,
// so the "full-width digits normalize to ASCII" claim was never exercised
// and the test passed even if tokenize() skipped normalization entirely.
// Use real full-width digits (U+FF11..U+FF13) and a decomposed e + U+0301
// so the assertion fails unless NFKC actually runs.
test('tokenize: NFKC normalizes', () => {
  // '\uFF11\uFF12\uFF13' are full-width digits; 'cafe\u0301' is a
  // decomposed 'é'. NFKC folds both to their canonical/compat forms.
  const tokens = tokenize('cafe\u0301 \uFF11\uFF12\uFF13');
  assert.deepEqual(tokens, ['café', '123']);
});
|
||||
|
||||
// Fewer tokens than the window size yields no shingles at all.
test('shingles: returns empty when input shorter than n', () => {
  assert.deepEqual(shingles(['a', 'b', 'c'], 5), []);
});

// A full window slides one token at a time, joining with single spaces.
test('shingles: returns sliding window of size n', () => {
  const windows = shingles(['a', 'b', 'c', 'd', 'e'], 3);
  assert.deepEqual(windows, ['a b c', 'b c d', 'c d e']);
});
||||
|
||||
// --- stripMarkdown behavior ---

test('stripMarkdown: removes fenced code blocks', () => {
  const result = stripMarkdown('Before\n```js\nconst x = 1;\n```\nAfter');
  assert.ok(!result.includes('const x'));
  assert.ok(result.includes('Before'));
  assert.ok(result.includes('After'));
});

test('stripMarkdown: removes inline code', () => {
  const result = stripMarkdown('Use `npm install` to set up.');
  assert.ok(!result.includes('npm install'));
});

test('stripMarkdown: removes heading markers but keeps text', () => {
  const result = stripMarkdown('# Title\nBody');
  assert.ok(!result.includes('#'));
  assert.ok(result.includes('Title'));
});

test('stripMarkdown: removes emphasis markers', () => {
  const result = stripMarkdown('This **is bold** and *italic* and ~~strike~~');
  // Marker characters are gone; the emphasized text itself survives.
  for (const marker of ['**', '~~']) {
    assert.ok(!result.includes(marker));
  }
  for (const kept of ['is bold', 'italic']) {
    assert.ok(result.includes(kept));
  }
});

test('stripMarkdown: links keep text only', () => {
  const result = stripMarkdown('See [docs](https://example.com) for info.');
  assert.ok(!result.includes('https'));
  assert.ok(result.includes('docs'));
});

test('stripMarkdown: removes YAML frontmatter at start', () => {
  const result = stripMarkdown('---\nname: foo\n---\nBody text here');
  assert.ok(!result.includes('name: foo'));
  assert.ok(result.includes('Body text here'));
});
|
||||
|
||||
// === Overlap behavior ===
|
||||
|
||||
// --- overlap metrics ---

test('overlap: identical token streams give containment 1.0', () => {
  const stream = tokenize(SOURCE_LONG);
  const metrics = overlap(stream, stream, 5);
  assert.equal(metrics.containment, 1);
  assert.ok(metrics.longestRun > 15);
});

test('overlap: completely disjoint streams give containment 0', () => {
  const left = ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa'];
  const right = ['xray', 'yankee', 'zulu', 'whiskey', 'victor', 'uniform', 'tango', 'sierra', 'romeo', 'quebec'];
  const metrics = overlap(left, right, 5);
  assert.equal(metrics.containment, 0);
  assert.equal(metrics.longestRun, 0);
});
|
||||
|
||||
// === Verdict bands ===
|
||||
|
||||
test('verdict 1: identical text → rejected (containment 1.0)', () => {
  const report = analyze(SOURCE_LONG, SOURCE_LONG);
  assert.equal(report.verdict, 'rejected');
  assert.equal(report.containment, 1);
});

test('verdict 2: completely disjoint text → accepted (low containment, low run)', () => {
  // Build a draft of unrelated words ≥300 to skip too-short fallback
  const draft = Array.from({ length: 350 }, (_, i) => `uniqueword${i}`).join(' ');
  const report = analyze(draft, SOURCE_LONG);
  assert.equal(report.verdict, 'accepted');
  assert.equal(report.containment, 0);
  assert.equal(report.longestRun, 0);
});
|
||||
|
||||
test('verdict 3: partial overlap (mid-band) → needs-review', () => {
  // Construct draft where ~25% of 5-grams match source but no run is long.
  // Strategy: alternate 6-token source chunks with 2-token padding. Each
  // chunk yields exactly 2 source 5-grams (longestRun = 2). Need both
  // draft and source ≥500 tokens to keep shingleSize=5 (no fallback).
  // 65 chunks × 8 = 520 draft tokens; SOURCE_LONG is ~600 tokens.
  const sourceTokens = tokenize(SOURCE_LONG);
  const pieces = [];
  let padCounter = 0;
  for (let chunk = 0; chunk < 65; chunk++) {
    pieces.push(...sourceTokens.slice(chunk * 6, chunk * 6 + 6));
    pieces.push(`padword${padCounter++}`, `padword${padCounter++}`);
  }
  const result = analyze(pieces.join(' '), SOURCE_LONG);
  assert.equal(result.shingleSize, 5,
    `precondition: expected shingleSize=5 (no fallback), got ${result.shingleSize}`);
  assert.equal(result.verdict, 'needs-review',
    `expected needs-review, got ${result.verdict} (containment=${result.containment.toFixed(3)}, longestRun=${result.longestRun})`);
});
|
||||
|
||||
test('verdict 4: high overlap → rejected (containment ≥0.35)', () => {
  // Draft is 60% source + 40% padding
  const head = tokenize(SOURCE_LONG).slice(0, 200);
  const tail = Array.from({ length: 130 }, (_, i) => `pad${i}`);
  const result = analyze([...head, ...tail].join(' '), SOURCE_LONG);
  assert.equal(result.verdict, 'rejected',
    `expected rejected, got ${result.verdict} (containment=${result.containment.toFixed(3)}, longestRun=${result.longestRun})`);
});
|
||||
|
||||
test('verdict 5: long verbatim run triggers rejection even with low containment', () => {
  // Mostly unique words (low containment) but one 25-word verbatim sentence
  // from source — longestRun ≥15 should reject.
  const verbatim = tokenize(SOURCE_LONG).slice(50, 75).join(' ');
  const filler = Array.from({ length: 500 }, (_, i) => `unique${i}`);
  const draft = [
    filler.slice(0, 250).join(' '),
    verbatim,
    filler.slice(250).join(' '),
  ].join(' ');
  const result = analyze(draft, SOURCE_LONG);
  assert.equal(result.verdict, 'rejected',
    `expected rejected, got ${result.verdict} (containment=${result.containment.toFixed(3)}, longestRun=${result.longestRun})`);
  assert.ok(result.longestRun >= 15, `longestRun ${result.longestRun} should be ≥15`);
});
|
||||
|
||||
test('verdict 6: fenced code block in source → stripped → not counted as match', () => {
  const draftWords = Array.from({ length: 350 }, (_, i) => `uniq${i}`);
  const draft = draftWords.join(' ');
  // Source with a fenced code block containing some of the draft's words
  const sourceWithCode = SOURCE_LONG + '\n```\n' + draftWords.slice(0, 100).join(' ') + '\n```\n';
  const result = analyze(draft, sourceWithCode);
  // The code-block words should be stripped from source, so the draft remains disjoint
  assert.equal(result.containment, 0,
    `code-block words should be stripped (got containment ${result.containment})`);
});
|
||||
|
||||
test('verdict 7: short draft (<300 words) → needs-review with too-short reason', () => {
  const shortDraft = 'This is a short note. It has fewer than three hundred words. Just a quick sketch.';
  const result = analyze(shortDraft, SOURCE_LONG);
  assert.equal(result.verdict, 'needs-review');
  assert.equal(result.reason, 'too-short-to-score');
});
|
||||
|
||||
test('verdict 8: markdown emphasis is stripped before tokenization', () => {
  // Build a draft of unique tokens then wrap parts in **bold** and *italic*
  const baseWords = Array.from({ length: 350 }, (_, i) => `tok${i}`);
  const decorate = (w, i) => {
    if (i % 5 === 0) return `**${w}**`;
    if (i % 7 === 0) return `*${w}*`;
    return w;
  };
  const plainResult = analyze(baseWords.join(' '), SOURCE_LONG);
  const wrappedResult = analyze(baseWords.map(decorate).join(' '), SOURCE_LONG);
  // After stripping, both should yield the same containment / longestRun
  assert.equal(plainResult.containment, wrappedResult.containment,
    'markdown emphasis should not change containment after stripping');
  assert.equal(plainResult.longestRun, wrappedResult.longestRun,
    'markdown emphasis should not change longestRun after stripping');
});
|
||||
|
||||
// === Integration: fixtures (Step 5 will create these; skip if missing) ===
|
||||
|
||||
const FIXTURE_DIR = 'tests/fixtures/skill-factory';
const SCRIPT = 'scripts/ngram-overlap.mjs';

// Run the CLI entry point as a child process on a draft/source file pair
// and parse the JSON report it prints to stdout. Throws if the process
// exits non-zero or the output is not valid JSON.
function runCli(draft, source) {
  const stdout = execFileSync('node', [SCRIPT, draft, source], { encoding: 'utf8' });
  return JSON.parse(stdout);
}
|
||||
|
||||
// One data-driven registration per fixture pair. Each iteration produces a
// test with the same name, skip condition, and failure message as before;
// a pair is skipped when its draft fixture has not been created yet.
for (const expected of ['accepted', 'needs-review', 'rejected']) {
  const draftPath = `${FIXTURE_DIR}/draft-${expected}.md`;
  const sourcePath = `${FIXTURE_DIR}/source-${expected}.md`;
  test(`integration: ${expected} fixture pair → verdict ${expected}`,
    { skip: !existsSync(draftPath) },
    () => {
      const result = runCli(draftPath, sourcePath);
      assert.equal(result.verdict, expected,
        `expected ${expected}, got ${result.verdict} (containment=${result.containment.toFixed(3)}, longestRun=${result.longestRun})`);
    });
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue