From 4d541418ba3efff8f8b888ad18de2fbc65230846 Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Sat, 18 Apr 2026 15:03:36 +0200 Subject: [PATCH] feat(ultraplan-local): add scripts/ngram-overlap.mjs IP-hygiene utility --- .../ultraplan-local/scripts/ngram-overlap.mjs | 251 ++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 plugins/ultraplan-local/scripts/ngram-overlap.mjs diff --git a/plugins/ultraplan-local/scripts/ngram-overlap.mjs b/plugins/ultraplan-local/scripts/ngram-overlap.mjs new file mode 100644 index 0000000..f2bb340 --- /dev/null +++ b/plugins/ultraplan-local/scripts/ngram-overlap.mjs @@ -0,0 +1,251 @@ +#!/usr/bin/env node +// ngram-overlap.mjs — IP-hygiene check for skill-factory drafts. +// +// Computes word-n-gram containment similarity between a draft and source, +// plus a longest-consecutive-shingle-run signal. Verdict drives whether the +// draft passes IP-hygiene (`accepted`), warrants human review +// (`needs-review`), or must be rejected as too-close-to-source (`rejected`). +// +// Algorithm (research brief 01 §Recommendation): +// - Word 5-gram containment: |shingles(draft) ∩ shingles(source)| / |shingles(draft)| +// - Longest-run secondary signal: max consecutive draft shingles also in source +// - Verdict thresholds: +// containment <0.15 AND longestRun <8 → accepted +// containment ≥0.35 OR longestRun ≥15 → rejected +// otherwise → needs-review +// - Short-text fallback: shingles n=4 when min(words) <500; verdict +// `needs-review` with `reason: too-short-to-score` when min(words) <300. +// +// Pure Node stdlib. No npm dependencies. 
//
// CLI:
//   node scripts/ngram-overlap.mjs <draft-path> <source-path>
//   node scripts/ngram-overlap.mjs --help

import { realpathSync } from 'node:fs';
import { readFile } from 'node:fs/promises';
import { argv, exit, stdout, stderr } from 'node:process';
import { pathToFileURL } from 'node:url';

// === Constants (research-backed defaults; see research brief 01 §Open Questions) ===
export const SHINGLE_N_DEFAULT = 5;
export const SHINGLE_N_FALLBACK = 4;
export const SHORT_FALLBACK_THRESHOLD = 500;
export const TOO_SHORT_THRESHOLD = 300;
export const ACCEPT_CONTAINMENT_MAX = 0.15;
export const REJECT_CONTAINMENT_MIN = 0.35;
export const ACCEPT_RUN_MAX = 8;
export const REJECT_RUN_MIN = 15;

const USAGE = `Usage: node scripts/ngram-overlap.mjs <draft-path> <source-path>

Computes word-n-gram containment overlap of draft against source.
Outputs JSON to stdout. Exit 0 on success, 1 on I/O error, 2 on usage error.

Verdict bands:
  accepted      containment <${ACCEPT_CONTAINMENT_MAX} AND longestRun <${ACCEPT_RUN_MAX}
  needs-review  between bands, or min words <${TOO_SHORT_THRESHOLD}
  rejected      containment >=${REJECT_CONTAINMENT_MIN} OR longestRun >=${REJECT_RUN_MIN}
`;

// === Markdown stripping (research brief 01 §step 2) ===

/**
 * Remove Markdown syntax so that only prose remains for tokenization.
 *
 * Strict order: frontmatter, fenced code, inline code, block quotes, images,
 * links, leftover image brackets, emphasis, headings, hr, table pipes.
 * Images are handled BEFORE links: an image is a link with a leading `!`,
 * so stripping links first would leave a stray `!` in front of the alt text.
 *
 * @param {string} text - Raw Markdown.
 * @returns {string} Text with Markdown markers removed (content of code spans
 *   and code fences is replaced by a single space).
 */
export function stripMarkdown(text) {
  return text
    // YAML frontmatter (only at file start); tolerate CRLF line endings.
    .replace(/^---\r?\n[\s\S]*?\r?\n---(?:\r?\n)?/, '')
    // Fenced code blocks ```lang ... ```
    .replace(/```[\s\S]*?```/g, ' ')
    // Inline code `...`
    .replace(/`[^`\n]*`/g, ' ')
    // Block quotes (line-leading >)
    .replace(/^>\s?/gm, '')
    // Inline images ![alt](url) → alt
    .replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1')
    // Markdown links [text](url) → text
    .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
    // Remaining bang-prefixed brackets ![alt] → alt
    .replace(/!\[([^\]]*)\]/g, '$1')
    // Emphasis: ** __ * _ ~~ (anchored by char)
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    .replace(/__([^_]+)__/g, '$1')
    .replace(/\*([^*\n]+)\*/g, '$1')
    .replace(/_([^_\n]+)_/g, '$1')
    .replace(/~~([^~]+)~~/g, '$1')
    // Heading markers (line-leading #+)
    .replace(/^#{1,6}\s+/gm, '')
    // Horizontal rules
    .replace(/^[-*_]{3,}\s*$/gm, ' ')
    // Table pipes
    .replace(/\|/g, ' ');
}

// === Tokenization ===

/**
 * Turn Markdown text into a list of lowercase word tokens.
 *
 * The text is stripped of Markdown, NFKC-normalized and lowercased; tokens
 * are the maximal runs of Unicode letters/numbers.
 *
 * @param {string} text - Raw Markdown.
 * @returns {string[]} Tokens in document order (empty array when none).
 */
export function tokenize(text) {
  const normalized = stripMarkdown(text).normalize('NFKC').toLowerCase();
  // String#match with /g returns null (not []) when nothing matches.
  return normalized.match(/[\p{L}\p{N}]+/gu) ?? [];
}

// === Shingles (n-grams of words) ===

/**
 * Build the word n-grams ("shingles") of a token list.
 *
 * @param {string[]} tokens - Word tokens.
 * @param {number} n - Shingle size; must be a positive integer.
 * @returns {string[]} One space-joined shingle per start position, in order;
 *   empty when there are fewer than `n` tokens.
 * @throws {RangeError} When `n` is not a positive integer (a size of 0 would
 *   otherwise produce empty-string shingles that match everything).
 */
export function shingles(tokens, n) {
  if (!Number.isInteger(n) || n < 1) {
    throw new RangeError(`shingle size n must be a positive integer, got ${n}`);
  }
  if (tokens.length < n) return [];
  return Array.from({ length: tokens.length - n + 1 }, (_, start) =>
    tokens.slice(start, start + n).join(' '),
  );
}

// === Overlap metrics ===

/**
 * Compute containment and longest-run overlap of a draft against a source.
 *
 * Containment = (draft shingles also present in source) / (draft shingles).
 * It is asymmetric: it measures how much of the draft is reused from the
 * source. Every draft shingle occurrence counts, including repeats.
 * Longest-run = max number of consecutive draft shingles also in source.
 *
 * @param {string[]} draftTokens - Tokens of the draft.
 * @param {string[]} sourceTokens - Tokens of the source.
 * @param {number} n - Shingle size.
 * @returns {{containment: number, longestRun: number, draftShingleCount: number, sharedCount: number}}
 *   All zeros when the draft yields no shingles.
 */
export function overlap(draftTokens, sourceTokens, n) {
  const draftShingles = shingles(draftTokens, n);
  if (draftShingles.length === 0) {
    return { containment: 0, longestRun: 0, draftShingleCount: 0, sharedCount: 0 };
  }
  const sourceSet = new Set(shingles(sourceTokens, n));
  let shared = 0;
  let longest = 0;
  let current = 0;
  for (const shingle of draftShingles) {
    if (sourceSet.has(shingle)) {
      shared += 1;
      current += 1;
      longest = Math.max(longest, current);
    } else {
      current = 0;
    }
  }
  return {
    containment: shared / draftShingles.length,
    longestRun: longest,
    draftShingleCount: draftShingles.length,
    sharedCount: shared,
  };
}

// === Verdict dispatch ===

/**
 * Map overlap metrics to a verdict band.
 *
 * - `rejected`: containment >= rejectContainmentMin OR longestRun >= rejectRunMin
 * - `accepted`: containment < acceptContainmentMax AND longestRun < acceptRunMax
 * - `needs-review`: everything in between
 *
 * @param {{containment: number, longestRun: number}} metrics
 * @param {object} [opts] - Threshold overrides; each defaults to the
 *   corresponding module constant.
 * @param {number} [opts.acceptContainmentMax]
 * @param {number} [opts.rejectContainmentMin]
 * @param {number} [opts.acceptRunMax]
 * @param {number} [opts.rejectRunMin]
 * @returns {{verdict: 'accepted'|'needs-review'|'rejected', reasons: string[]}}
 *   `reasons` names every threshold that triggered the verdict (empty for
 *   `accepted`).
 */
export function verdict(metrics, opts = {}) {
  const {
    acceptContainmentMax = ACCEPT_CONTAINMENT_MAX,
    rejectContainmentMin = REJECT_CONTAINMENT_MIN,
    acceptRunMax = ACCEPT_RUN_MAX,
    rejectRunMin = REJECT_RUN_MIN,
  } = opts;
  const { containment, longestRun } = metrics;

  const containmentRejects = containment >= rejectContainmentMin;
  const runRejects = longestRun >= rejectRunMin;
  if (containmentRejects || runRejects) {
    const reasons = [];
    if (containmentRejects) {
      reasons.push(`containment ${containment.toFixed(3)} >= ${rejectContainmentMin}`);
    }
    if (runRejects) {
      reasons.push(`longestRun ${longestRun} >= ${rejectRunMin}`);
    }
    return { verdict: 'rejected', reasons };
  }

  if (containment < acceptContainmentMax && longestRun < acceptRunMax) {
    return { verdict: 'accepted', reasons: [] };
  }

  const reasons = [];
  if (containment >= acceptContainmentMax) {
    reasons.push(`containment ${containment.toFixed(3)} in [${acceptContainmentMax}, ${rejectContainmentMin})`);
  }
  if (longestRun >= acceptRunMax) {
    reasons.push(`longestRun ${longestRun} in [${acceptRunMax}, ${rejectRunMin})`);
  }
  return { verdict: 'needs-review', reasons };
}

// === Top-level analysis ===

/**
 * Score a draft against a source and produce the full JSON-ready report.
 *
 * Shingle size is SHINGLE_N_FALLBACK when the shorter text has fewer than
 * SHORT_FALLBACK_THRESHOLD words, otherwise SHINGLE_N_DEFAULT. When the
 * shorter text has fewer than TOO_SHORT_THRESHOLD words no scoring is done:
 * the verdict is `needs-review` with `reason: 'too-short-to-score'` and all
 * metrics are reported as 0.
 *
 * @param {string} draftText - Draft Markdown.
 * @param {string} sourceText - Source Markdown.
 * @returns {object} Report with `verdict`, `reasons`, optional `reason`,
 *   `containment`, `longestRun`, `thresholds`, `shingleSize`, `draftWords`,
 *   `sourceWords`, `draftShingles`, `sharedShingles`.
 */
export function analyze(draftText, sourceText) {
  const draftTokens = tokenize(draftText);
  const sourceTokens = tokenize(sourceText);
  const minWords = Math.min(draftTokens.length, sourceTokens.length);
  const n = minWords < SHORT_FALLBACK_THRESHOLD ? SHINGLE_N_FALLBACK : SHINGLE_N_DEFAULT;
  const tooShort = minWords < TOO_SHORT_THRESHOLD;

  const metrics = tooShort
    ? { containment: 0, longestRun: 0, draftShingleCount: 0, sharedCount: 0 }
    : overlap(draftTokens, sourceTokens, n);
  const outcome = tooShort
    ? {
        verdict: 'needs-review',
        reasons: [`min word count ${minWords} < ${TOO_SHORT_THRESHOLD}`],
        reason: 'too-short-to-score',
      }
    : verdict(metrics);

  // Key order is part of the emitted JSON layout: verdict fields first.
  return {
    ...outcome,
    containment: metrics.containment,
    longestRun: metrics.longestRun,
    thresholds: {
      accept: ACCEPT_CONTAINMENT_MAX,
      reject: REJECT_CONTAINMENT_MIN,
      minRun: REJECT_RUN_MIN,
    },
    shingleSize: n,
    draftWords: draftTokens.length,
    sourceWords: sourceTokens.length,
    draftShingles: metrics.draftShingleCount,
    sharedShingles: metrics.sharedCount,
  };
}

// === CLI entry ===

/**
 * Read one input file as UTF-8, reporting failures on stderr.
 *
 * @param {string} label - 'draft' or 'source', used in the error message.
 * @param {string} path - File to read.
 * @returns {Promise<string|null>} File contents, or null when reading failed.
 */
async function readInput(label, path) {
  try {
    return await readFile(path, 'utf8');
  } catch (err) {
    stderr.write(`Error reading ${label} ${path}: ${err.message}\n`);
    return null;
  }
}

/**
 * CLI driver: parse arguments, analyze the two files, print the JSON report.
 *
 * Exit status is communicated through `process.exitCode` (0 success/help,
 * 1 I/O error, 2 usage error) instead of calling `exit()`, so that pending
 * stdout/stderr writes are flushed before the process ends.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = argv.slice(2);
  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    stdout.write(USAGE);
    return;
  }
  if (args.length !== 2) {
    stderr.write(`Error: expected 2 arguments (draft-path, source-path), got ${args.length}\n`);
    stderr.write(USAGE);
    process.exitCode = 2;
    return;
  }
  const [draftPath, sourcePath] = args;
  const draftText = await readInput('draft', draftPath);
  if (draftText === null) {
    process.exitCode = 1;
    return;
  }
  const sourceText = await readInput('source', sourcePath);
  if (sourceText === null) {
    process.exitCode = 1;
    return;
  }
  stdout.write(`${JSON.stringify(analyze(draftText, sourceText), null, 2)}\n`);
}

/**
 * Tell whether this module is the process entry point (as opposed to being
 * imported). The entry path is converted with `pathToFileURL` so that spaces,
 * non-ASCII characters and Windows drive letters compare correctly against
 * `import.meta.url`; symlinks are resolved first when possible.
 *
 * @returns {boolean}
 */
function isInvokedAsScript() {
  const entry = argv[1];
  if (!entry) return false;
  let resolved = entry;
  try {
    resolved = realpathSync(entry);
  } catch {
    // Entry path could not be resolved; fall back to the raw path.
  }
  return import.meta.url === pathToFileURL(resolved).href;
}

// Only run main when invoked as CLI (not when imported)
if (isInvokedAsScript()) {
  main().catch((err) => {
    stderr.write(`Error: ${err?.stack ?? err}\n`);
    process.exitCode = 1;
  });
}