feat(ultraplan-local): add scripts/ngram-overlap.mjs IP-hygiene utility

This commit is contained in:
Kjell Tore Guttormsen 2026-04-18 15:03:36 +02:00
commit 4d541418ba

View file

@ -0,0 +1,251 @@
#!/usr/bin/env node
// ngram-overlap.mjs — IP-hygiene check for skill-factory drafts.
//
// Computes word-n-gram containment similarity between a draft and source,
// plus a longest-consecutive-shingle-run signal. Verdict drives whether the
// draft passes IP-hygiene (`accepted`), warrants human review
// (`needs-review`), or must be rejected as too-close-to-source (`rejected`).
//
// Algorithm (research brief 01 §Recommendation):
// - Word 5-gram containment: |shingles(draft) ∩ shingles(source)| / |shingles(draft)|
// - Longest-run secondary signal: max consecutive draft shingles also in source
// - Verdict thresholds:
// containment <0.15 AND longestRun <8 → accepted
// containment ≥0.35 OR longestRun ≥15 → rejected
// otherwise → needs-review
// - Short-text fallback: shingles n=4 when min(words) <500; verdict
// `needs-review` with `reason: too-short-to-score` when min(words) <300.
//
// Pure Node stdlib. No npm dependencies.
//
// CLI:
// node scripts/ngram-overlap.mjs <draft-path> <source-path>
// node scripts/ngram-overlap.mjs --help
import { readFile } from 'node:fs/promises';
import { argv, exit, stdout, stderr } from 'node:process';
import { pathToFileURL } from 'node:url';
// === Constants (research-backed defaults; see research brief 01 §Open Questions) ===
// Shingle (word n-gram) size used when both texts are long enough.
export const SHINGLE_N_DEFAULT = 5;
// Shingle size used when the shorter text has fewer than
// SHORT_FALLBACK_THRESHOLD words.
export const SHINGLE_N_FALLBACK = 4;
// Word count of the shorter text below which the fallback shingle size applies.
export const SHORT_FALLBACK_THRESHOLD = 500;
// Word count of the shorter text below which no score is computed at all.
export const TOO_SHORT_THRESHOLD = 300;
// `accepted` requires containment strictly below this value ...
export const ACCEPT_CONTAINMENT_MAX = 0.15;
// ... while containment at or above this value forces `rejected`.
export const REJECT_CONTAINMENT_MIN = 0.35;
// `accepted` requires the longest shared run strictly below this value ...
export const ACCEPT_RUN_MAX = 8;
// ... while a longest shared run at or above this value forces `rejected`.
export const REJECT_RUN_MIN = 15;
// Help text printed for --help/-h, for no arguments, and after an argument
// error. The exit-code sentence lists the usage-error code (2) explicitly,
// because the CLI exits non-zero for a wrong argument count as well as for
// unreadable files.
const USAGE = `Usage: node scripts/ngram-overlap.mjs <draft-path> <source-path>
Computes word-n-gram containment overlap of draft against source.
Outputs JSON to stdout. Exit 0 on success, 1 on I/O error, 2 on usage error.
Verdict bands:
accepted containment <${ACCEPT_CONTAINMENT_MAX} AND longestRun <${ACCEPT_RUN_MAX}
needs-review between bands, or min words <${TOO_SHORT_THRESHOLD}
rejected containment >=${REJECT_CONTAINMENT_MIN} OR longestRun >=${REJECT_RUN_MIN}
`;
// === Markdown stripping (research brief 01 §step 2) ===
// Strict order: frontmatter, fenced code, inline code, block quotes, images,
// links, leftover bang-prefixed brackets, emphasis, headings, hr, table pipes.
//
// Removes Markdown syntax from `text` while keeping the human-readable words
// (link text, image alt text, emphasized text). Code spans and fenced code
// blocks are dropped entirely and replaced by a single space.
// Param:   text — raw Markdown string.
// Returns: the stripped string (whitespace is not collapsed).
export function stripMarkdown(text) {
  let t = text;
  // YAML frontmatter (only at file start). Accepts LF and CRLF line endings so
  // files saved on Windows are stripped too.
  t = t.replace(/^---\r?\n[\s\S]*?\r?\n---(?:\r?\n)?/, '');
  // Fenced code blocks ```lang ... ```
  t = t.replace(/```[\s\S]*?```/g, ' ');
  // Inline code `...`
  t = t.replace(/`[^`\n]*`/g, ' ');
  // Block quotes (line-leading >)
  t = t.replace(/^>\s?/gm, '');
  // Images ![alt](url) → alt. This has to run BEFORE the link rule: the link
  // pattern also matches the "[alt](url)" tail of an image and would leave a
  // stray "!" behind (and mangle linked images such as [![alt](img)](href)).
  t = t.replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1');
  // Markdown links [text](url) → text
  t = t.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
  // Leftover bang-prefixed brackets with no (url) part: ![alt] → alt
  t = t.replace(/!\[([^\]]*)\]/g, '$1');
  // Emphasis: ** __ * _ ~~ (double markers first so they are not half-eaten
  // by the single-marker rules)
  t = t.replace(/\*\*([^*]+)\*\*/g, '$1');
  t = t.replace(/__([^_]+)__/g, '$1');
  t = t.replace(/\*([^*\n]+)\*/g, '$1');
  t = t.replace(/_([^_\n]+)_/g, '$1');
  t = t.replace(/~~([^~]+)~~/g, '$1');
  // Heading markers (line-leading #+)
  t = t.replace(/^#{1,6}\s+/gm, '');
  // Horizontal rules
  t = t.replace(/^[-*_]{3,}\s*$/gm, ' ');
  // Table pipes
  t = t.replace(/\|/g, ' ');
  return t;
}
// === Tokenization ===
// Converts raw Markdown text into comparable word tokens.
// Pipeline: strip Markdown syntax → NFKC-normalize → lowercase → extract every
// maximal run of Unicode letters/numbers (everything else is a separator).
// Param:   text — raw Markdown string.
// Returns: array of token strings; an empty array when nothing matches.
export function tokenize(text) {
  const wordRun = /[\p{L}\p{N}]+/gu;
  const folded = stripMarkdown(text).normalize('NFKC').toLowerCase();
  const found = folded.match(wordRun);
  // String.prototype.match yields null (not []) when there is no match.
  return found === null ? [] : found;
}
// === Shingles (n-grams of words) ===
// Produces every contiguous window of `n` tokens, each joined with a single
// space, ordered by starting position. Repeated windows are kept.
// Params:  tokens — array of word strings; n — window size.
// Returns: array of shingle strings; [] when there are fewer than `n` tokens.
export function shingles(tokens, n) {
  const windowCount = tokens.length - n + 1;
  if (windowCount < 1) return [];
  return Array.from({ length: windowCount }, (_, start) =>
    tokens.slice(start, start + n).join(' '),
  );
}
// === Overlap metrics ===
//
// Measures how much of the draft's shingle sequence also occurs in the source.
// Params:  draftTokens, sourceTokens — token arrays; n — shingle size.
// Returns: {containment, longestRun, draftShingleCount, sharedCount} where
//   containment       = sharedCount / draftShingleCount (asymmetric: the share
//                       of draft shingles found in source; repeats in the
//                       draft are counted each time),
//   longestRun        = longest streak of consecutive draft shingles that are
//                       all present in source,
//   draftShingleCount = number of draft shingles,
//   sharedCount       = number of draft shingles present in source.
// All four values are 0 when the draft yields no shingles.
export function overlap(draftTokens, sourceTokens, n) {
  const draftGrams = shingles(draftTokens, n);
  const total = draftGrams.length;
  if (total === 0) {
    return { containment: 0, longestRun: 0, draftShingleCount: 0, sharedCount: 0 };
  }

  // Set membership keeps the scan linear in the number of draft shingles.
  const sourceGrams = new Set(shingles(sourceTokens, n));

  let sharedCount = 0;
  let bestStreak = 0;
  let streak = 0;
  draftGrams.forEach((gram) => {
    // A miss resets the streak; a hit extends it and counts as shared.
    streak = sourceGrams.has(gram) ? streak + 1 : 0;
    if (streak > 0) sharedCount += 1;
    bestStreak = Math.max(bestStreak, streak);
  });

  return {
    containment: sharedCount / total,
    longestRun: bestStreak,
    draftShingleCount: total,
    sharedCount,
  };
}
// === Verdict dispatch ===
// Maps overlap metrics onto one of three verdicts.
// Params:  metrics — object with numeric `containment` and `longestRun`;
//          opts    — optional threshold overrides (acceptContainmentMax,
//                    rejectContainmentMin, acceptRunMax, rejectRunMin), each
//                    defaulting to the matching module constant.
// Returns: {verdict, reasons} where verdict is 'rejected' | 'accepted' |
//          'needs-review' and reasons lists which threshold(s) were hit
//          (always empty for 'accepted').
export function verdict(metrics, opts = {}) {
  const {
    acceptContainmentMax = ACCEPT_CONTAINMENT_MAX,
    rejectContainmentMin = REJECT_CONTAINMENT_MIN,
    acceptRunMax = ACCEPT_RUN_MAX,
    rejectRunMin = REJECT_RUN_MIN,
  } = opts;
  const { containment, longestRun } = metrics;
  const keepText = (entry) => entry !== null;

  // Reject band is checked first: either signal alone is enough.
  const containmentRejects = containment >= rejectContainmentMin;
  const runRejects = longestRun >= rejectRunMin;
  if (containmentRejects || runRejects) {
    const reasons = [
      containmentRejects
        ? `containment ${containment.toFixed(3)} >= ${rejectContainmentMin}`
        : null,
      runRejects ? `longestRun ${longestRun} >= ${rejectRunMin}` : null,
    ].filter(keepText);
    return { verdict: 'rejected', reasons };
  }

  // Accept band: both signals must be strictly under their limits.
  const cleanlyAccepted = containment < acceptContainmentMax && longestRun < acceptRunMax;
  if (cleanlyAccepted) {
    return { verdict: 'accepted', reasons: [] };
  }

  // Anything left sits between the two bands.
  const reasons = [
    containment >= acceptContainmentMax
      ? `containment ${containment.toFixed(3)} in [${acceptContainmentMax}, ${rejectContainmentMin})`
      : null,
    longestRun >= acceptRunMax
      ? `longestRun ${longestRun} in [${acceptRunMax}, ${rejectRunMin})`
      : null,
  ].filter(keepText);
  return { verdict: 'needs-review', reasons };
}
// === Top-level analysis ===
// Tokenizes both texts, selects the shingle size from the shorter text's word
// count, and produces the full report object that the CLI prints as JSON.
// Params:  draftText, sourceText — raw Markdown strings.
// Returns: report with verdict, reasons, containment, longestRun, thresholds,
//          shingleSize, draftWords, sourceWords, draftShingles,
//          sharedShingles. When the shorter text has fewer than
//          TOO_SHORT_THRESHOLD words no overlap is computed: the verdict is
//          'needs-review', an extra `reason: 'too-short-to-score'` field is
//          present, and every metric is reported as 0.
export function analyze(draftText, sourceText) {
  const draftTokens = tokenize(draftText);
  const sourceTokens = tokenize(sourceText);
  const draftWords = draftTokens.length;
  const sourceWords = sourceTokens.length;
  const minWords = Math.min(draftWords, sourceWords);
  const shingleSize = minWords >= SHORT_FALLBACK_THRESHOLD ? SHINGLE_N_DEFAULT : SHINGLE_N_FALLBACK;

  // Fields common to both report shapes, kept in their output order.
  const sizing = {
    thresholds: {
      accept: ACCEPT_CONTAINMENT_MAX,
      reject: REJECT_CONTAINMENT_MIN,
      minRun: REJECT_RUN_MIN,
    },
    shingleSize,
    draftWords,
    sourceWords,
  };

  if (minWords < TOO_SHORT_THRESHOLD) {
    // Too little text for a meaningful score: defer to a human.
    return {
      verdict: 'needs-review',
      reasons: [`min word count ${minWords} < ${TOO_SHORT_THRESHOLD}`],
      reason: 'too-short-to-score',
      containment: 0,
      longestRun: 0,
      ...sizing,
      draftShingles: 0,
      sharedShingles: 0,
    };
  }

  const metrics = overlap(draftTokens, sourceTokens, shingleSize);
  const { verdict: outcome, reasons } = verdict(metrics);
  return {
    verdict: outcome,
    reasons,
    containment: metrics.containment,
    longestRun: metrics.longestRun,
    ...sizing,
    draftShingles: metrics.draftShingleCount,
    sharedShingles: metrics.sharedCount,
  };
}
// === CLI entry ===
// Parses argv, reads the two files, and prints the analysis as pretty JSON.
// Exit status: 0 on success or help, 1 when a file cannot be read, 2 on a
// wrong argument count.
//
// The status is communicated through process.exitCode followed by a plain
// return rather than process.exit(): exit() terminates immediately and can
// drop stdout/stderr output that is still pending when the stream is
// asynchronous (for example a pipe on Windows), which would truncate the JSON
// report or the error message.
async function main() {
  const args = argv.slice(2);
  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    stdout.write(USAGE);
    return;
  }
  if (args.length !== 2) {
    stderr.write(`Error: expected 2 arguments (draft-path, source-path), got ${args.length}\n`);
    stderr.write(USAGE);
    process.exitCode = 2;
    return;
  }
  const [draftPath, sourcePath] = args;

  // Files are read one after the other so a missing draft is always reported
  // before a missing source.
  let draftText;
  try {
    draftText = await readFile(draftPath, 'utf8');
  } catch (err) {
    stderr.write(`Error reading draft ${draftPath}: ${err.message}\n`);
    process.exitCode = 1;
    return;
  }

  let sourceText;
  try {
    sourceText = await readFile(sourcePath, 'utf8');
  } catch (err) {
    stderr.write(`Error reading source ${sourcePath}: ${err.message}\n`);
    process.exitCode = 1;
    return;
  }

  const result = analyze(draftText, sourceText);
  stdout.write(JSON.stringify(result, null, 2) + '\n');
}
// Only run main when invoked as CLI (not when imported).
// import.meta.url is a percent-encoded file URL (and has the form
// file:///C:/... on Windows), so it is compared with pathToFileURL(argv[1])
// instead of a hand-built `file://` string, which never matches for paths
// containing spaces or other escaped characters, or on Windows.
const entryPath = process.argv[1];
const invokedAsScript =
  entryPath !== undefined && import.meta.url === pathToFileURL(entryPath).href;
if (invokedAsScript) {
  // Handle an unexpected failure explicitly instead of leaving the promise
  // floating: report it on stderr and exit with a non-zero status.
  main().catch((err) => {
    const detail = err instanceof Error ? err.stack : String(err);
    stderr.write(`Error: ${detail}\n`);
    process.exitCode = 1;
  });
}