// ngram-overlap.mjs — IP-hygiene check for skill-factory drafts.
//
// Computes word-n-gram containment similarity between a draft and source,
// plus a longest-consecutive-shingle-run signal. Verdict drives whether the
// draft passes IP-hygiene (`accepted`), warrants human review
// (`needs-review`), or must be rejected as too-close-to-source (`rejected`).
//
// Algorithm (research brief 01 §Recommendation):
//   - Word 5-gram containment: |shingles(draft) ∩ shingles(source)| / |shingles(draft)|
//   - Longest-run secondary signal: max consecutive draft shingles also in source
//   - Verdict thresholds:
//       containment <0.15 AND longestRun <8   → accepted
//       containment ≥0.35 OR  longestRun ≥15  → rejected
//       otherwise                             → needs-review
//   - Short-text fallback: shingles n=4 when min(words) <500; verdict
//     `needs-review` with `reason: too-short-to-score` when min(words) <300.
//
// Pure Node stdlib. No npm dependencies.
//
// CLI:
//   node scripts/ngram-overlap.mjs <draft-path> <source-path>
//   node scripts/ngram-overlap.mjs --help

import { realpathSync } from 'node:fs';
import { readFile } from 'node:fs/promises';
import process, { argv, stdout, stderr } from 'node:process';
import { pathToFileURL } from 'node:url';

// === Constants (research-backed defaults; see research brief 01 §Open Questions) ===

export const SHINGLE_N_DEFAULT = 5;
export const SHINGLE_N_FALLBACK = 4;
export const SHORT_FALLBACK_THRESHOLD = 500;
export const TOO_SHORT_THRESHOLD = 300;
export const ACCEPT_CONTAINMENT_MAX = 0.15;
export const REJECT_CONTAINMENT_MIN = 0.35;
export const ACCEPT_RUN_MAX = 8;
export const REJECT_RUN_MIN = 15;

const USAGE = `Usage: node scripts/ngram-overlap.mjs <draft-path> <source-path>

Computes word-n-gram containment overlap of draft against source.
Outputs JSON to stdout. Exit 0 on success, non-zero on usage or I/O error.

Verdict bands:
  accepted      containment <${ACCEPT_CONTAINMENT_MAX} AND longestRun <${ACCEPT_RUN_MAX}
  needs-review  between bands, or min words <${TOO_SHORT_THRESHOLD}
  rejected      containment >=${REJECT_CONTAINMENT_MIN} OR longestRun >=${REJECT_RUN_MIN}
`;

// === Markdown stripping (research brief 01 §step 2) ===
// Strict order: BOM, frontmatter, fenced code, inline code, block quotes,
// images, links, leftover image markers, emphasis, headings, hr, table pipes.

/**
 * Removes Markdown syntax so that only prose words remain for tokenization.
 *
 * @param {string} text - Raw Markdown text.
 * @returns {string} Text with markup removed; code spans/blocks, horizontal
 *   rules and table pipes are replaced by a single space.
 */
export function stripMarkdown(text) {
  // A leading BOM would defeat the `^---` frontmatter anchor below.
  let t = text.replace(/^\uFEFF/, '');
  // YAML frontmatter (only at file start); tolerate CRLF line endings.
  t = t.replace(/^---\r?\n[\s\S]*?\r?\n---(?:\r?\n)?/, '');
  // Fenced code blocks ```lang ... ```
  t = t.replace(/```[\s\S]*?```/g, ' ');
  // Inline code `...`
  t = t.replace(/`[^`\n]*`/g, ' ');
  // Block quotes (line-leading >)
  t = t.replace(/^>\s?/gm, '');
  // Inline images ![alt](url) → alt. Must run BEFORE the link rule: otherwise
  // the link rule eats `[alt](url)` and leaves a stray `!`, and for linked
  // images `[![alt](img)](url)` the outer URL leaks into the output.
  t = t.replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1');
  // Markdown links [text](url) → text
  t = t.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
  // Remaining bang-prefixed brackets (e.g. reference-style images ![alt][id]) → alt
  t = t.replace(/!\[([^\]]*)\]/g, '$1');
  // Emphasis: ** __ * _ ~~ (anchored by char)
  t = t.replace(/\*\*([^*]+)\*\*/g, '$1');
  t = t.replace(/__([^_]+)__/g, '$1');
  t = t.replace(/\*([^*\n]+)\*/g, '$1');
  t = t.replace(/_([^_\n]+)_/g, '$1');
  t = t.replace(/~~([^~]+)~~/g, '$1');
  // Heading markers (line-leading #+)
  t = t.replace(/^#{1,6}\s+/gm, '');
  // Horizontal rules
  t = t.replace(/^[-*_]{3,}\s*$/gm, ' ');
  // Table pipes
  t = t.replace(/\|/g, ' ');
  return t;
}

// === Tokenization ===
// NFKC normalize, lowercase, split on Unicode letter/number runs.

/**
 * Converts Markdown text into a list of lowercase word tokens.
 *
 * @param {string} text - Raw Markdown text.
 * @returns {string[]} Runs of Unicode letters/numbers, in document order.
 */
export function tokenize(text) {
  const normalized = stripMarkdown(text).normalize('NFKC').toLowerCase();
  // Match runs of letters/numbers (Unicode-aware via /u flag); match() yields
  // null when nothing matches.
  return normalized.match(/[\p{L}\p{N}]+/gu) ?? [];
}

// === Shingles (n-grams of words) ===

/**
 * Builds the ordered list of word n-grams ("shingles") of a token list.
 *
 * @param {string[]} tokens - Word tokens.
 * @param {number} n - Shingle size in words.
 * @returns {string[]} Space-joined n-grams; empty when fewer than n tokens.
 */
export function shingles(tokens, n) {
  if (tokens.length < n) return [];
  const out = [];
  for (let i = 0; i <= tokens.length - n; i++) {
    out.push(tokens.slice(i, i + n).join(' '));
  }
  return out;
}

// === Overlap metrics ===

/**
 * Measures how much of the draft is reused from the source.
 *
 * Containment = shared draft shingles / draft shingles (asymmetric).
 * Longest-run = max number of consecutive draft shingles also in source.
 *
 * @param {string[]} draftTokens - Tokens of the draft.
 * @param {string[]} sourceTokens - Tokens of the source.
 * @param {number} n - Shingle size in words.
 * @returns {{containment: number, longestRun: number, draftShingleCount: number, sharedCount: number}}
 *   All zeros when the draft yields no shingles.
 */
export function overlap(draftTokens, sourceTokens, n) {
  const draftShingles = shingles(draftTokens, n);
  if (draftShingles.length === 0) {
    return { containment: 0, longestRun: 0, draftShingleCount: 0, sharedCount: 0 };
  }

  const sourceSet = new Set(shingles(sourceTokens, n));
  let shared = 0;
  let longest = 0;
  let current = 0;
  for (const sh of draftShingles) {
    if (sourceSet.has(sh)) {
      shared += 1;
      current += 1;
      if (current > longest) longest = current;
    } else {
      // A non-shared shingle breaks the current run.
      current = 0;
    }
  }

  return {
    containment: shared / draftShingles.length,
    longestRun: longest,
    draftShingleCount: draftShingles.length,
    sharedCount: shared,
  };
}

// === Verdict dispatch ===

/**
 * Maps overlap metrics onto a verdict band.
 *
 * @param {{containment: number, longestRun: number}} metrics - Overlap metrics.
 * @param {object} [opts] - Threshold overrides; each defaults to the
 *   corresponding module constant.
 * @param {number} [opts.acceptContainmentMax]
 * @param {number} [opts.rejectContainmentMin]
 * @param {number} [opts.acceptRunMax]
 * @param {number} [opts.rejectRunMin]
 * @returns {{verdict: 'accepted'|'needs-review'|'rejected', reasons: string[]}}
 */
export function verdict(metrics, opts = {}) {
  const {
    acceptContainmentMax = ACCEPT_CONTAINMENT_MAX,
    rejectContainmentMin = REJECT_CONTAINMENT_MIN,
    acceptRunMax = ACCEPT_RUN_MAX,
    rejectRunMin = REJECT_RUN_MIN,
  } = opts;
  const { containment, longestRun } = metrics;

  // Rejection is checked first: either signal alone is enough.
  if (containment >= rejectContainmentMin || longestRun >= rejectRunMin) {
    const reasons = [];
    if (containment >= rejectContainmentMin) {
      reasons.push(`containment ${containment.toFixed(3)} >= ${rejectContainmentMin}`);
    }
    if (longestRun >= rejectRunMin) {
      reasons.push(`longestRun ${longestRun} >= ${rejectRunMin}`);
    }
    return { verdict: 'rejected', reasons };
  }

  // Acceptance requires BOTH signals to be below their accept limits.
  if (containment < acceptContainmentMax && longestRun < acceptRunMax) {
    return { verdict: 'accepted', reasons: [] };
  }

  const reasons = [];
  if (containment >= acceptContainmentMax) {
    reasons.push(
      `containment ${containment.toFixed(3)} in [${acceptContainmentMax}, ${rejectContainmentMin})`,
    );
  }
  if (longestRun >= acceptRunMax) {
    reasons.push(`longestRun ${longestRun} in [${acceptRunMax}, ${rejectRunMin})`);
  }
  return { verdict: 'needs-review', reasons };
}

// === Top-level analysis ===

// Fresh object per call so callers may mutate their result freely.
function reportedThresholds() {
  return {
    accept: ACCEPT_CONTAINMENT_MAX,
    reject: REJECT_CONTAINMENT_MIN,
    minRun: REJECT_RUN_MIN,
  };
}

/**
 * Runs the full pipeline (tokenize → shingle → overlap → verdict) on two texts.
 *
 * Uses the fallback shingle size when the shorter text has fewer than
 * SHORT_FALLBACK_THRESHOLD words, and skips scoring entirely (verdict
 * `needs-review`, `reason: 'too-short-to-score'`) below TOO_SHORT_THRESHOLD.
 *
 * @param {string} draftText - Draft Markdown.
 * @param {string} sourceText - Source Markdown.
 * @returns {object} JSON-serializable report (verdict, reasons, metrics,
 *   thresholds, shingle size, word and shingle counts).
 */
export function analyze(draftText, sourceText) {
  const draftTokens = tokenize(draftText);
  const sourceTokens = tokenize(sourceText);
  const minWords = Math.min(draftTokens.length, sourceTokens.length);
  const n = minWords < SHORT_FALLBACK_THRESHOLD ? SHINGLE_N_FALLBACK : SHINGLE_N_DEFAULT;

  if (minWords < TOO_SHORT_THRESHOLD) {
    return {
      verdict: 'needs-review',
      reasons: [`min word count ${minWords} < ${TOO_SHORT_THRESHOLD}`],
      reason: 'too-short-to-score',
      containment: 0,
      longestRun: 0,
      thresholds: reportedThresholds(),
      shingleSize: n,
      draftWords: draftTokens.length,
      sourceWords: sourceTokens.length,
      draftShingles: 0,
      sharedShingles: 0,
    };
  }

  const metrics = overlap(draftTokens, sourceTokens, n);
  const v = verdict(metrics);
  return {
    verdict: v.verdict,
    reasons: v.reasons,
    containment: metrics.containment,
    longestRun: metrics.longestRun,
    thresholds: reportedThresholds(),
    shingleSize: n,
    draftWords: draftTokens.length,
    sourceWords: sourceTokens.length,
    draftShingles: metrics.draftShingleCount,
    sharedShingles: metrics.sharedCount,
  };
}

// === CLI entry ===

// Reads one input file as UTF-8; on failure reports to stderr and yields null.
async function readInput(label, path) {
  try {
    return await readFile(path, 'utf8');
  } catch (err) {
    stderr.write(`Error reading ${label} ${path}: ${err.message}\n`);
    return null;
  }
}

/**
 * CLI driver. Sets process.exitCode instead of calling process.exit() so
 * that pending stdout writes are not cut off when output is piped.
 * Exit codes: 0 success/help, 1 I/O error, 2 usage error.
 */
async function main() {
  const args = argv.slice(2);
  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    stdout.write(USAGE);
    return;
  }
  if (args.length !== 2) {
    stderr.write(`Error: expected 2 arguments (draft-path, source-path), got ${args.length}\n`);
    stderr.write(USAGE);
    process.exitCode = 2;
    return;
  }

  const [draftPath, sourcePath] = args;
  const draftText = await readInput('draft', draftPath);
  if (draftText === null) {
    process.exitCode = 1;
    return;
  }
  const sourceText = await readInput('source', sourcePath);
  if (sourceText === null) {
    process.exitCode = 1;
    return;
  }

  const result = analyze(draftText, sourceText);
  stdout.write(`${JSON.stringify(result, null, 2)}\n`);
}

// True only when this module is the process entry point (not when imported).
// Compares proper file URLs: a hand-built `file://${path}` string mismatches
// for paths needing percent-encoding, on Windows, and through symlinks.
function isInvokedAsScript() {
  const entry = argv[1];
  if (!entry) return false;
  try {
    return import.meta.url === pathToFileURL(realpathSync(entry)).href;
  } catch {
    // Entry path cannot be resolved on disk → treat as "imported".
    return false;
  }
}

if (isInvokedAsScript()) {
  main().catch((err) => {
    stderr.write(`Error: ${err?.message ?? err}\n`);
    process.exitCode = 1;
  });
}