251 lines
8.5 KiB
JavaScript
251 lines
8.5 KiB
JavaScript
#!/usr/bin/env node
|
|
// ngram-overlap.mjs — IP-hygiene check for skill-factory drafts.
|
|
//
|
|
// Computes word-n-gram containment similarity between a draft and source,
|
|
// plus a longest-consecutive-shingle-run signal. Verdict drives whether the
|
|
// draft passes IP-hygiene (`accepted`), warrants human review
|
|
// (`needs-review`), or must be rejected as too-close-to-source (`rejected`).
|
|
//
|
|
// Algorithm (research brief 01 §Recommendation):
|
|
// - Word 5-gram containment: |shingles(draft) ∩ shingles(source)| / |shingles(draft)|
|
|
// - Longest-run secondary signal: max consecutive draft shingles also in source
|
|
// - Verdict thresholds:
|
|
// containment <0.15 AND longestRun <8 → accepted
|
|
// containment ≥0.35 OR longestRun ≥15 → rejected
|
|
// otherwise → needs-review
|
|
// - Short-text fallback: shingles n=4 when min(words) <500; verdict
|
|
// `needs-review` with `reason: too-short-to-score` when min(words) <300.
|
|
//
|
|
// Pure Node stdlib. No npm dependencies.
|
|
//
|
|
// CLI:
|
|
// node scripts/ngram-overlap.mjs <draft-path> <source-path>
|
|
// node scripts/ngram-overlap.mjs --help
|
|
|
|
import { readFile } from 'node:fs/promises';
import { argv, exit, stdout, stderr } from 'node:process';
import { pathToFileURL } from 'node:url';
|
|
|
|
// === Constants (research-backed defaults; see research brief 01 §Open Questions) ===

// Shingle (word n-gram) size used when both texts are long enough.
export const SHINGLE_N_DEFAULT = 5;
// Shingle size used when the shorter text has fewer than
// SHORT_FALLBACK_THRESHOLD words.
export const SHINGLE_N_FALLBACK = 4;
// Word count of the shorter text below which the fallback shingle size applies.
export const SHORT_FALLBACK_THRESHOLD = 500;
// Word count of the shorter text below which no score is computed at all.
export const TOO_SHORT_THRESHOLD = 300;
// `accepted` requires containment strictly below this value ...
export const ACCEPT_CONTAINMENT_MAX = 0.15;
// `rejected` as soon as containment reaches this value ...
export const REJECT_CONTAINMENT_MIN = 0.35;
// ... `accepted` also requires longestRun strictly below this value.
export const ACCEPT_RUN_MAX = 8;
// ... `rejected` also as soon as longestRun reaches this value.
export const REJECT_RUN_MIN = 15;

// Help text printed for --help / -h / no arguments, and after an
// argument-count error. The exit statuses listed here mirror the CLI entry
// point: 0 on success, 1 when a file cannot be read, 2 on a wrong argument count.
const USAGE = `Usage: node scripts/ngram-overlap.mjs <draft-path> <source-path>

Computes word-n-gram containment overlap of draft against source.
Outputs JSON to stdout. Exit 0 on success, 1 on I/O error, 2 on usage error.

Verdict bands:
  accepted      containment <${ACCEPT_CONTAINMENT_MAX} AND longestRun <${ACCEPT_RUN_MAX}
  needs-review  between bands, or min words <${TOO_SHORT_THRESHOLD}
  rejected      containment >=${REJECT_CONTAINMENT_MIN} OR longestRun >=${REJECT_RUN_MIN}
`;
|
|
|
|
// === Markdown stripping (research brief 01 §step 2) ===
// Strict order: frontmatter, fenced code, inline code, block quotes, images,
// links, emphasis, headings, hr, table pipes.
//
// Removes Markdown syntax so that only prose is left for tokenization.
//   text    - raw Markdown (string)
//   returns - the text with markup removed; code spans, code fences, rules
//             and table pipes are each replaced by a single space so the
//             words on either side never fuse together.
export function stripMarkdown(text) {
  let t = text;
  // YAML frontmatter (only at file start). `\r?` lets files with CRLF line
  // endings match too; LF-only input matches exactly as before.
  t = t.replace(/^---\r?\n[\s\S]*?\r?\n---(?:\r?\n)?/, '');
  // Fenced code blocks ```lang ... ```
  t = t.replace(/```[\s\S]*?```/g, ' ');
  // Inline code `...`
  t = t.replace(/`[^`\n]*`/g, ' ');
  // Block quotes (line-leading >)
  t = t.replace(/^>\s?/gm, '');
  // Inline images ![alt](url) → alt. This must run BEFORE the link rule:
  // otherwise the link rule consumes `[alt](url)` and leaves a stray `!`,
  // and a linked image `[![alt](img)](href)` ends up leaking `(href)` into
  // the output. The leading space stands in for the removed `!` so the alt
  // text cannot fuse with a word directly in front of the image.
  t = t.replace(/!\[([^\]]*)\]\([^)]*\)/g, ' $1');
  // Markdown links [text](url) → text
  t = t.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
  // Any remaining bang-prefixed bracket (e.g. reference-style ![alt][ref]) → alt
  t = t.replace(/!\[([^\]]*)\]/g, '$1');
  // Emphasis: ** __ * _ ~~ (double markers first so they are not half-eaten
  // by the single-marker rules)
  t = t.replace(/\*\*([^*]+)\*\*/g, '$1');
  t = t.replace(/__([^_]+)__/g, '$1');
  t = t.replace(/\*([^*\n]+)\*/g, '$1');
  t = t.replace(/_([^_\n]+)_/g, '$1');
  t = t.replace(/~~([^~]+)~~/g, '$1');
  // Heading markers (line-leading #+)
  t = t.replace(/^#{1,6}\s+/gm, '');
  // Horizontal rules
  t = t.replace(/^[-*_]{3,}\s*$/gm, ' ');
  // Table pipes
  t = t.replace(/\|/g, ' ');
  return t;
}
|
|
|
|
// === Tokenization ===
// Pipeline: strip Markdown → NFKC-normalize → lowercase → extract words.
// A word is a maximal run of Unicode letters and/or numbers; every other
// character acts as a separator. Returns the words in document order, or an
// empty array when the text contains none.
export function tokenize(text) {
  const prepared = stripMarkdown(text).normalize('NFKC').toLowerCase();
  // String.prototype.match returns null (not []) when nothing matches.
  return prepared.match(/[\p{L}\p{N}]+/gu) ?? [];
}
|
|
|
|
// === Shingles (n-grams of words) ===
// Produces every window of `n` consecutive tokens, each joined with single
// spaces, in order of appearance. Repeated windows are kept (the result is a
// list, not a set). Returns [] when there are fewer than `n` tokens.
export function shingles(tokens, n) {
  if (tokens.length < n) return [];
  const windowCount = tokens.length - n + 1;
  return Array.from({ length: windowCount }, (_, start) =>
    tokens.slice(start, start + n).join(' '),
  );
}
|
|
|
|
// === Overlap metrics ===
//
// Measures how much of the draft is reused from the source, at shingle size n.
// Returns {containment, longestRun, draftShingleCount, sharedCount}:
//   containment       - sharedCount / draftShingleCount (asymmetric: the
//                       denominator is the draft only); 0 for an empty draft
//   longestRun        - longest streak of consecutive draft shingles that
//                       also occur in the source
//   draftShingleCount - number of draft shingles (repeats included)
//   sharedCount       - number of draft shingles found in the source
export function overlap(draftTokens, sourceTokens, n) {
  const draftGrams = shingles(draftTokens, n);
  const sourceGrams = shingles(sourceTokens, n);
  const total = draftGrams.length;

  // Nothing to compare: avoid dividing by zero below.
  if (total === 0) {
    return { containment: 0, longestRun: 0, draftShingleCount: 0, sharedCount: 0 };
  }

  const known = new Set(sourceGrams);
  let hits = 0;
  let streak = 0;
  let best = 0;
  for (const gram of draftGrams) {
    if (!known.has(gram)) {
      // A miss breaks the current run.
      streak = 0;
      continue;
    }
    hits += 1;
    streak += 1;
    best = Math.max(best, streak);
  }

  return {
    containment: hits / total,
    longestRun: best,
    draftShingleCount: total,
    sharedCount: hits,
  };
}
|
|
|
|
// === Verdict dispatch ===
// Maps overlap metrics onto one of three bands and explains the decision.
//   metrics - object with numeric `containment` and `longestRun`
//   opts    - optional threshold overrides (acceptContainmentMax,
//             rejectContainmentMin, acceptRunMax, rejectRunMin); each falls
//             back to the matching module constant
//   returns - { verdict: 'accepted' | 'needs-review' | 'rejected', reasons }
//             where `reasons` lists every threshold that was tripped
//             (always empty for 'accepted')
export function verdict(metrics, opts = {}) {
  const {
    acceptContainmentMax = ACCEPT_CONTAINMENT_MAX,
    rejectContainmentMin = REJECT_CONTAINMENT_MIN,
    acceptRunMax = ACCEPT_RUN_MAX,
    rejectRunMin = REJECT_RUN_MIN,
  } = opts;
  const { containment, longestRun } = metrics;

  // Reject band is checked first: either signal alone is enough.
  const containmentRejects = containment >= rejectContainmentMin;
  const runRejects = longestRun >= rejectRunMin;
  if (containmentRejects || runRejects) {
    const reasons = [
      containmentRejects && `containment ${containment.toFixed(3)} >= ${rejectContainmentMin}`,
      runRejects && `longestRun ${longestRun} >= ${rejectRunMin}`,
    ].filter(Boolean);
    return { verdict: 'rejected', reasons };
  }

  // Accept band needs both signals to be low.
  const withinAcceptBand = containment < acceptContainmentMax && longestRun < acceptRunMax;
  if (withinAcceptBand) {
    return { verdict: 'accepted', reasons: [] };
  }

  // Everything in between goes to a human, with the offending signal(s) named.
  const reasons = [
    containment >= acceptContainmentMax &&
      `containment ${containment.toFixed(3)} in [${acceptContainmentMax}, ${rejectContainmentMin})`,
    longestRun >= acceptRunMax && `longestRun ${longestRun} in [${acceptRunMax}, ${rejectRunMin})`,
  ].filter(Boolean);
  return { verdict: 'needs-review', reasons };
}
|
|
|
|
// === Top-level analysis ===
// Tokenizes both texts, picks the shingle size from the shorter text's word
// count, and returns the full report object that the CLI prints as JSON.
// When the shorter text has fewer than TOO_SHORT_THRESHOLD words nothing is
// scored: the report carries verdict `needs-review`, the extra field
// `reason: 'too-short-to-score'`, and zeroed metrics.
export function analyze(draftText, sourceText) {
  const draftTokens = tokenize(draftText);
  const sourceTokens = tokenize(sourceText);
  const draftWords = draftTokens.length;
  const sourceWords = sourceTokens.length;
  const minWords = Math.min(draftWords, sourceWords);
  const shingleSize = minWords >= SHORT_FALLBACK_THRESHOLD ? SHINGLE_N_DEFAULT : SHINGLE_N_FALLBACK;

  // Shared report layout. `outcome` is spread first so the verdict fields
  // lead the object, keeping the key order of the emitted JSON stable.
  const report = (outcome, scores) => ({
    ...outcome,
    containment: scores.containment,
    longestRun: scores.longestRun,
    thresholds: {
      accept: ACCEPT_CONTAINMENT_MAX,
      reject: REJECT_CONTAINMENT_MIN,
      minRun: REJECT_RUN_MIN,
    },
    shingleSize,
    draftWords,
    sourceWords,
    draftShingles: scores.draftShingleCount,
    sharedShingles: scores.sharedCount,
  });

  if (minWords < TOO_SHORT_THRESHOLD) {
    return report(
      {
        verdict: 'needs-review',
        reasons: [`min word count ${minWords} < ${TOO_SHORT_THRESHOLD}`],
        reason: 'too-short-to-score',
      },
      { containment: 0, longestRun: 0, draftShingleCount: 0, sharedCount: 0 },
    );
  }

  const metrics = overlap(draftTokens, sourceTokens, shingleSize);
  const { verdict: label, reasons } = verdict(metrics);
  return report({ verdict: label, reasons }, metrics);
}
|
|
|
|
// === CLI entry ===
// Parses the command line, reads the draft and source files, and prints the
// analysis as pretty-printed JSON on stdout.
//
// Exit status:
//   0 - analysis printed, or usage shown for --help / -h / no arguments
//   1 - the draft or the source file could not be read
//   2 - wrong number of arguments
//
// The status is set through process.exitCode and the function then returns,
// instead of calling process.exit(): a forced exit can cut off stdout/stderr
// writes that are still pending when the stream is asynchronous.
async function main() {
  const args = argv.slice(2);
  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    stdout.write(USAGE);
    return;
  }
  if (args.length !== 2) {
    stderr.write(`Error: expected 2 arguments (draft-path, source-path), got ${args.length}\n`);
    stderr.write(USAGE);
    process.exitCode = 2;
    return;
  }

  const [draftPath, sourcePath] = args;

  // The files are read one after the other so that a failure is always
  // reported for the draft first, as before.
  let draftText;
  try {
    draftText = await readFile(draftPath, 'utf8');
  } catch (err) {
    stderr.write(`Error reading draft ${draftPath}: ${err.message}\n`);
    process.exitCode = 1;
    return;
  }

  let sourceText;
  try {
    sourceText = await readFile(sourcePath, 'utf8');
  } catch (err) {
    stderr.write(`Error reading source ${sourcePath}: ${err.message}\n`);
    process.exitCode = 1;
    return;
  }

  const result = analyze(draftText, sourceText);
  stdout.write(`${JSON.stringify(result, null, 2)}\n`);
}
|
|
|
|
// Only run main when invoked as CLI (not when imported).
// import.meta.url is a percent-encoded file: URL, so the script path from
// argv is converted with pathToFileURL before comparing; a hand-built
// `file://${path}` string does not match for paths containing spaces or
// other characters that need encoding, nor for Windows paths.
// argv[1] can be absent (e.g. when the module is imported from a REPL), in
// which case this is not a script invocation.
const invokedAsScript =
  typeof process.argv[1] === 'string' &&
  import.meta.url === pathToFileURL(process.argv[1]).href;
if (invokedAsScript) {
  // Report unexpected failures instead of leaving the promise unhandled.
  main().catch((err) => {
    stderr.write(`Error: ${err?.stack ?? err}\n`);
    process.exitCode = 1;
  });
}
|