fix(dep): B7 — token-overlap typosquat heuristic alongside Levenshtein

Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
modern typosquat pattern — popular-name + token-injection suffix. Examples:
  lodash → lodash-utils    (edit distance 6, not flagged pre-B7)
  react  → react-helper    (edit distance 7, not flagged pre-B7)
  express → express-wrapper (edit distance 8, not flagged pre-B7)

Three coordinated edits:

scanners/lib/string-utils.mjs
- Adds tokenize(name): string[]    splits on -/_, lowercases
- Adds tokenOverlap(a, b): number  intersection.size / min(|a|,|b|)
- Adds TYPOSQUAT_SUSPICIOUS_TOKENS frozen list of common typosquat
  suffixes. Excludes language-extension tokens (js, jsx, ts, tsx) — the
  v7.0.0 allowlist contains `tsx` as a legit package and including the
  same token in the suspicious set creates a contradiction. Caught by
  the new allowlist-intersection-guard test. Also excludes 'pro'
  (legitimate edition marker).

scanners/dep-auditor.mjs + scanners/supply-chain-recheck.mjs
- New checkTyposquatTokenOverlap() helper — fires AFTER Levenshtein 1/2
  branches, only when:
    1. popular package's tokens ⊊ declared name's tokens (declared is a strict superset)
    2. declared name has at least one suspicious suffix
    3. popular package is in topCutoff window
  All three conditions required — conservative by design. Allowlist
  precedence preserved (existing 22 npm + 13 PyPI entries always pass).
  MEDIUM severity, NOT block. New finding title prefix:
  "Possible typosquatting via token-overlap".

Tests: +21 cases across two new files
- tests/lib/string-utils-tokens.test.mjs (15) — tokenize, tokenOverlap,
  TYPOSQUAT_SUSPICIOUS_TOKENS frozen contract, allowlist-intersection
  guard (caught the tsx conflict on first run)
- tests/scanners/dep-token-overlap.test.mjs (7) — integration via
  in-memory tmpdir fixtures: lodash-utils flagged, react-helper flagged,
  express-wrapper flagged, lodash exact NOT flagged, allowlist tools
  (knip/tsx/nx/rimraf) NOT flagged, react-router-dom (no suspicious
  suffix) NOT flagged, react itself (equal token set, not superset)
  NOT flagged.

Existing dep.test.mjs and supply-chain-recheck.test.mjs unchanged —
all green (149 → 149 regression guard).

Suite: 1570 → 1591 (+21). All green.
This commit is contained in:
Kjell Tore Guttormsen 2026-04-29 14:10:53 +02:00
commit 5f8f2d3c41
5 changed files with 438 additions and 2 deletions

View file

@ -16,7 +16,7 @@ import { existsSync, readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { levenshtein } from './lib/string-utils.mjs';
import { levenshtein, tokenize, tokenOverlap, TYPOSQUAT_SUSPICIOUS_TOKENS } from './lib/string-utils.mjs';
import {
NPM_COMPROMISED, PIP_COMPROMISED, CARGO_COMPROMISED, GEM_COMPROMISED,
isCompromised, extractOSVSeverity, queryOSVBatch, OSV_ECOSYSTEM_MAP,
@ -337,11 +337,55 @@ function checkTyposquatting(deps, topList, topCutoff, ecosystem, lockfile, allow
recommendation:
`Confirm "${dep.name}" is the correct package. Check publish date and author on the registry.`,
}));
} else {
// B7 (v7.2.0): token-overlap fallback. Catches typosquats with edit
// distance >= 3 that contain all tokens of a popular package plus a
// suspicious suffix (e.g. `lodash-utils` vs `lodash`).
const tokenFinding = checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile);
if (tokenFinding) results.push(tokenFinding);
}
}
return results;
}
/**
 * Token-overlap typosquat heuristic (B7): flags a declared dependency whose
 * name contains every token of a popular package plus at least one extra
 * token drawn from TYPOSQUAT_SUSPICIOUS_TOKENS (e.g. `lodash-utils` vs `lodash`).
 *
 * A finding is produced only when all of the following hold:
 *   - the declared name is not itself one of the first `topCutoff` popular names;
 *   - the declared name has at least two distinct tokens;
 *   - the popular package's distinct tokens are a strict subset of the
 *     declared name's tokens;
 *   - at least one of the *added* tokens (not a token the popular package
 *     already has) is in TYPOSQUAT_SUSPICIOUS_TOKENS;
 *   - tokenOverlap(normalized, topPkg) is at least 0.66.
 *
 * @param {{name: string}} dep - Declared dependency; only `name` is read, for messages.
 * @param {string} normalized - Name form used for tokenizing and comparison
 *   (NOTE(review): presumably produced by the caller's normalization — confirm).
 * @param {string[]} topList - Popular package names; only the first `topCutoff` are scanned.
 * @param {number} topCutoff - Maximum number of `topList` entries to consider.
 * @param {string} ecosystem - Ecosystem label interpolated into the finding text.
 * @param {string} lockfile - File name reported as the finding location.
 * @returns {object|null} Result of `finding(...)` for the first matching popular
 *   package, or null when nothing matches.
 */
function checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile) {
  const declaredTokens = new Set(tokenize(normalized));
  if (declaredTokens.size < 2) return null;

  // Cheap pre-filter: without any suspicious token there is nothing to report.
  const suspiciousTokens = TYPOSQUAT_SUSPICIOUS_TOKENS.filter((t) => declaredTokens.has(t));
  if (suspiciousTokens.length === 0) return null;

  const limit = Math.min(topCutoff, topList.length);

  // Exact-match guard runs BEFORE the scan so the outcome does not depend on
  // list order: a popular package such as `react-utils` must never be flagged
  // against a shorter popular package (`react`) that happens to appear first.
  for (let i = 0; i < limit; i++) {
    if (topList[i] === normalized) return null;
  }

  for (let i = 0; i < limit; i++) {
    const topPkg = topList[i];
    // Compare distinct tokens on both sides; mixing an array length with a
    // Set size miscounts popular names that repeat a token.
    const topTokens = new Set(tokenize(topPkg));
    if (topTokens.size === 0) continue;
    // A strict subset must be smaller than the declared token set.
    if (topTokens.size >= declaredTokens.size) continue;
    if (![...topTokens].every((t) => declaredTokens.has(t))) continue;

    // Only tokens ADDED on top of the popular name count as a suspicious
    // suffix; a token the popular package already owns is not evidence.
    const suspiciousSuffixes = suspiciousTokens.filter((t) => !topTokens.has(t));
    if (suspiciousSuffixes.length === 0) continue;

    const overlap = tokenOverlap(normalized, topPkg);
    if (overlap < 0.66) continue;

    return finding({
      scanner: 'SCR',
      severity: SEVERITY.MEDIUM,
      title: `Possible typosquatting via token-overlap: "${dep.name}" vs "${topPkg}"`,
      description:
        `"${dep.name}" in ${lockfile} contains all tokens of the popular ${ecosystem} package ` +
        `"${topPkg}" plus a suspicious suffix (${suspiciousSuffixes.join(', ')}). ` +
        `This is a common typosquat pattern: attackers register popular-name-plus-suffix ` +
        `packages to capture installs.`,
      file: lockfile,
      evidence: `"${dep.name}" tokens ⊃ "${topPkg}" tokens; suffix=${suspiciousSuffixes.join(',')}; overlap=${overlap.toFixed(2)}`,
      owasp: 'LLM03',
      recommendation:
        `Verify "${dep.name}" is intentional. If you meant "${topPkg}", correct the dependency. ` +
        `If "${dep.name}" is a legitimate utility, add it to knowledge/typosquat-allowlist.json under "${ecosystem}".`,
    });
  }
  return null;
}
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------