fix(dep): B7 — token-overlap typosquat heuristic alongside Levenshtein

Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
modern typosquat pattern — popular-name + token-injection suffix. Examples:
  lodash → lodash-utils    (edit distance 6, not flagged pre-B7)
  react  → react-helper    (edit distance 7, not flagged pre-B7)
  express → express-wrapper (edit distance 8, not flagged pre-B7)

Three coordinated edits:

scanners/lib/string-utils.mjs
- Adds tokenize(name): string[]    splits on -/_, lowercases
- Adds tokenOverlap(a, b): number  intersection.size / min(|a|,|b|)
- Adds TYPOSQUAT_SUSPICIOUS_TOKENS frozen list of common typosquat
  suffixes. Excludes language-extension tokens (js, jsx, ts, tsx) — the
  v7.0.0 allowlist contains `tsx` as a legit package and including the
  same token in the suspicious set creates a contradiction. Caught by
  the new allowlist-intersection-guard test. Also excludes 'pro'
  (legitimate edition marker).

scanners/dep-auditor.mjs + scanners/supply-chain-recheck.mjs
- New checkTyposquatTokenOverlap() helper — fires AFTER Levenshtein 1/2
  branches, only when:
    1. popular package's tokens ⊊ declared name's tokens (declared name is a strict superset)
    2. declared name has at least one suspicious suffix
    3. popular package is in topCutoff window
  All three conditions required — conservative by design. Allowlist
  precedence preserved (existing 22 npm + 13 PyPI entries always pass).
  MEDIUM severity, NOT block. New finding title prefix:
  "Possible typosquatting via token-overlap".

Tests: +21 cases across two new files
- tests/lib/string-utils-tokens.test.mjs (15) — tokenize, tokenOverlap,
  TYPOSQUAT_SUSPICIOUS_TOKENS frozen contract, allowlist-intersection
  guard (caught the tsx conflict on first run)
- tests/scanners/dep-token-overlap.test.mjs (7) — integration via
  in-memory tmpdir fixtures: lodash-utils flagged, react-helper flagged,
  express-wrapper flagged, lodash exact NOT flagged, allowlist tools
  (knip/tsx/nx/rimraf) NOT flagged, react-router-dom (no suspicious
  suffix) NOT flagged, react itself (equal token set, not superset)
  NOT flagged.

Existing dep.test.mjs and supply-chain-recheck.test.mjs unchanged —
all green (149 → 149 regression guard).

Suite: 1570 → 1591 (+21). All green.
This commit is contained in:
Kjell Tore Guttormsen 2026-04-29 14:10:53 +02:00
commit 5f8f2d3c41
5 changed files with 438 additions and 2 deletions

View file

@ -6,7 +6,7 @@
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { levenshtein } from './lib/string-utils.mjs';
import { levenshtein, tokenize, tokenOverlap, TYPOSQUAT_SUSPICIOUS_TOKENS } from './lib/string-utils.mjs';
import { readFile } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { existsSync } from 'node:fs';
@ -341,6 +341,72 @@ function checkTyposquatting(declaredName, topList, top200Cutoff, ecosystem, sour
});
}
// B7 (v7.2.0): token-overlap heuristic. Flags typosquats that Levenshtein
// misses because the attacker added a suspicious suffix token (e.g.
// `lodash-utils` vs `lodash`, edit distance 6). Conservative — requires
// BOTH a high token-overlap (entire popular name's tokens are a subset
// of the declared name) AND at least one suspicious suffix token.
const tokenOverlapFinding = checkTyposquatTokenOverlap(
declaredName,
topList,
top200Cutoff,
ecosystem,
sourceFile,
);
if (tokenOverlapFinding) return tokenOverlapFinding;
return null;
}
/**
 * B7 token-overlap heuristic, complementary to the Levenshtein checks.
 *
 * Returns a MEDIUM finding for the first popular package inside the
 * `top200Cutoff` window of `topList` for which:
 *   1. every token of the popular package also appears in the declared name
 *      and the declared name carries at least one additional token
 *      (declared tokens are a strict superset), AND
 *   2. at least one of those ADDITIONAL tokens is listed in
 *      TYPOSQUAT_SUSPICIOUS_TOKENS (`utils`, `helper`, `cli`, `wrapper`, ...).
 *
 * Tokens are compared as sets, so the position of the suspicious token inside
 * the declared name is not checked.
 *
 * A declared name that is itself an entry of `topList` is never flagged,
 * regardless of where it sits relative to a shorter popular name
 * (e.g. a listed `webpack-cli` is not reported as a squat of `webpack`).
 *
 * Allowlist handling is expected to happen in the caller before this helper
 * runs; nothing here consults the allowlist.
 *
 * @param {string} declaredName - dependency name as declared in the manifest
 * @param {string[]} topList - popular package names, most popular first
 * @param {number} top200Cutoff - how many leading `topList` entries to compare against
 * @param {string} ecosystem - ecosystem label interpolated into the finding text
 * @param {string} sourceFile - manifest path reported as the finding's `file`
 * @returns {object|null} the value produced by `finding(...)`, or null when nothing matches
 */
function checkTyposquatTokenOverlap(declaredName, topList, top200Cutoff, ecosystem, sourceFile) {
  const declaredTokens = new Set(tokenize(declaredName));
  if (declaredTokens.size < 2) return null;

  // Cheap pre-filter: a name without any suspicious token can never match.
  const suspiciousDeclared = TYPOSQUAT_SUSPICIOUS_TOKENS.filter((t) => declaredTokens.has(t));
  if (suspiciousDeclared.length === 0) return null;

  // A name that is itself on the popular list is legitimate. Checking this up
  // front (instead of inside the loop) makes the result independent of the
  // order in which related popular names appear in the list.
  if (topList.includes(declaredName)) return null;

  const limit = Math.min(top200Cutoff, topList.length);
  for (let i = 0; i < limit; i++) {
    const topPkg = topList[i];
    // Compare set against set so duplicate tokens in a popular name
    // (e.g. `a-a`) cannot distort the strict-superset test.
    const topTokens = new Set(tokenize(topPkg));
    if (topTokens.size === 0) continue;
    if (topTokens.size >= declaredTokens.size) continue; // not a strict subset
    if (![...topTokens].every((t) => declaredTokens.has(t))) continue;

    // Only tokens ADDED on top of the popular name count as suspicious;
    // tokens that belong to the popular name itself (`core` in `core-js`)
    // must not trigger a finding.
    const suspiciousSuffixes = suspiciousDeclared.filter((t) => !topTokens.has(t));
    if (suspiciousSuffixes.length === 0) continue;

    // With full containment the ratio is always 1.0; the threshold is kept as
    // a guard in case the containment rule above is ever relaxed.
    const overlap = tokenOverlap(declaredName, topPkg);
    if (overlap < 0.66) continue;

    return finding({
      scanner: 'DEP',
      severity: SEVERITY.MEDIUM,
      title: `Possible typosquatting via token-overlap: "${declaredName}" vs "${topPkg}"`,
      description:
        `The declared ${ecosystem} package "${declaredName}" contains all tokens of the ` +
        `popular package "${topPkg}" plus a suspicious suffix (${suspiciousSuffixes.join(', ')}). ` +
        `This is a common typosquat pattern: attackers register popular-name-plus-suffix ` +
        `packages to capture installs from users misremembering the canonical name.`,
      file: sourceFile,
      evidence: `"${declaredName}" tokens ⊃ "${topPkg}" tokens; suffix=${suspiciousSuffixes.join(',')}; overlap=${overlap.toFixed(2)}`,
      owasp: 'LLM03',
      recommendation:
        `Verify that "${declaredName}" is intentional. If you meant "${topPkg}", ` +
        `correct the dependency name. If "${declaredName}" is a legitimate utility ` +
        `package, add it to knowledge/typosquat-allowlist.json under "${ecosystem}".`,
    });
  }
  return null;
}

View file

@ -54,6 +54,72 @@ export function levenshtein(a, b) {
return prev[n];
}
/**
 * Break a package name into its lowercase tokens.
 *
 * `-` and `_` act as separators; runs of separators (and leading/trailing
 * ones) never produce empty tokens. One-character tokens are preserved.
 * Any falsy input (`''`, `null`, `undefined`) yields an empty array.
 *
 * Used by the B7 typosquat token-overlap heuristic.
 *
 * @param {string} name - package name to split
 * @returns {string[]} lowercase tokens in order of appearance
 */
export function tokenize(name) {
  if (!name) return [];
  // Every maximal run of non-separator characters is one token.
  const tokens = name.toLowerCase().match(/[^-_]+/g);
  return tokens === null ? [] : tokens;
}
/**
 * Ratio of shared tokens between two package names.
 *
 * Both names are tokenized and de-duplicated; the result is the number of
 * tokens present in both sets divided by the size of the smaller set.
 * If either name produces no tokens the result is 0.
 *
 * Examples:
 *   tokenOverlap('lodash-utils', 'lodash')       → 1.0
 *   tokenOverlap('react-router-dom', 'react')    → 1.0
 *   tokenOverlap('react-helper', 'react-router') → 0.5
 *   tokenOverlap('foo', 'bar')                   → 0.0
 *
 * B7 (v7.2.0) uses this next to Levenshtein: Levenshtein <=2 catches small
 * typos, token-overlap catches popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} value in the range 0..1
 */
export function tokenOverlap(a, b) {
  const left = new Set(tokenize(a));
  const right = new Set(tokenize(b));

  const smaller = Math.min(left.size, right.size);
  if (smaller === 0) return 0;

  const shared = [...left].filter((token) => right.has(token)).length;
  return shared / smaller;
}
/**
 * Tokens that typosquats commonly bolt onto a popular package name.
 * Exported at module level so every B7 caller shares one frozen list.
 *
 * Deliberately NOT listed (they would clash with the v7.0.0 typosquat
 * allowlist or produce false positives on legitimate packages):
 *   - `js`, `jsx`, `ts`, `tsx`: language-extension tokens carried by many
 *     legitimate packages. The v7.0.0 allowlist contains `tsx` itself, so
 *     listing it here would make the two lists contradict each other.
 *   - `pro`: too common as a legitimate edition marker.
 *
 * What remains are utility/helper dressing, wrapper/shim style names, and
 * tool/cli/sdk/kit qualifiers.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  // utility / helper dressing
  'utils',
  'util',
  'helper',
  'helpers',
  'core',
  'plus',
  'extra',
  'extras',
  // executables and tooling
  'bin',
  'cli',
  'tool',
  'tools',
  // wrappers, libraries, kits
  'wrapper',
  'wrappers',
  'lib',
  'libs',
  'kit',
  'sdk',
  'shim',
]);
/**
* Check if a string looks like base64-encoded data.
* @param {string} s

View file

@ -16,7 +16,7 @@ import { existsSync, readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { levenshtein } from './lib/string-utils.mjs';
import { levenshtein, tokenize, tokenOverlap, TYPOSQUAT_SUSPICIOUS_TOKENS } from './lib/string-utils.mjs';
import {
NPM_COMPROMISED, PIP_COMPROMISED, CARGO_COMPROMISED, GEM_COMPROMISED,
isCompromised, extractOSVSeverity, queryOSVBatch, OSV_ECOSYSTEM_MAP,
@ -337,11 +337,55 @@ function checkTyposquatting(deps, topList, topCutoff, ecosystem, lockfile, allow
recommendation:
`Confirm "${dep.name}" is the correct package. Check publish date and author on the registry.`,
}));
} else {
// B7 (v7.2.0): token-overlap fallback. Catches typosquats with edit
// distance >= 3 that contain all tokens of a popular package plus a
// suspicious suffix (e.g. `lodash-utils` vs `lodash`).
const tokenFinding = checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile);
if (tokenFinding) results.push(tokenFinding);
}
}
return results;
}
/**
 * B7 token-overlap fallback for lockfile dependencies, used when the
 * Levenshtein branches did not produce a finding.
 *
 * Returns a MEDIUM finding for the first popular package inside the
 * `topCutoff` window of `topList` for which:
 *   1. every token of the popular package also appears in `normalized` and
 *      `normalized` carries at least one additional token (strict superset), AND
 *   2. at least one of those ADDITIONAL tokens is listed in
 *      TYPOSQUAT_SUSPICIOUS_TOKENS.
 *
 * Tokens are compared as sets, so token position is not checked. A name that
 * is itself an entry of `topList` is never flagged. Allowlist handling is
 * expected to happen in the caller; nothing here consults the allowlist.
 *
 * @param {{name: string}} dep - dependency record; only `dep.name` is read (for messages)
 * @param {string} normalized - form of the dependency name used for comparison
 *   (presumably a normalized `dep.name`; the normalization is done by the caller)
 * @param {string[]} topList - popular package names, most popular first
 * @param {number} topCutoff - how many leading `topList` entries to compare against
 * @param {string} ecosystem - ecosystem label interpolated into the finding text
 * @param {string} lockfile - lockfile path reported as the finding's `file`
 * @returns {object|null} the value produced by `finding(...)`, or null when nothing matches
 */
function checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile) {
  const declaredTokens = new Set(tokenize(normalized));
  if (declaredTokens.size < 2) return null;

  // Cheap pre-filter: a name without any suspicious token can never match.
  const suspiciousDeclared = TYPOSQUAT_SUSPICIOUS_TOKENS.filter((t) => declaredTokens.has(t));
  if (suspiciousDeclared.length === 0) return null;

  // A name that is itself on the popular list is legitimate. Checking this up
  // front (instead of inside the loop) makes the result independent of the
  // order in which related popular names appear in the list.
  if (topList.includes(normalized)) return null;

  const limit = Math.min(topCutoff, topList.length);
  for (let i = 0; i < limit; i++) {
    const topPkg = topList[i];
    // Compare set against set so duplicate tokens in a popular name
    // cannot distort the strict-superset test.
    const topTokens = new Set(tokenize(topPkg));
    if (topTokens.size === 0) continue;
    if (topTokens.size >= declaredTokens.size) continue; // not a strict subset
    if (![...topTokens].every((t) => declaredTokens.has(t))) continue;

    // Only tokens ADDED on top of the popular name count as suspicious;
    // tokens that belong to the popular name itself must not trigger a finding.
    const suspiciousSuffixes = suspiciousDeclared.filter((t) => !topTokens.has(t));
    if (suspiciousSuffixes.length === 0) continue;

    // With full containment the ratio is always 1.0; the threshold is kept as
    // a guard in case the containment rule above is ever relaxed.
    const overlap = tokenOverlap(normalized, topPkg);
    if (overlap < 0.66) continue;

    return finding({
      scanner: 'SCR',
      severity: SEVERITY.MEDIUM,
      title: `Possible typosquatting via token-overlap: "${dep.name}" vs "${topPkg}"`,
      description:
        `"${dep.name}" in ${lockfile} contains all tokens of the popular ${ecosystem} package ` +
        `"${topPkg}" plus a suspicious suffix (${suspiciousSuffixes.join(', ')}). ` +
        `This is a common typosquat pattern: attackers register popular-name-plus-suffix ` +
        `packages to capture installs.`,
      file: lockfile,
      evidence: `"${dep.name}" tokens ⊃ "${topPkg}" tokens; suffix=${suspiciousSuffixes.join(',')}; overlap=${overlap.toFixed(2)}`,
      owasp: 'LLM03',
      recommendation:
        `Verify "${dep.name}" is intentional. If you meant "${topPkg}", correct the dependency. ` +
        `If "${dep.name}" is a legitimate utility, add it to knowledge/typosquat-allowlist.json under "${ecosystem}".`,
    });
  }
  return null;
}
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------

View file

@ -0,0 +1,110 @@
// string-utils-tokens.test.mjs — B7 (v7.2.0) — tokenize + tokenOverlap helpers
//
// These helpers are used by dep-auditor and supply-chain-recheck to detect
// typosquats with edit distance >= 3 that contain all tokens of a popular
// package plus a suspicious suffix (e.g. `lodash-utils` vs `lodash`).
//
// Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
// modern typosquat pattern — popular-name + token-injection suffix.
import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import {
tokenize,
tokenOverlap,
TYPOSQUAT_SUSPICIOUS_TOKENS,
} from '../../scanners/lib/string-utils.mjs';
describe('tokenize (B7)', () => {
  // Each row: test title, followed by one or more [input, expected tokens] pairs.
  const scenarios = [
    ['splits on hyphens', [['lodash-utils', ['lodash', 'utils']]]],
    ['splits on underscores', [['react_helper', ['react', 'helper']]]],
    ['splits on mixed separators', [['foo-bar_baz', ['foo', 'bar', 'baz']]]],
    ['lowercases tokens', [['LODASH-Utils', ['lodash', 'utils']]]],
    ['drops empty tokens from consecutive separators', [['foo--bar', ['foo', 'bar']]]],
    ['returns empty array for empty input', [['', []], [null, []], [undefined, []]]],
    ['returns single-element array for token with no separators', [['lodash', ['lodash']]]],
  ];

  for (const [title, pairs] of scenarios) {
    it(title, () => {
      for (const [input, expected] of pairs) {
        assert.deepEqual(tokenize(input), expected);
      }
    });
  }
});
describe('tokenOverlap (B7)', () => {
  // Assert that the overlap ratio of (a, b) is exactly `ratio`.
  const expectOverlap = (a, b, ratio) => {
    assert.equal(tokenOverlap(a, b), ratio);
  };

  it('returns 1.0 when popular tokens are subset of declared', () => {
    expectOverlap('lodash-utils', 'lodash', 1.0);
    expectOverlap('react-router-dom', 'react', 1.0);
  });

  it('returns 1.0 for identical token sets', () => {
    expectOverlap('lodash', 'lodash', 1.0);
  });

  it('returns 0.5 for half-overlap', () => {
    // {react, helper} vs {react, router}: one shared token out of a
    // smaller-set size of two gives 1/2.
    expectOverlap('react-helper', 'react-router', 0.5);
  });

  it('returns 0 for disjoint tokens', () => {
    expectOverlap('foo', 'bar', 0);
  });

  it('returns 0 for empty inputs', () => {
    expectOverlap('', 'lodash', 0);
    expectOverlap('lodash', '', 0);
  });

  it('is symmetric for sets of same size', () => {
    const forward = tokenOverlap('foo-bar', 'foo-baz');
    const backward = tokenOverlap('foo-baz', 'foo-bar');
    assert.equal(forward, backward);
  });
});
describe('TYPOSQUAT_SUSPICIOUS_TOKENS (B7)', () => {
  it('contains common typosquat suffixes', () => {
    const mustBeListed = ['utils', 'helper', 'core', 'plus', 'cli', 'wrapper'];
    mustBeListed.forEach((token) => {
      assert.ok(
        TYPOSQUAT_SUSPICIOUS_TOKENS.includes(token),
        `expected '${token}' in TYPOSQUAT_SUSPICIOUS_TOKENS`,
      );
    });
  });

  it('is frozen (cannot be mutated)', () => {
    const mutate = () => {
      TYPOSQUAT_SUSPICIOUS_TOKENS.push('newtoken');
    };
    assert.throws(mutate);
  });

  it('does NOT include legitimate short-name tools (allowlist intersection guard)', () => {
    // Legitimate package names taken from the v7.0.0 typosquat allowlist.
    // As single-token names they do not decompose into several tokens, so the
    // token-overlap heuristic does not apply to them as declared names; this
    // test guards the suspicious-token list itself against ever listing one.
    const legitimateTools = ['knip', 'tsx', 'nx', 'uv', 'ruff', 'oxlint', 'rimraf'];
    legitimateTools.forEach((token) => {
      assert.ok(
        !TYPOSQUAT_SUSPICIOUS_TOKENS.includes(token),
        `'${token}' must not be in TYPOSQUAT_SUSPICIOUS_TOKENS — it is a legitimate tool`,
      );
    });
  });
});

View file

@ -0,0 +1,150 @@
// dep-token-overlap.test.mjs — B7 (v7.2.0) — typosquat token-overlap integration
//
// Verifies that dep-auditor's checkTyposquatting now flags packages with
// edit distance >= 3 that contain all tokens of a popular package plus a
// suspicious suffix (e.g. `lodash-utils` vs `lodash`).
//
// Builds a throwaway on-disk fixture under the OS tmpdir per test instead of
// mutating the shared `tests/fixtures/dep-test/` fixture (which other tests
// assert exact finding counts against).
import { describe, it, before, after, beforeEach } from 'node:test';
import assert from 'node:assert/strict';
import { mkdtemp, writeFile, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { resetCounter } from '../../scanners/lib/output.mjs';
import { scan } from '../../scanners/dep-auditor.mjs';
/**
 * Create a fresh temporary project directory containing only a
 * `package.json` whose `dependencies` field is `deps`.
 *
 * The caller owns the directory and is responsible for removing it.
 *
 * @param {Record<string, string>} deps - dependency name → version range
 * @returns {Promise<string>} absolute path of the created directory
 */
async function makeFixture(deps) {
  const fixtureRoot = await mkdtemp(join(tmpdir(), 'llm-security-dep-token-'));

  const manifest = {
    name: 'token-overlap-fixture',
    version: '1.0.0',
    dependencies: deps,
  };
  const manifestPath = join(fixtureRoot, 'package.json');
  await writeFile(manifestPath, JSON.stringify(manifest, null, 2), 'utf8');

  return fixtureRoot;
}
describe('dep-auditor B7 — token-overlap typosquat heuristic', () => {
  beforeEach(() => resetCounter());

  // Scan a throwaway fixture that declares `deps`, pass the scan result to
  // `check`, and always remove the fixture directory afterwards.
  async function withScan(deps, check) {
    const dir = await makeFixture(deps);
    try {
      const result = await scan(dir, { files: [] });
      check(result);
    } finally {
      await rm(dir, { recursive: true, force: true });
    }
  }

  // Predicate: the finding title contains `needle`, ignoring the title's case.
  const titleMentions = (needle) => (f) => f.title.toLowerCase().includes(needle);

  // All finding titles joined for use in assertion messages.
  const listTitles = (findings) => findings.map((f) => f.title).join('; ');

  it('flags lodash-utils as token-overlap typosquat of lodash', async () => {
    await withScan({ 'lodash-utils': '^1.0.0' }, (result) => {
      const isTokenOverlap = titleMentions('token-overlap');
      const finding = result.findings.find(
        (f) => isTokenOverlap(f) && f.title.includes('lodash-utils') && f.title.includes('lodash'),
      );
      assert.ok(
        finding,
        `expected token-overlap finding for lodash-utils. Got: ${listTitles(result.findings)}`,
      );
      assert.equal(finding.severity, 'medium');
    });
  });

  it('flags react-helper as token-overlap typosquat of react', async () => {
    await withScan({ 'react-helper': '^1.0.0' }, (result) => {
      const isTokenOverlap = titleMentions('token-overlap');
      const finding = result.findings.find(
        (f) => isTokenOverlap(f) && f.title.includes('react-helper'),
      );
      assert.ok(finding, `expected react-helper to be flagged. Got: ${listTitles(result.findings)}`);
    });
  });

  it('flags express-wrapper as token-overlap typosquat of express', async () => {
    await withScan({ 'express-wrapper': '^1.0.0' }, (result) => {
      const isTokenOverlap = titleMentions('token-overlap');
      const finding = result.findings.find(
        (f) => isTokenOverlap(f) && f.title.includes('express-wrapper'),
      );
      assert.ok(finding, `expected express-wrapper to be flagged. Got: ${listTitles(result.findings)}`);
    });
  });

  it('does NOT flag legitimate package lodash (exact match)', async () => {
    await withScan({ 'lodash': '^4.17.0' }, (result) => {
      const typosquatFindings = result.findings.filter(titleMentions('typosquat'));
      assert.equal(
        typosquatFindings.length,
        0,
        `lodash must not be flagged. Got: ${listTitles(typosquatFindings)}`,
      );
    });
  });

  it('does NOT flag legitimate short-name tools from allowlist', async () => {
    // These names are expected to be listed in knowledge/typosquat-allowlist.json;
    // the allowlist is meant to short-circuit both the Levenshtein and the
    // token-overlap checks.
    const allowlistedTools = {
      'knip': '^5.0.0',
      'tsx': '^4.0.0',
      'nx': '^17.0.0',
      'rimraf': '^5.0.0',
    };
    await withScan(allowlistedTools, (result) => {
      const typosquatFindings = result.findings.filter(titleMentions('typosquat'));
      assert.equal(
        typosquatFindings.length,
        0,
        `allowlisted short-name tools must not be flagged. Got: ${listTitles(typosquatFindings)}`,
      );
    });
  });

  it('does NOT flag packages with no suspicious suffix even with token overlap', async () => {
    // `react-router-dom` shares the `react` token, but neither `router` nor
    // `dom` is in TYPOSQUAT_SUSPICIOUS_TOKENS, so this legitimate ecosystem
    // name must pass.
    await withScan({ 'react-router-dom': '^6.0.0' }, (result) => {
      const tokenOverlapFindings = result.findings.filter(titleMentions('token-overlap'));
      assert.equal(
        tokenOverlapFindings.length,
        0,
        `react-router-dom must not be flagged as token-overlap typosquat`,
      );
    });
  });

  it('does NOT flag packages whose all tokens match a popular package (subset, not superset)', async () => {
    // `react` tokenizes to {react}, which EQUALS the popular `react` token set.
    // The heuristic only fires when the declared tokens are a strict superset
    // of the popular tokens, so an equal set must not be flagged.
    await withScan({ 'react': '^18.0.0' }, (result) => {
      const tokenOverlapFindings = result.findings.filter(titleMentions('token-overlap'));
      assert.equal(tokenOverlapFindings.length, 0);
    });
  });
});