fix(dep): B7 — token-overlap typosquat heuristic alongside Levenshtein
Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
modern typosquat pattern — popular-name + token-injection suffix. Examples:
lodash → lodash-utils (edit distance 6, not flagged pre-B7)
react → react-helper (edit distance 7, not flagged pre-B7)
express → express-wrapper (edit distance 8, not flagged pre-B7)
Three coordinated edits:
scanners/lib/string-utils.mjs
- Adds tokenize(name): string[] splits on -/_, lowercases
- Adds tokenOverlap(a, b): number intersection.size / min(|a|,|b|)
- Adds TYPOSQUAT_SUSPICIOUS_TOKENS frozen list of common typosquat
suffixes. Excludes language-extension tokens (js, jsx, ts, tsx) — the
v7.0.0 allowlist contains `tsx` as a legit package and including the
same token in the suspicious set creates a contradiction. Caught by
the new allowlist-intersection-guard test. Also excludes 'pro'
(legitimate edition marker).
scanners/dep-auditor.mjs + scanners/supply-chain-recheck.mjs
- New checkTyposquatTokenOverlap() helper — fires AFTER Levenshtein 1/2
branches, only when:
1. popular package's tokens ⊊ declared name's tokens (declared name is a strict superset; equal token sets do not fire)
2. declared name has at least one suspicious suffix
3. popular package is in topCutoff window
All three conditions required — conservative by design. Allowlist
precedence preserved (existing 22 npm + 13 PyPI entries always pass).
MEDIUM severity, NOT block. New finding title prefix:
"Possible typosquatting via token-overlap".
Tests: +21 cases across two new files
- tests/lib/string-utils-tokens.test.mjs (15) — tokenize, tokenOverlap,
TYPOSQUAT_SUSPICIOUS_TOKENS frozen contract, allowlist-intersection
guard (caught the tsx conflict on first run)
- tests/scanners/dep-token-overlap.test.mjs (7) — integration via
in-memory tmpdir fixtures: lodash-utils flagged, react-helper flagged,
express-wrapper flagged, lodash exact NOT flagged, allowlist tools
(knip/tsx/nx/rimraf) NOT flagged, react-router-dom (no suspicious
suffix) NOT flagged, react itself (equal token set, not superset)
NOT flagged.
Existing dep.test.mjs and supply-chain-recheck.test.mjs unchanged —
all green (149 → 149 regression guard).
Suite: 1570 → 1591 (+21). All green.
This commit is contained in:
parent
68b9ea2692
commit
5f8f2d3c41
5 changed files with 438 additions and 2 deletions
110
plugins/llm-security/tests/lib/string-utils-tokens.test.mjs
Normal file
110
plugins/llm-security/tests/lib/string-utils-tokens.test.mjs
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
// string-utils-tokens.test.mjs — B7 (v7.2.0) — tokenize + tokenOverlap helpers
|
||||
//
|
||||
// These helpers are used by dep-auditor and supply-chain-recheck to detect
|
||||
// typosquats with edit distance >= 3 that contain all tokens of a popular
|
||||
// package plus a suspicious suffix (e.g. `lodash-utils` vs `lodash`).
|
||||
//
|
||||
// Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
|
||||
// modern typosquat pattern — popular-name + token-injection suffix.
|
||||
|
||||
import { describe, it } from 'node:test';
|
||||
import assert from 'node:assert/strict';
|
||||
import {
|
||||
tokenize,
|
||||
tokenOverlap,
|
||||
TYPOSQUAT_SUSPICIOUS_TOKENS,
|
||||
} from '../../scanners/lib/string-utils.mjs';
|
||||
|
||||
// tokenize(name): the cases below pin splitting on '-' and '_', lowercasing,
// dropping of empty tokens, and an empty-array result for empty/nullish input.
describe('tokenize (B7)', () => {
  it('splits on hyphens', () => {
    const tokens = tokenize('lodash-utils');
    assert.deepEqual(tokens, ['lodash', 'utils']);
  });

  it('splits on underscores', () => {
    const tokens = tokenize('react_helper');
    assert.deepEqual(tokens, ['react', 'helper']);
  });

  it('splits on mixed separators', () => {
    const tokens = tokenize('foo-bar_baz');
    assert.deepEqual(tokens, ['foo', 'bar', 'baz']);
  });

  it('lowercases tokens', () => {
    const tokens = tokenize('LODASH-Utils');
    assert.deepEqual(tokens, ['lodash', 'utils']);
  });

  it('drops empty tokens from consecutive separators', () => {
    // A doubled separator must not produce an empty-string token.
    const tokens = tokenize('foo--bar');
    assert.deepEqual(tokens, ['foo', 'bar']);
  });

  it('returns empty array for empty input', () => {
    // Empty string and nullish values are all expected to yield [].
    for (const blank of ['', null, undefined]) {
      assert.deepEqual(tokenize(blank), []);
    }
  });

  it('returns single-element array for token with no separators', () => {
    const tokens = tokenize('lodash');
    assert.deepEqual(tokens, ['lodash']);
  });
});
|
||||
|
||||
// tokenOverlap(a, b): the cases below pin the expected ratio for subset,
// identical, half-overlapping, disjoint, and empty inputs.
describe('tokenOverlap (B7)', () => {
  it('returns 1.0 when popular tokens are subset of declared', () => {
    const lodashScore = tokenOverlap('lodash-utils', 'lodash');
    assert.equal(lodashScore, 1.0);

    const reactScore = tokenOverlap('react-router-dom', 'react');
    assert.equal(reactScore, 1.0);
  });

  it('returns 1.0 for identical token sets', () => {
    const score = tokenOverlap('lodash', 'lodash');
    assert.equal(score, 1.0);
  });

  it('returns 0.5 for half-overlap', () => {
    // 'react-helper' → {react, helper}; 'react-router' → {react, router}.
    // Shared tokens: {react}; smaller set has 2 tokens → 1 / 2 = 0.5.
    const score = tokenOverlap('react-helper', 'react-router');
    assert.equal(score, 0.5);
  });

  it('returns 0 for disjoint tokens', () => {
    const score = tokenOverlap('foo', 'bar');
    assert.equal(score, 0);
  });

  it('returns 0 for empty inputs', () => {
    // Either side being empty must give 0.
    const emptyFirst = tokenOverlap('', 'lodash');
    assert.equal(emptyFirst, 0);

    const emptySecond = tokenOverlap('lodash', '');
    assert.equal(emptySecond, 0);
  });

  it('is symmetric for sets of same size', () => {
    // Swapping the arguments must not change the score.
    const forward = tokenOverlap('foo-bar', 'foo-baz');
    const backward = tokenOverlap('foo-baz', 'foo-bar');
    assert.equal(forward, backward);
  });
});
|
||||
|
||||
// Contract tests for the exported TYPOSQUAT_SUSPICIOUS_TOKENS list: it must
// contain the common typosquat suffixes, be immutable, and never overlap with
// legitimate tool names.
describe('TYPOSQUAT_SUSPICIOUS_TOKENS (B7)', () => {
  it('contains common typosquat suffixes', () => {
    const expectedSuffixes = ['utils', 'helper', 'core', 'plus', 'cli', 'wrapper'];
    for (const t of expectedSuffixes) {
      assert.ok(
        TYPOSQUAT_SUSPICIOUS_TOKENS.includes(t),
        `expected '${t}' in TYPOSQUAT_SUSPICIOUS_TOKENS`,
      );
    }
  });

  it('is frozen (cannot be mutated)', () => {
    // Assert frozenness directly: a bare assert.throws() is satisfied by ANY
    // error (for example a missing .push method), so on its own it does not
    // prove that the list is actually frozen.
    assert.ok(
      Object.isFrozen(TYPOSQUAT_SUSPICIOUS_TOKENS),
      'TYPOSQUAT_SUSPICIOUS_TOKENS must be frozen',
    );
    // Appending to a frozen array throws a TypeError; pin the error type so
    // that unrelated failures cannot make this test pass.
    assert.throws(() => {
      TYPOSQUAT_SUSPICIOUS_TOKENS.push('newtoken');
    }, TypeError);
  });

  it('does NOT include legitimate short-name tools (allowlist intersection guard)', () => {
    // These are legitimate package names from the v7.0.0 typosquat allowlist.
    // The token-overlap heuristic would never flag them as suspicious because
    // they don't decompose into multiple tokens that include a popular package.
    // Still, guard the suspicious-tokens list against accidentally including them.
    const legitimateTools = ['knip', 'tsx', 'nx', 'uv', 'ruff', 'oxlint', 'rimraf'];
    for (const t of legitimateTools) {
      assert.ok(
        !TYPOSQUAT_SUSPICIOUS_TOKENS.includes(t),
        `'${t}' must not be in TYPOSQUAT_SUSPICIOUS_TOKENS — it is a legitimate tool`,
      );
    }
  });
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue