fix(dep): B7 — token-overlap typosquat heuristic alongside Levenshtein
Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
modern typosquat pattern — popular-name + token-injection suffix. Examples:
lodash → lodash-utils (edit distance 6, not flagged pre-B7)
react → react-helper (edit distance 7, not flagged pre-B7)
express → express-wrapper (edit distance 8, not flagged pre-B7)
Three coordinated edits:
scanners/lib/string-utils.mjs
- Adds tokenize(name): string[] splits on -/_, lowercases
- Adds tokenOverlap(a, b): number intersection.size / min(|a|,|b|)
- Adds TYPOSQUAT_SUSPICIOUS_TOKENS frozen list of common typosquat
suffixes. Excludes language-extension tokens (js, jsx, ts, tsx) — the
v7.0.0 allowlist contains `tsx` as a legit package and including the
same token in the suspicious set creates a contradiction. Caught by
the new allowlist-intersection-guard test. Also excludes 'pro'
(legitimate edition marker).
scanners/dep-auditor.mjs + scanners/supply-chain-recheck.mjs
- New checkTyposquatTokenOverlap() helper — fires AFTER Levenshtein 1/2
branches, only when:
1. popular package's tokens ⊊ declared name's tokens (the declared name is a strict superset; equal token sets are skipped)
2. declared name has at least one suspicious suffix
3. popular package is in topCutoff window
All three conditions required — conservative by design. Allowlist
precedence preserved (existing 22 npm + 13 PyPI entries always pass).
MEDIUM severity, NOT block. New finding title prefix:
"Possible typosquatting via token-overlap".
Tests: +21 cases across two new files
- tests/lib/string-utils-tokens.test.mjs (15) — tokenize, tokenOverlap,
TYPOSQUAT_SUSPICIOUS_TOKENS frozen contract, allowlist-intersection
guard (caught the tsx conflict on first run)
- tests/scanners/dep-token-overlap.test.mjs (7) — integration via
in-memory tmpdir fixtures: lodash-utils flagged, react-helper flagged,
express-wrapper flagged, lodash exact NOT flagged, allowlist tools
(knip/tsx/nx/rimraf) NOT flagged, react-router-dom (no suspicious
suffix) NOT flagged, react itself (equal token set, not superset)
NOT flagged.
Existing dep.test.mjs and supply-chain-recheck.test.mjs unchanged —
all green (149 → 149 regression guard).
Suite: 1570 → 1591 (+21). All green.
This commit is contained in:
parent
68b9ea2692
commit
5f8f2d3c41
5 changed files with 438 additions and 2 deletions
|
|
@ -6,7 +6,7 @@
|
|||
|
||||
import { finding, scannerResult } from './lib/output.mjs';
|
||||
import { SEVERITY } from './lib/severity.mjs';
|
||||
import { levenshtein } from './lib/string-utils.mjs';
|
||||
import { levenshtein, tokenize, tokenOverlap, TYPOSQUAT_SUSPICIOUS_TOKENS } from './lib/string-utils.mjs';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { join, dirname } from 'node:path';
|
||||
import { existsSync } from 'node:fs';
|
||||
|
|
@ -341,6 +341,72 @@ function checkTyposquatting(declaredName, topList, top200Cutoff, ecosystem, sour
|
|||
});
|
||||
}
|
||||
|
||||
// B7 (v7.2.0): token-overlap heuristic. Flags typosquats that Levenshtein
|
||||
// misses because the attacker added a suspicious suffix token (e.g.
|
||||
// `lodash-utils` vs `lodash`, edit distance 6). Conservative — requires
|
||||
// BOTH a high token-overlap (entire popular name's tokens are a subset
|
||||
// of the declared name) AND at least one suspicious suffix token.
|
||||
const tokenOverlapFinding = checkTyposquatTokenOverlap(
|
||||
declaredName,
|
||||
topList,
|
||||
top200Cutoff,
|
||||
ecosystem,
|
||||
sourceFile,
|
||||
);
|
||||
if (tokenOverlapFinding) return tokenOverlapFinding;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * B7 token-overlap heuristic — complementary to Levenshtein. Returns a
 * MEDIUM finding when:
 *   1. The declared name's tokens are a strict superset of a popular
 *      package's tokens (equal token sets are skipped), AND
 *   2. At least one of the *extra* tokens — those the declared name adds on
 *      top of the popular name — is listed in TYPOSQUAT_SUSPICIOUS_TOKENS
 *      (`utils`, `helper`, `cli`, `wrapper`, etc.), AND
 *   3. The popular package is in the top200Cutoff window.
 *
 * Tokens are compared as sets, so the position of the added token is not
 * checked: a suspicious token counts whether it is a prefix or a suffix.
 *
 * Returns null if no match, or if the declared name is itself one of the
 * packages inside the cutoff window. Allowlist precedence is enforced by the
 * caller (returning null at the top of `checkTyposquatting` before this is
 * reached).
 *
 * @param {string} declaredName  Dependency name as declared in the manifest.
 * @param {string[]} topList     Popular package names; only the first
 *                               `top200Cutoff` entries are considered.
 * @param {number} top200Cutoff  Size of the popularity window to scan.
 * @param {string} ecosystem     Ecosystem label used in the finding text.
 * @param {string} sourceFile    File the dependency was declared in.
 * @returns {object|null} The value returned by `finding(...)`, or null.
 */
function checkTyposquatTokenOverlap(declaredName, topList, top200Cutoff, ecosystem, sourceFile) {
  const declaredTokens = new Set(tokenize(declaredName));
  if (declaredTokens.size < 2) return null;

  // Cheap pre-filter: without any suspicious token there is nothing to report.
  const suspiciousDeclared = TYPOSQUAT_SUSPICIOUS_TOKENS.filter((t) => declaredTokens.has(t));
  if (suspiciousDeclared.length === 0) return null;

  const limit = Math.max(0, Math.min(top200Cutoff, topList.length));
  const candidates = topList.slice(0, limit);

  // A name that is itself inside the window is never reported. This must be
  // decided before scanning: checking it inside the loop made the result
  // depend on list order (e.g. `eslint` listed ahead of `eslint-utils`).
  if (candidates.includes(declaredName)) return null;

  for (const topPkg of candidates) {
    // Deduplicate so the size comparison below is set-vs-set.
    const topTokens = new Set(tokenize(topPkg));
    if (topTokens.size === 0) continue;
    // Equal (or larger) token sets cannot be a strict subset of the declared name.
    if (topTokens.size >= declaredTokens.size) continue;
    if (![...topTokens].every((t) => declaredTokens.has(t))) continue;

    // Only tokens ADDED on top of the popular name count as a suspicious
    // suffix. Otherwise `core-js-pure` would be flagged against `core-js`
    // merely because the popular name itself contains `core`.
    const suspiciousSuffixes = suspiciousDeclared.filter((t) => !topTokens.has(t));
    if (suspiciousSuffixes.length === 0) continue;

    // With the subset check above this ratio is currently always 1.0; the
    // threshold is kept as a guard and the value is reported in the evidence.
    const overlap = tokenOverlap(declaredName, topPkg);
    if (overlap < 0.66) continue;

    return finding({
      scanner: 'DEP',
      severity: SEVERITY.MEDIUM,
      title: `Possible typosquatting via token-overlap: "${declaredName}" vs "${topPkg}"`,
      description:
        `The declared ${ecosystem} package "${declaredName}" contains all tokens of the ` +
        `popular package "${topPkg}" plus a suspicious suffix (${suspiciousSuffixes.join(', ')}). ` +
        `This is a common typosquat pattern: attackers register popular-name-plus-suffix ` +
        `packages to capture installs from users misremembering the canonical name.`,
      file: sourceFile,
      evidence: `"${declaredName}" tokens ⊃ "${topPkg}" tokens; suffix=${suspiciousSuffixes.join(',')}; overlap=${overlap.toFixed(2)}`,
      owasp: 'LLM03',
      recommendation:
        `Verify that "${declaredName}" is intentional. If you meant "${topPkg}", ` +
        `correct the dependency name. If "${declaredName}" is a legitimate utility ` +
        `package, add it to knowledge/typosquat-allowlist.json under "${ecosystem}".`,
    });
  }

  return null;
}
|
||||
|
||||
|
|
|
|||
|
|
@ -54,6 +54,72 @@ export function levenshtein(a, b) {
|
|||
return prev[n];
|
||||
}
|
||||
|
||||
/**
 * Break a package name into lowercase tokens, using runs of `-` and `_` as
 * separators. Consumed by the B7 typosquat token-overlap heuristic.
 *
 * Empty pieces (from leading, trailing or repeated separators) are
 * discarded; one-character tokens are retained because names such as `a-b`
 * are real. A falsy `name` yields an empty array.
 *
 * @param {string} name
 * @returns {string[]}
 */
export function tokenize(name) {
  if (!name) return [];

  const tokens = [];
  for (const piece of name.toLowerCase().split(/[-_]+/)) {
    if (piece.length > 0) tokens.push(piece);
  }
  return tokens;
}
|
||||
|
||||
/**
 * Ratio of shared tokens between two package names: the number of distinct
 * tokens present in both names, divided by the distinct-token count of
 * whichever name has fewer. Yields 0 when either name has no tokens.
 *
 * Example: `tokenOverlap('lodash-utils', 'lodash')`        → 1.0
 *          `tokenOverlap('react-router-dom', 'react')`     → 1.0
 *          `tokenOverlap('react-helper', 'react-router')`  → 0.5
 *          `tokenOverlap('foo', 'bar')`                    → 0.0
 *
 * B7 (v7.2.0) uses this next to Levenshtein: Levenshtein <=2 handles small
 * typos, while token overlap handles popular-name-with-suffix typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0..1
 */
export function tokenOverlap(a, b) {
  const left = new Set(tokenize(a));
  const right = new Set(tokenize(b));
  if (left.size === 0 || right.size === 0) return 0;

  const shared = [...left].filter((tok) => right.has(tok)).length;
  const smaller = left.size < right.size ? left.size : right.size;
  return shared / smaller;
}
|
||||
|
||||
/**
 * Tokens that typosquats commonly bolt onto a popular package name.
 * Declared at module level so the B7 checks can share one frozen list.
 * Element order is significant to callers that join the matched tokens
 * into finding text.
 *
 * Deliberately left out (they would clash with the v7.0.0 typosquat
 * allowlist or cause false positives on legitimate packages):
 *   - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes carried by many
 *     legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The
 *     v7.0.0 allowlist contains `tsx` itself, so listing it here would be an
 *     internal contradiction.
 *   - `pro` — too widespread as a legitimate edition marker (`vue-pro`,
 *     `tailwindcss-pro`).
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  // utility / helper dressing
  'utils',
  'util',
  'helper',
  'helpers',
  'core',
  'plus',
  'extra',
  'extras',
  // tool / cli qualifiers
  'bin',
  'cli',
  'tool',
  'tools',
  // wrapper / shim / library / kit qualifiers
  'wrapper',
  'wrappers',
  'lib',
  'libs',
  'kit',
  'sdk',
  'shim',
]);
|
||||
|
||||
/**
|
||||
* Check if a string looks like base64-encoded data.
|
||||
* @param {string} s
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ import { existsSync, readFileSync } from 'node:fs';
|
|||
import { fileURLToPath } from 'node:url';
|
||||
import { finding, scannerResult } from './lib/output.mjs';
|
||||
import { SEVERITY } from './lib/severity.mjs';
|
||||
import { levenshtein } from './lib/string-utils.mjs';
|
||||
import { levenshtein, tokenize, tokenOverlap, TYPOSQUAT_SUSPICIOUS_TOKENS } from './lib/string-utils.mjs';
|
||||
import {
|
||||
NPM_COMPROMISED, PIP_COMPROMISED, CARGO_COMPROMISED, GEM_COMPROMISED,
|
||||
isCompromised, extractOSVSeverity, queryOSVBatch, OSV_ECOSYSTEM_MAP,
|
||||
|
|
@ -337,11 +337,55 @@ function checkTyposquatting(deps, topList, topCutoff, ecosystem, lockfile, allow
|
|||
recommendation:
|
||||
`Confirm "${dep.name}" is the correct package. Check publish date and author on the registry.`,
|
||||
}));
|
||||
} else {
|
||||
// B7 (v7.2.0): token-overlap fallback. Catches typosquats with edit
|
||||
// distance >= 3 that contain all tokens of a popular package plus a
|
||||
// suspicious suffix (e.g. `lodash-utils` vs `lodash`).
|
||||
const tokenFinding = checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile);
|
||||
if (tokenFinding) results.push(tokenFinding);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
 * B7 token-overlap fallback for lockfile dependencies. Returns a MEDIUM
 * finding when the tokens of `normalized` are a strict superset of the
 * tokens of a package inside the `topCutoff` window of `topList`, and at
 * least one of the *added* tokens is listed in TYPOSQUAT_SUSPICIOUS_TOKENS.
 *
 * Tokens are compared as sets, so the position of the added token is not
 * checked. Returns null when nothing matches, or when `normalized` is itself
 * one of the packages inside the window.
 *
 * @param {{name: string}} dep   Dependency record; `dep.name` is used only
 *                               for display in the finding text.
 * @param {string} normalized    Name used for matching against `topList`.
 * @param {string[]} topList     Popular package names; only the first
 *                               `topCutoff` entries are considered.
 * @param {number} topCutoff     Size of the popularity window to scan.
 * @param {string} ecosystem     Ecosystem label used in the finding text.
 * @param {string} lockfile      Lockfile the dependency was read from.
 * @returns {object|null} The value returned by `finding(...)`, or null.
 */
function checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile) {
  const declaredTokens = new Set(tokenize(normalized));
  if (declaredTokens.size < 2) return null;

  // Cheap pre-filter: without any suspicious token there is nothing to report.
  const suspiciousDeclared = TYPOSQUAT_SUSPICIOUS_TOKENS.filter((t) => declaredTokens.has(t));
  if (suspiciousDeclared.length === 0) return null;

  const limit = Math.max(0, Math.min(topCutoff, topList.length));
  const candidates = topList.slice(0, limit);

  // A name that is itself inside the window is never reported. This must be
  // decided before scanning: checking it inside the loop made the result
  // depend on list order (e.g. `eslint` listed ahead of `eslint-utils`).
  if (candidates.includes(normalized)) return null;

  for (const topPkg of candidates) {
    // Deduplicate so the size comparison below is set-vs-set.
    const topTokens = new Set(tokenize(topPkg));
    if (topTokens.size === 0) continue;
    // Equal (or larger) token sets cannot be a strict subset of the declared name.
    if (topTokens.size >= declaredTokens.size) continue;
    if (![...topTokens].every((t) => declaredTokens.has(t))) continue;

    // Only tokens ADDED on top of the popular name count as a suspicious
    // suffix. Otherwise `core-js-pure` would be flagged against `core-js`
    // merely because the popular name itself contains `core`.
    const suspiciousSuffixes = suspiciousDeclared.filter((t) => !topTokens.has(t));
    if (suspiciousSuffixes.length === 0) continue;

    // With the subset check above this ratio is currently always 1.0; the
    // threshold is kept as a guard and the value is reported in the evidence.
    const overlap = tokenOverlap(normalized, topPkg);
    if (overlap < 0.66) continue;

    return finding({
      scanner: 'SCR',
      severity: SEVERITY.MEDIUM,
      title: `Possible typosquatting via token-overlap: "${dep.name}" vs "${topPkg}"`,
      description:
        `"${dep.name}" in ${lockfile} contains all tokens of the popular ${ecosystem} package ` +
        `"${topPkg}" plus a suspicious suffix (${suspiciousSuffixes.join(', ')}). ` +
        `This is a common typosquat pattern: attackers register popular-name-plus-suffix ` +
        `packages to capture installs.`,
      file: lockfile,
      evidence: `"${dep.name}" tokens ⊃ "${topPkg}" tokens; suffix=${suspiciousSuffixes.join(',')}; overlap=${overlap.toFixed(2)}`,
      owasp: 'LLM03',
      recommendation:
        `Verify "${dep.name}" is intentional. If you meant "${topPkg}", correct the dependency. ` +
        `If "${dep.name}" is a legitimate utility, add it to knowledge/typosquat-allowlist.json under "${ecosystem}".`,
    });
  }

  return null;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main scanner export
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue