Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
modern typosquat pattern — popular-name + token-injection suffix. Examples:
lodash → lodash-utils (edit distance 6, not flagged pre-B7)
react → react-helper (edit distance 7, not flagged pre-B7)
express → express-wrapper (edit distance 8, not flagged pre-B7)
Three coordinated edits:
scanners/lib/string-utils.mjs
- Adds tokenize(name): string[] — splits on -/_, lowercases
- Adds tokenOverlap(a, b): number — intersection.size / min(|a|,|b|)
- Adds TYPOSQUAT_SUSPICIOUS_TOKENS — frozen list of common typosquat
suffixes. Excludes language-extension tokens (js, jsx, ts, tsx) — the
v7.0.0 allowlist contains `tsx` as a legit package and including the
same token in the suspicious set creates a contradiction. Caught by
the new allowlist-intersection-guard test. Also excludes 'pro'
(legitimate edition marker).
scanners/dep-auditor.mjs + scanners/supply-chain-recheck.mjs
- New checkTyposquatTokenOverlap() helper — fires AFTER Levenshtein 1/2
branches, only when:
1. popular package's tokens ⊊ declared name's tokens (declared name is a strict superset)
2. declared name has at least one suspicious suffix
3. popular package is in topCutoff window
All three conditions required — conservative by design. Allowlist
precedence preserved (existing 22 npm + 13 PyPI entries always pass).
MEDIUM severity, NOT block. New finding title prefix:
"Possible typosquatting via token-overlap".
Tests: +21 cases across two new files
- tests/lib/string-utils-tokens.test.mjs (15) — tokenize, tokenOverlap,
TYPOSQUAT_SUSPICIOUS_TOKENS frozen contract, allowlist-intersection
guard (caught the tsx conflict on first run)
- tests/scanners/dep-token-overlap.test.mjs (7) — integration via
in-memory tmpdir fixtures: lodash-utils flagged, react-helper flagged,
express-wrapper flagged, lodash exact NOT flagged, allowlist tools
(knip/tsx/nx/rimraf) NOT flagged, react-router-dom (no suspicious
suffix) NOT flagged, react itself (equal token set, not superset)
NOT flagged.
Existing dep.test.mjs and supply-chain-recheck.test.mjs unchanged —
all green (149 → 149 regression guard).
Suite: 1570 → 1591 (+21). All green.
388 lines
12 KiB
JavaScript
388 lines
12 KiB
JavaScript
// string-utils.mjs — Entropy, Levenshtein, base64 detection, redaction, decoding
|
|
// Zero dependencies.
|
|
|
|
/**
 * Shannon entropy of a string, in bits per character.
 *
 * A "character" here is a Unicode code point: the string is walked with its
 * iterator, and probabilities are computed against the number of code points
 * seen. Dividing by `s.length` (UTF-16 code units) instead would make the
 * probabilities sum to less than 1 whenever the string contains astral
 * characters such as emoji, under-reporting the entropy.
 *
 * @param {string} s
 * @returns {number} Entropy in bits per code point; 0 for an empty string.
 */
export function shannonEntropy(s) {
  const freq = new Map();
  let total = 0;
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) ?? 0) + 1);
    total++;
  }
  if (total === 0) return 0;

  let H = 0;
  for (const count of freq.values()) {
    const p = count / total;
    H -= p * Math.log2(p);
  }
  return H;
}
|
|
|
|
/**
 * Levenshtein edit distance between two strings.
 *
 * Classic dynamic-programming formulation that keeps only two rolling rows
 * of the distance matrix. Strings are compared by UTF-16 code unit.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit insertions, deletions and
 *   substitutions needed to turn `a` into `b`.
 */
export function levenshtein(a, b) {
  if (a === b) return 0;

  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  // `above` is the previous matrix row, `current` is the one being filled in.
  let above = Array.from({ length: cols + 1 }, (_, j) => j);
  let current = new Array(cols + 1);

  for (let i = 1; i <= rows; i++) {
    const unitA = a[i - 1];
    current[0] = i;
    for (let j = 1; j <= cols; j++) {
      const deletion = above[j] + 1;
      const insertion = current[j - 1] + 1;
      const substitution = above[j - 1] + (unitA === b[j - 1] ? 0 : 1);
      current[j] = Math.min(deletion, insertion, substitution);
    }
    // Recycle the finished row as scratch space for the next pass.
    const spare = above;
    above = current;
    current = spare;
  }
  return above[cols];
}
|
|
|
|
/**
 * Break a package name into lowercase tokens, using runs of `-` and `_` as
 * separators.
 *
 * Feeds the B7 typosquat token-overlap heuristic. Empty pieces (from leading
 * or trailing separators) are discarded; one-character tokens are kept
 * because names such as `a-b` are real packages.
 *
 * @param {string} name
 * @returns {string[]} Tokens in order of appearance; `[]` for a falsy name.
 */
export function tokenize(name) {
  if (!name) return [];

  const tokens = [];
  for (const piece of name.toLowerCase().split(/[-_]+/)) {
    if (piece.length > 0) tokens.push(piece);
  }
  return tokens;
}
|
|
|
|
/**
 * Token-overlap ratio between two package names: the number of distinct
 * tokens the names share, divided by the size of the smaller token set.
 * Yields 0 when either name has no tokens.
 *
 * Examples:
 *   `tokenOverlap('lodash-utils', 'lodash')`       → 1.0
 *   `tokenOverlap('react-router-dom', 'react')`    → 1.0
 *   `tokenOverlap('react-helper', 'react-router')` → 0.5
 *   `tokenOverlap('foo', 'bar')`                   → 0.0
 *
 * B7 (v7.2.0) uses this as a complement to Levenshtein: Levenshtein <=2
 * catches small typos, token overlap catches popular-name-with-suffix
 * typosquats.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} 0..1
 */
export function tokenOverlap(a, b) {
  const left = new Set(tokenize(a));
  const right = new Set(tokenize(b));

  const smaller = Math.min(left.size, right.size);
  if (smaller === 0) return 0;

  const shared = [...left].filter((token) => right.has(token)).length;
  return shared / smaller;
}
|
|
|
|
/**
 * Suffix tokens that typosquats commonly append to a popular package name
 * to make it look legitimate. Defined at module level so the B7 checks can
 * share it. The array is frozen.
 *
 * Deliberately NOT listed (they would conflict with the v7.0.0 typosquat
 * allowlist or cause false positives on legitimate packages):
 *   - `js`, `jsx`, `ts`, `tsx` — language-extension suffixes carried by many
 *     legitimate packages (`react-jsx`, the `tsx` runtime, etc.). The v7.0.0
 *     allowlist contains `tsx` itself, so listing the token here would be an
 *     internal contradiction.
 *   - `pro` — too common as a legitimate edition marker (`vue-pro`,
 *     `tailwindcss-pro`).
 *
 * What remains are the unambiguous typosquat suffixes: utility/helper
 * dressing, wrapper/shim packages, and tool/cli/sdk/kit qualifiers.
 */
export const TYPOSQUAT_SUSPICIOUS_TOKENS = Object.freeze([
  // Utility / helper dressing
  'utils',
  'util',
  'helper',
  'helpers',
  'core',
  'plus',
  'extra',
  'extras',
  // Tooling qualifiers
  'bin',
  'cli',
  'tool',
  'tools',
  // Wrapper / library / kit packaging
  'wrapper',
  'wrappers',
  'lib',
  'libs',
  'kit',
  'sdk',
  'shim',
]);
|
|
|
|
/**
 * Heuristic test for base64-encoded data: at least 20 characters from the
 * standard base64 alphabet, optionally followed by up to three `=` padding
 * characters, and nothing else.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isBase64Like(s) {
  const MIN_LENGTH = 20;
  if (s.length < MIN_LENGTH) return false;

  // Whole string must be alphabet characters plus optional trailing padding.
  const base64Shape = /^[A-Za-z0-9+/]{20,}={0,3}$/;
  return base64Shape.test(s);
}
|
|
|
|
/**
 * Heuristic test for a hex-encoded blob: an optional `0x` prefix followed by
 * at least 32 hexadecimal digits and nothing else.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function isHexBlob(s) {
  const MIN_LENGTH = 32;
  if (s.length < MIN_LENGTH) return false;

  const hexShape = /^(0x)?[0-9a-fA-F]{32,}$/;
  return hexShape.test(s);
}
|
|
|
|
/**
 * Redact a string for safe display by keeping only its first `showStart`
 * and last `showEnd` characters, joined by `...`.
 *
 * Strings short enough that the redacted form would not be shorter than the
 * original (length <= showStart + showEnd + 3) are returned unchanged.
 *
 * @param {string} s
 * @param {number} [showStart=8] Number of leading characters to keep.
 * @param {number} [showEnd=4] Number of trailing characters to keep; 0 keeps
 *   no tail at all.
 * @returns {string}
 */
export function redact(s, showStart = 8, showEnd = 4) {
  if (s.length <= showStart + showEnd + 3) return s;

  const head = s.slice(0, showStart);
  // Index from the end explicitly: `s.slice(-0)` is `s.slice(0)`, which would
  // append the ENTIRE unredacted string when showEnd is 0.
  const tail = s.slice(s.length - showEnd);
  return `${head}...${tail}`;
}
|
|
|
|
/**
 * Pull the contents of every string literal out of a single line of code.
 *
 * Recognizes double-quoted, single-quoted and backtick-quoted literals,
 * including backslash-escaped characters inside them. The surrounding quotes
 * are not part of the returned values; escape sequences are returned as
 * written (not decoded).
 *
 * @param {string} line
 * @returns {string[]} Literal bodies in order of appearance.
 */
export function extractStringLiterals(line) {
  const literalPattern = /(?:"([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'|`([^`\\]*(?:\\.[^`\\]*)*)`)/g;

  // Exactly one of the three capture groups is defined per match.
  return Array.from(
    line.matchAll(literalPattern),
    ([, doubleQuoted, singleQuoted, backticked]) =>
      doubleQuoted ?? singleQuoted ?? backticked,
  );
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Encoding/obfuscation decoders
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Decode JavaScript-style Unicode escapes written out as text: the braced
 * form `\u{X…}` (1–6 hex digits) and the fixed form `\uXXXX`.
 *
 * Braced escapes are handled first; a braced value above U+10FFFF is left
 * untouched. The fixed-form pass then runs over the result of the first.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeEscapes(s) {
  const decodeBraced = (whole, hex) => {
    const codePoint = parseInt(hex, 16);
    return codePoint > 0x10FFFF ? whole : String.fromCodePoint(codePoint);
  };
  const decodeFixed = (_whole, hex) => String.fromCodePoint(parseInt(hex, 16));

  const afterBraced = s.replace(/\\u\{([0-9a-fA-F]{1,6})\}/g, decodeBraced);
  return afterBraced.replace(/\\u([0-9a-fA-F]{4})/g, decodeFixed);
}
|
|
|
|
/**
 * Decode two-digit hex escapes written out as text (`\xXX`) into the
 * characters they denote.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHexEscapes(s) {
  const toChar = (_whole, hexPair) => {
    const code = parseInt(hexPair, 16);
    return String.fromCharCode(code);
  };
  return s.replace(/\\x([0-9a-fA-F]{2})/g, toChar);
}
|
|
|
|
/**
 * Decode URL percent-encoding (`%XX`).
 *
 * Tries `decodeURIComponent` first. When that throws on malformed input,
 * falls back to replacing each well-formed `%XX` pair individually with the
 * character of that code, leaving anything else as-is.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUrlEncoding(s) {
  // Nothing to do without a percent sign.
  if (!s.includes('%')) return s;

  let decoded;
  try {
    decoded = decodeURIComponent(s);
  } catch {
    // Malformed input: salvage whatever %XX pairs are individually valid.
    decoded = s.replace(/%([0-9a-fA-F]{2})/g, (_whole, hexPair) =>
      String.fromCharCode(parseInt(hexPair, 16)),
    );
  }
  return decoded;
}
|
|
|
|
/**
 * Try to decode a base64 string into UTF-8 text.
 *
 * Gives up (returns null) when the input fails the `isBase64Like` shape
 * check, when decoding yields an empty string, or when fewer than 80% of the
 * decoded characters are printable ASCII / common whitespace.
 *
 * @param {string} s
 * @returns {string|null} The decoded text, or null if it is not readable.
 */
export function tryDecodeBase64(s) {
  if (!isBase64Like(s)) return null;

  let text;
  try {
    text = Buffer.from(s, 'base64').toString('utf-8');
  } catch {
    return null;
  }
  if (text.length === 0) return null;

  // Count what survives after removing everything outside printable ASCII,
  // newline, carriage return and tab.
  const printableCount = text.replace(/[^\x20-\x7E\n\r\t]/g, '').length;
  return printableCount / text.length < 0.8 ? null : text;
}
|
|
|
|
/**
 * Named HTML character references recognized by `decodeHtmlEntities`, keyed
 * by the full `&name;` token. Covers the punctuation and whitespace entities
 * useful for hiding code or instructions. `&minus;` and `&tilde;` are mapped
 * to their ASCII look-alikes on purpose so scanners see plain `-` and `~`.
 * The lowercase `&tab;`, `&newline;` and `&at;` spellings are kept next to
 * the standard `&Tab;`, `&NewLine;` and `&commat;` for backward compatibility.
 */
const NAMED_HTML_ENTITIES = Object.freeze({
  '&lt;': '<', '&gt;': '>', '&amp;': '&', '&quot;': '"', '&apos;': "'",
  '&nbsp;': ' ',
  '&tab;': '\t', '&Tab;': '\t', '&newline;': '\n', '&NewLine;': '\n',
  '&lpar;': '(', '&rpar;': ')', '&lsqb;': '[', '&rsqb;': ']',
  '&lbrace;': '{', '&rbrace;': '}', '&sol;': '/', '&bsol;': '\\',
  '&colon;': ':', '&semi;': ';', '&comma;': ',', '&period;': '.',
  '&excl;': '!', '&quest;': '?', '&num;': '#', '&percnt;': '%',
  '&equals;': '=', '&plus;': '+', '&minus;': '-', '&ast;': '*',
  '&vert;': '|', '&tilde;': '~', '&grave;': '`', '&Hat;': '^',
  '&lowbar;': '_', '&at;': '@', '&commat;': '@', '&dollar;': '$',
});

/**
 * Decode HTML entities: named (`&lt;` `&gt;` `&amp;` `&quot;` `&apos;`, …),
 * decimal (`&#105;`) and hexadecimal (`&#x69;`).
 *
 * Numeric references above U+10FFFF and unknown named references are left
 * untouched. Passes run in the order hex → decimal → named.
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeHtmlEntities(s) {
  // Fast path: every entity starts with an ampersand.
  if (!s.includes('&')) return s;

  const fromCodePoint = (whole, codePoint) =>
    (codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : whole);

  return s
    .replace(/&#x([0-9a-fA-F]{1,6});/g, (whole, hex) =>
      fromCodePoint(whole, parseInt(hex, 16)))
    .replace(/&#(\d{1,7});/g, (whole, dec) =>
      fromCodePoint(whole, parseInt(dec, 10)))
    .replace(/&[a-zA-Z]{2,8};/g, (entity) => NAMED_HTML_ENTITIES[entity] ?? entity);
}
|
|
|
|
/**
 * Collapse letter-spaced text: "i g n o r e" → "ignore".
 *
 * Only runs of single letters separated by one or more spaces/tabs are
 * collapsed, and a run must contain at least 4 letters to avoid false
 * positives on normal text. Accepting tabs and repeated spaces matters:
 * otherwise "i  g  n  o  r  e" or tab-separated letters would slip past the
 * collapse untouched.
 *
 * @param {string} s
 * @returns {string}
 */
export function collapseLetterSpacing(s) {
  // One letter, then 3+ repetitions of (separator run + letter), bounded by
  // word boundaries so letters inside longer words are never consumed.
  return s.replace(/\b[a-zA-Z](?:[ \t]+[a-zA-Z]){3,}\b/g, (run) =>
    run.replace(/[ \t]+/g, ''),
  );
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Unicode Tags steganography (U+E0000 block) — DeepMind traps category 1
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Decode Unicode Tags steganography: each code point in U+E0001–U+E007F is
 * replaced, in place, by the ASCII character obtained by subtracting
 * 0xE0000. All other characters pass through unchanged.
 *
 * Tag characters can carry invisible ASCII text inside what looks like an
 * empty or ordinary string, e.g. U+E0069 U+E0067 U+E006E → "ign".
 *
 * @param {string} s
 * @returns {string}
 */
export function decodeUnicodeTags(s) {
  const TAG_BASE = 0xE0000;
  const TAG_LAST = 0xE007F;

  // Array.from walks the string by code point, so astral tag characters
  // arrive as single elements.
  return Array.from(s, (ch) => {
    const cp = ch.codePointAt(0);
    const isTag = cp > TAG_BASE && cp <= TAG_LAST;
    return isTag ? String.fromCharCode(cp - TAG_BASE) : ch;
  }).join('');
}
|
|
|
|
/**
 * Report whether a string contains any Unicode Tag character
 * (U+E0001–U+E007F). Their mere presence is suspicious, whatever they
 * decode to.
 *
 * @param {string} s
 * @returns {boolean}
 */
export function containsUnicodeTags(s) {
  // The `u` flag makes the character class operate on code points rather
  // than UTF-16 code units.
  return /[\u{E0001}-\u{E007F}]/u.test(s);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// BIDI override stripping
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Remove bidirectional control characters that can make text render in a
 * different order than it is stored, which can be used to hide injected
 * content.
 *
 * Removed code points:
 *   U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO),
 *   U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI).
 *
 * @param {string} s
 * @returns {string}
 */
export function stripBidiOverrides(s) {
  const bidiControls = /[\u202A-\u202E\u2066-\u2069]/g;
  return s.replace(bidiControls, '');
}
|
|
|
|
/**
 * Normalize a string by peeling off every known obfuscation layer.
 *
 * Runs up to 3 iterations so multi-layered encodings (e.g. base64 of
 * URL-encoded text) are unwrapped, stopping early once an iteration changes
 * nothing. Order per iteration:
 *   Unicode Tags -> BIDI strip -> HTML entities -> unicode escapes ->
 *   hex escapes -> URL encoding -> base64.
 *
 * The Unicode Tags and BIDI steps run inside the loop (not just once up
 * front) because a later decoder can reveal such characters — for example
 * `&#xE0069;` or a textual `\u202E` escape — and they would otherwise survive
 * normalization. One extra pass after the loop handles characters revealed
 * by the final iteration.
 *
 * After decoding, letter-spaced text is collapsed.
 *
 * @param {string} s
 * @returns {string}
 */
export function normalizeForScan(s) {
  const MAX_ITERATIONS = 3;
  let result = s;

  for (let i = 0; i < MAX_ITERATIONS; i++) {
    const prev = result;

    // Invisible-character layers first, so the textual decoders below see
    // the hidden content as ordinary characters.
    result = decodeUnicodeTags(result);
    result = stripBidiOverrides(result);

    result = decodeHtmlEntities(result);
    result = decodeUnicodeEscapes(result);
    result = decodeHexEscapes(result);
    result = decodeUrlEncoding(result);

    const b64decoded = tryDecodeBase64(result);
    if (b64decoded) result = b64decoded;

    // Stable — no further decoding possible.
    if (result === prev) break;
  }

  // Catch tag/BIDI characters uncovered by the last iteration's decoders.
  result = stripBidiOverrides(decodeUnicodeTags(result));

  // Post-decode: collapse letter-spaced evasion.
  return collapseLetterSpacing(result);
}
|