ktg-plugin-marketplace/plugins/llm-security/scanners/supply-chain-recheck.mjs
Kjell Tore Guttormsen 5f8f2d3c41 fix(dep): B7 — token-overlap typosquat heuristic alongside Levenshtein
Critical-review §2 B7 finding: pure Levenshtein <=2 misses the most common
modern typosquat pattern — popular-name + token-injection suffix. Examples:
  lodash → lodash-utils    (edit distance 6, not flagged pre-B7)
  react  → react-helper    (edit distance 7, not flagged pre-B7)
  express → express-wrapper (edit distance 8, not flagged pre-B7)

Three coordinated edits:

scanners/lib/string-utils.mjs
- Adds tokenize(name): string[]    splits on -/_, lowercases
- Adds tokenOverlap(a, b): number  intersection.size / min(|a|,|b|)
- Adds TYPOSQUAT_SUSPICIOUS_TOKENS frozen list of common typosquat
  suffixes. Excludes language-extension tokens (js, jsx, ts, tsx) — the
  v7.0.0 allowlist contains `tsx` as a legit package and including the
  same token in the suspicious set creates a contradiction. Caught by
  the new allowlist-intersection-guard test. Also excludes 'pro'
  (legitimate edition marker).

scanners/dep-auditor.mjs + scanners/supply-chain-recheck.mjs
- New checkTyposquatTokenOverlap() helper — fires AFTER Levenshtein 1/2
  branches, only when:
    1. popular package's tokens ⊊ declared name's tokens (declared name is a strict superset)
    2. declared name has at least one suspicious suffix
    3. popular package is in topCutoff window
  All three conditions required — conservative by design. Allowlist
  precedence preserved (existing 22 npm + 13 PyPI entries always pass).
  MEDIUM severity, NOT block. New finding title prefix:
  "Possible typosquatting via token-overlap".

Tests: +21 cases across two new files
- tests/lib/string-utils-tokens.test.mjs (15) — tokenize, tokenOverlap,
  TYPOSQUAT_SUSPICIOUS_TOKENS frozen contract, allowlist-intersection
  guard (caught the tsx conflict on first run)
- tests/scanners/dep-token-overlap.test.mjs (7) — integration via
  in-memory tmpdir fixtures: lodash-utils flagged, react-helper flagged,
  express-wrapper flagged, lodash exact NOT flagged, allowlist tools
  (knip/tsx/nx/rimraf) NOT flagged, react-router-dom (no suspicious
  suffix) NOT flagged, react itself (equal token set, not superset)
  NOT flagged.

Existing dep.test.mjs and supply-chain-recheck.test.mjs unchanged —
all green (149 → 149 regression guard).

Suite: 1570 → 1591 (+21). All green.
2026-04-29 14:10:53 +02:00

503 lines
19 KiB
JavaScript

// supply-chain-recheck.mjs — Periodic re-audit of installed dependencies
// Parses lockfiles (package-lock.json, yarn.lock, requirements.txt, Pipfile.lock)
// and checks against blocklists, OSV.dev batch API, and typosquat detection.
//
// Unlike pre-install-supply-chain.mjs (hook, checks at install time),
// this scanner checks what's ALREADY installed — catching deps that became
// compromised after installation.
//
// Scanner prefix: SCR
// OWASP coverage: LLM03 (Supply Chain), ASI04, AST06, MCP04
// Zero external dependencies — Node.js builtins only.
import { readFile } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { existsSync, readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { levenshtein, tokenize, tokenOverlap, TYPOSQUAT_SUSPICIOUS_TOKENS } from './lib/string-utils.mjs';
import {
NPM_COMPROMISED, PIP_COMPROMISED, CARGO_COMPROMISED, GEM_COMPROMISED,
isCompromised, extractOSVSeverity, queryOSVBatch, OSV_ECOSYSTEM_MAP,
} from './lib/supply-chain-data.mjs';
// Directory containing this module; the knowledge files below are resolved relative to it.
const __dirname = dirname(fileURLToPath(import.meta.url));
// ---------------------------------------------------------------------------
// Top-package knowledge base loader (for typosquat detection)
// ---------------------------------------------------------------------------
// Module-level caches: each is filled by its loader on the first call and
// returned as-is on later calls.
let _topPackages = null;
let _typosquatAllowlist = null;
/**
 * Load the top-package knowledge base (knowledge/top-packages.json).
 *
 * The parsed value is memoized in `_topPackages`. If the file cannot be read
 * or parsed, empty npm/pypi lists are cached and returned instead of throwing.
 *
 * @returns {Promise<object>} Parsed knowledge file, or `{ npm: [], pypi: [] }` on failure
 */
async function loadTopPackages() {
  if (!_topPackages) {
    const file = join(__dirname, '..', 'knowledge', 'top-packages.json');
    try {
      _topPackages = JSON.parse(await readFile(file, 'utf8'));
    } catch {
      // Best effort: typosquat detection simply has nothing to compare against.
      _topPackages = { npm: [], pypi: [] };
    }
  }
  return _topPackages;
}
/**
 * Load the typosquat allowlist (knowledge/typosquat-allowlist.json).
 *
 * Names are lowercased and `_`, `.`, `-` are folded to `-` so they compare
 * equal to the normalized dependency names used by the typosquat check.
 * The result is memoized in `_typosquatAllowlist`; on any read/parse error
 * empty sets are cached and returned.
 *
 * @returns {Promise<{ npm: Set<string>, pypi: Set<string> }>}
 */
async function loadTyposquatAllowlist() {
  if (_typosquatAllowlist) return _typosquatAllowlist;

  const toNormalizedSet = (names) =>
    new Set((names || []).map((n) => n.toLowerCase().replace(/[_.-]/g, '-')));
  const file = join(__dirname, '..', 'knowledge', 'typosquat-allowlist.json');

  try {
    const parsed = JSON.parse(await readFile(file, 'utf8'));
    _typosquatAllowlist = {
      npm: toNormalizedSet(parsed.npm),
      pypi: toNormalizedSet(parsed.pypi),
    };
  } catch {
    // Best effort: without an allowlist nothing is exempted.
    _typosquatAllowlist = { npm: new Set(), pypi: new Set() };
  }
  return _typosquatAllowlist;
}
// ---------------------------------------------------------------------------
// Lockfile parsers — extract { name, version, ecosystem } tuples
// ---------------------------------------------------------------------------
/**
 * Parse package-lock.json.
 *
 * Supports the v2/v3 `packages` map and falls back to the v1 `dependencies`
 * tree when `packages` yields nothing. Entries without a version (for example
 * workspace links) are skipped, and identical name@version pairs are reported
 * once. Read or parse errors yield whatever was collected so far.
 *
 * @param {string} filePath - Absolute path to package-lock.json
 * @returns {Promise<{ name: string, version: string, ecosystem: string }[]>}
 */
async function parsePackageLock(filePath) {
  const deps = [];
  const seen = new Set();
  const add = (name, version) => {
    if (!name || !version) return;
    const id = `${name}@${version}`;
    if (seen.has(id)) return;
    seen.add(id);
    deps.push({ name, version, ecosystem: 'npm' });
  };

  try {
    const raw = await readFile(filePath, 'utf8');
    const lock = JSON.parse(raw);

    // v2/v3 format: keys are install paths, e.g. "node_modules/a/node_modules/b".
    // The package name is whatever follows the LAST "node_modules/" segment;
    // stripping only the leading prefix would turn nested deps into "a/node_modules/b".
    const marker = 'node_modules/';
    for (const [key, info] of Object.entries(lock.packages || {})) {
      if (key === '') continue; // Root package
      const idx = key.lastIndexOf(marker);
      const name = idx === -1 ? key : key.slice(idx + marker.length);
      add(name, info?.version);
    }

    // v1 fallback: `dependencies` is a tree — each entry may carry its own
    // nested `dependencies` object describing packages installed beneath it.
    if (deps.length === 0 && lock.dependencies) {
      const pending = [lock.dependencies];
      while (pending.length > 0) {
        const level = pending.pop();
        for (const [name, info] of Object.entries(level)) {
          add(name, info?.version);
          if (info && typeof info.dependencies === 'object' && info.dependencies !== null) {
            pending.push(info.dependencies);
          }
        }
      }
    }
  } catch { /* parse error — skip */ }
  return deps;
}
/**
 * Parse yarn.lock (v1 format).
 *
 * Walks the file line by line: an unindented, non-comment line containing "@"
 * is treated as an entry header and supplies the package name; the next
 * indented `version "x.y.z"` line supplies the resolved version.
 *
 * @param {string} filePath - Absolute path to yarn.lock
 * @returns {Promise<{ name: string, version: string, ecosystem: string }[]>}
 */
async function parseYarnLock(filePath) {
  const VERSION_LINE = /^\s+version\s+"([^"]+)"/;
  const found = [];
  try {
    const text = await readFile(filePath, 'utf8');
    let pendingName = null;
    text.split('\n').forEach((line) => {
      // Header examples: pkg@^1.0.0:   "pkg@^1.0.0", "pkg@1.0.0":   "@scope/pkg@^1.0.0":
      const isHeader = !line.startsWith(' ') && !line.startsWith('#') && line.includes('@');
      if (isHeader) {
        const header = line.replace(/[":]/g, '').trim();
        // The name ends at the first "@" that is not the scope marker at index 0.
        const sep = header.indexOf('@', 1);
        const minSep = header.startsWith('@') ? 1 : 0;
        if (sep > minSep) pendingName = header.slice(0, sep);
      }
      const hit = VERSION_LINE.exec(line);
      if (hit && pendingName) {
        found.push({ name: pendingName, version: hit[1], ecosystem: 'npm' });
        pendingName = null;
      }
    });
  } catch { /* parse error — skip */ }
  return found;
}
/**
 * Parse requirements.txt (pip format).
 *
 * Blank lines, comment lines and option lines (starting with "-") are skipped.
 * Before extracting name/version each line is cleaned of inline comments,
 * environment markers ("; python_version < '3.8'"), per-requirement options
 * (" --hash=...") and a trailing line-continuation backslash. Extras
 * ("pkg[extra]") are removed from the name. Only "==" / "===" pins yield a
 * version; any other specifier yields `version: null`.
 *
 * @param {string} filePath - Absolute path to requirements.txt
 * @returns {Promise<{ name: string, version: string|null, ecosystem: string }[]>}
 */
async function parseRequirementsTxt(filePath) {
  const deps = [];
  try {
    const raw = await readFile(filePath, 'utf8');
    for (const rawLine of raw.split('\n')) {
      // Trim first so a trailing "\r" cannot block the end-of-line comment match.
      // "#" starts a comment only at line start or after whitespace (URL fragments are kept).
      let line = rawLine.trim().replace(/(^|\s)#.*$/, '').trim();
      if (!line || line.startsWith('-')) continue;

      // Keep only the requirement specifier itself.
      line = line.split(';')[0].split(/\s+--/)[0].replace(/\\$/, '').trim();
      if (!line) continue;

      const eqIdx = line.indexOf('==');
      if (eqIdx > 0) {
        const name = line.slice(0, eqIdx).replace(/\[[^\]]*\]/, '').trim();
        // "===" (arbitrary equality) leaves one extra "=" in front of the version.
        const version = line.slice(eqIdx + 2).replace(/^=/, '').trim();
        if (name) deps.push({ name, version: version || null, ecosystem: 'pip' });
      } else {
        const match = line.match(/^([a-zA-Z0-9_.-]+)/);
        if (match) deps.push({ name: match[1], version: null, ecosystem: 'pip' });
      }
    }
  } catch { /* parse error — skip */ }
  return deps;
}
/**
 * Parse Pipfile.lock (JSON format).
 *
 * Reads the `default` and `develop` sections; every entry that is an object
 * with a `version` contributes one dependency, with a leading "==" removed
 * from the version. Read or parse errors yield whatever was collected so far.
 *
 * @param {string} filePath - Absolute path to Pipfile.lock
 * @returns {Promise<{ name: string, version: string, ecosystem: string }[]>}
 */
async function parsePipfileLock(filePath) {
  const collected = [];
  try {
    const parsed = JSON.parse(await readFile(filePath, 'utf8'));
    for (const sectionName of ['default', 'develop']) {
      const entries = Object.entries(parsed[sectionName] || {});
      for (const [name, info] of entries) {
        if (typeof info !== 'object' || !info.version) continue;
        const version = info.version.replace(/^==/, '');
        if (version) {
          collected.push({ name, version, ecosystem: 'pip' });
        }
      }
    }
  } catch { /* parse error — skip */ }
  return collected;
}
// ---------------------------------------------------------------------------
// Checks
// ---------------------------------------------------------------------------
/**
 * Match every dependency against the known-compromised blocklist of its
 * ecosystem and emit one CRITICAL finding per hit. Dependencies from an
 * ecosystem without a blocklist are ignored.
 *
 * @param {{ name: string, version: string, ecosystem: string }[]} deps
 * @param {string} lockfile - Source lockfile name used in the finding text
 * @returns {object[]} findings
 */
function checkBlocklists(deps, lockfile) {
  const blocklistByEcosystem = {
    npm: NPM_COMPROMISED,
    pip: PIP_COMPROMISED,
    cargo: CARGO_COMPROMISED,
    gem: GEM_COMPROMISED,
  };

  return deps.flatMap((dep) => {
    const { name, version, ecosystem } = dep;
    const blocklist = blocklistByEcosystem[ecosystem];
    if (!blocklist || !isCompromised(blocklist, name, version)) return [];

    const versionSuffix = version ? '@' + version : '';
    return [finding({
      scanner: 'SCR',
      severity: SEVERITY.CRITICAL,
      title: `Compromised dependency: ${name}@${version || '*'}`,
      description:
        `"${name}"${versionSuffix} in ${lockfile} is on the known-compromised blocklist. ` +
        `This package/version is associated with supply chain attacks (malware, data exfiltration, or sabotage).`,
      file: lockfile,
      evidence: `${name}@${version || 'any'} in ${ecosystem} blocklist`,
      owasp: 'LLM03',
      recommendation:
        `Remove "${name}" immediately. If this was a transitive dependency, find and remove ` +
        `the parent package that requires it. Audit your system for signs of compromise.`,
    })];
  });
}
/**
 * Check dependencies against the OSV.dev batch API for known vulnerabilities.
 *
 * Only dependencies with a version and an ecosystem present in
 * OSV_ECOSYSTEM_MAP are queried. Results are matched to the queried
 * dependencies by index. Each reported vulnerability becomes one finding whose
 * severity is CRITICAL or HIGH when the extracted OSV severity says so, and
 * MEDIUM otherwise.
 *
 * @param {{ name: string, version: string, ecosystem: string }[]} deps
 * @param {string} lockfile - Lockfile label used in the finding text
 * @returns {Promise<{ findings: object[], offline: boolean }>} `offline` is true
 *   when the batch query reported that the API could not be reached
 */
async function checkOSV(deps, lockfile) {
  // Only query deps that have a version (OSV requires version)
  const queryable = deps.filter(d => d.version && OSV_ECOSYSTEM_MAP[d.ecosystem]);
  if (queryable.length === 0) return { findings: [], offline: false };

  const { results, offline } = await queryOSVBatch(queryable);
  if (offline) return { findings: [], offline: true };

  const findings = [];
  for (let i = 0; i < results.length; i++) {
    const vulns = results[i]?.vulns || [];
    if (vulns.length === 0) continue;
    const dep = queryable[i];
    for (const vuln of vulns) {
      const severity = extractOSVSeverity(vuln);
      const sevConst = severity === 'CRITICAL' ? SEVERITY.CRITICAL
        : severity === 'HIGH' ? SEVERITY.HIGH
          : SEVERITY.MEDIUM;
      findings.push(finding({
        scanner: 'SCR',
        severity: sevConst,
        title: `Known vulnerability: ${dep.name}@${dep.version} (${vuln.id})`,
        description:
          `${vuln.id}: ${(vuln.summary || vuln.details || 'No description').slice(0, 200)}. ` +
          `Found in ${lockfile}.`,
        file: lockfile,
        // Format is "<vuln id> — <name>@<version>". scan() parses this string
        // to map the finding back to its source lockfile, so the " — "
        // separator between id and package name is required.
        evidence: `${vuln.id} — ${dep.name}@${dep.version}`,
        owasp: 'LLM03',
        recommendation:
          `Upgrade "${dep.name}" to a patched version. Check ${vuln.id} for fix details.`,
      }));
    }
  }
  return { findings, offline: false };
}
/**
 * Check dependencies of one ecosystem for typosquatting against a list of
 * popular package names.
 *
 * For each distinct normalized name that is neither allowlisted nor an exact
 * match of a popular package:
 *   - edit distance 1 to a popular package          → HIGH finding
 *   - edit distance 2 to a package in the top cutoff → MEDIUM finding
 *   - otherwise the token-overlap heuristic is consulted (MEDIUM finding)
 *
 * @param {{ name: string, version: string, ecosystem: string }[]} deps
 * @param {string[]} topList - Normalized top package names, most popular first
 * @param {number} topCutoff - Top N for stricter matching
 * @param {string} ecosystem - Only deps with this ecosystem are inspected
 * @param {string} lockfile - Source lockfile name used in the finding text
 * @param {Set<string>} [allowlist] - Normalized names that are known legitimate
 * @returns {object[]}
 */
function checkTyposquatting(deps, topList, topCutoff, ecosystem, lockfile, allowlist) {
  const results = [];
  const checked = new Set();
  for (const dep of deps) {
    if (dep.ecosystem !== ecosystem) continue;
    const normalized = dep.name.toLowerCase().replace(/[_.-]/g, '-');
    if (checked.has(normalized)) continue;
    checked.add(normalized);

    // Skip known legitimate packages
    if (allowlist && allowlist.has(normalized)) continue;
    // The dependency IS a popular package — legit
    if (topList.includes(normalized)) continue;

    // Find the nearest popular name; ties go to the more popular (lower index) one.
    let closestDist = Infinity;
    let closestPkg = null;
    let closestIdx = Infinity;
    for (let i = 0; i < topList.length; i++) {
      const topPkg = topList[i];
      // A length gap above 2 guarantees an edit distance above 2 — skip the expensive call.
      if (Math.abs(normalized.length - topPkg.length) > 2) continue;
      const dist = levenshtein(normalized, topPkg);
      if (dist < closestDist || (dist === closestDist && i < closestIdx)) {
        closestDist = dist;
        closestPkg = topPkg;
        closestIdx = i;
      }
    }

    if (closestDist === 1) {
      results.push(finding({
        scanner: 'SCR',
        severity: SEVERITY.HIGH,
        title: `Possible typosquatting: "${dep.name}" vs "${closestPkg}" (edit distance 1)`,
        description:
          `"${dep.name}" in ${lockfile} is 1 character away from the popular ${ecosystem} package "${closestPkg}". ` +
          `Typosquatting packages impersonate popular libraries to execute malicious code.`,
        file: lockfile,
        evidence: `"${dep.name}" → "${closestPkg}" (Levenshtein: 1)`,
        owasp: 'LLM03',
        recommendation:
          `Verify "${dep.name}" is the intended package. If you meant "${closestPkg}", correct the dependency.`,
      }));
    } else if (closestDist === 2 && closestIdx < topCutoff) {
      results.push(finding({
        scanner: 'SCR',
        severity: SEVERITY.MEDIUM,
        title: `Potential typosquatting: "${dep.name}" vs "${closestPkg}" (edit distance 2)`,
        description:
          `"${dep.name}" in ${lockfile} is 2 characters away from the popular ${ecosystem} package "${closestPkg}" ` +
          `(top ${topCutoff} by downloads).`,
        file: lockfile,
        evidence: `"${dep.name}" → "${closestPkg}" (Levenshtein: 2)`,
        owasp: 'LLM03',
        recommendation:
          `Confirm "${dep.name}" is the correct package. Check publish date and author on the registry.`,
      }));
    } else {
      // B7 (v7.2.0): token-overlap fallback. Catches typosquats with edit
      // distance >= 3 that contain all tokens of a popular package plus a
      // suspicious suffix (e.g. `lodash-utils` vs `lodash`).
      // This must also run when NO popular package was within the ±2 length
      // window (closestPkg === null): those long names are exactly what the
      // token heuristic targets.
      const tokenFinding = checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile);
      if (tokenFinding) results.push(tokenFinding);
    }
  }
  return results;
}
/**
 * Token-overlap typosquat heuristic for a single dependency.
 *
 * Returns a MEDIUM finding when the declared name has at least two distinct
 * tokens, contains at least one token from TYPOSQUAT_SUSPICIOUS_TOKENS, and
 * its token set strictly contains every token of a package within the first
 * `topCutoff` entries of `topList` (with a token overlap of at least 0.66).
 * Returns null otherwise, including when the name equals a scanned top package.
 *
 * @param {{ name: string }} dep - Dependency as declared in the lockfile
 * @param {string} normalized - Normalized form of `dep.name`
 * @param {string[]} topList - Normalized top package names
 * @param {number} topCutoff - Only the first `topCutoff` entries are considered
 * @param {string} ecosystem - Ecosystem label used in the finding text
 * @param {string} lockfile - Source lockfile name used in the finding text
 * @returns {object|null}
 */
function checkTyposquatTokenOverlap(dep, normalized, topList, topCutoff, ecosystem, lockfile) {
  const declared = new Set(tokenize(normalized));
  if (declared.size < 2) return null;

  const suspicious = TYPOSQUAT_SUSPICIOUS_TOKENS.filter((token) => declared.has(token));
  if (suspicious.length === 0) return null;

  const candidateCount = Math.min(topCutoff, topList.length);
  for (let idx = 0; idx < candidateCount; idx += 1) {
    const popular = topList[idx];
    if (popular === normalized) return null;

    const popularTokens = tokenize(popular);
    // Require a non-empty token list that is fully contained in, but not the
    // same size as, the declared token set.
    const isStrictSubset =
      popularTokens.length > 0 &&
      popularTokens.length !== declared.size &&
      popularTokens.every((token) => declared.has(token));
    if (!isStrictSubset) continue;

    const overlap = tokenOverlap(normalized, popular);
    if (overlap < 0.66) continue;

    const suffixList = suspicious.join(', ');
    const suffixCsv = suspicious.join(',');
    return finding({
      scanner: 'SCR',
      severity: SEVERITY.MEDIUM,
      title: `Possible typosquatting via token-overlap: "${dep.name}" vs "${popular}"`,
      description:
        `"${dep.name}" in ${lockfile} contains all tokens of the popular ${ecosystem} package ` +
        `"${popular}" plus a suspicious suffix (${suffixList}). ` +
        `This is a common typosquat pattern: attackers register popular-name-plus-suffix ` +
        `packages to capture installs.`,
      file: lockfile,
      evidence: `"${dep.name}" tokens ⊃ "${popular}" tokens; suffix=${suffixCsv}; overlap=${overlap.toFixed(2)}`,
      owasp: 'LLM03',
      recommendation:
        `Verify "${dep.name}" is intentional. If you meant "${popular}", correct the dependency. ` +
        `If "${dep.name}" is a legitimate utility, add it to knowledge/typosquat-allowlist.json under "${ecosystem}".`,
    });
  }
  return null;
}
// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------
/**
 * Scan the lockfiles under targetPath for supply chain issues.
 *
 * Checks performed:
 *   1. Blocklist matches (known compromised packages) — CRITICAL
 *   2. OSV.dev advisories via one batch query over all parsed deps — CRITICAL/HIGH/MEDIUM
 *   3. Typosquatting against top packages — HIGH/MEDIUM
 *
 * Lockfiles looked up directly inside targetPath: package-lock.json, yarn.lock,
 * requirements.txt, Pipfile.lock, plus requirements-dev.txt,
 * requirements-prod.txt and requirements.lock when present.
 *
 * The result status is 'skipped' when no lockfile exists, 'error' when an
 * exception escapes the checks (findings gathered so far are kept), and 'ok'
 * otherwise. `osv_offline` is set on the result when OSV.dev was unreachable.
 *
 * @param {string} targetPath - Absolute root path being scanned
 * @param {object} discovery - Unused (scanner reads lockfiles by convention)
 * @returns {Promise<object>} - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;
  let osvOffline = false;

  // Discover lockfiles
  const describe = (name, parser, ecosystem) => ({
    path: join(targetPath, name),
    parser,
    name,
    ecosystem,
  });
  const lockfiles = [
    describe('package-lock.json', parsePackageLock, 'npm'),
    describe('yarn.lock', parseYarnLock, 'npm'),
    describe('requirements.txt', parseRequirementsTxt, 'pip'),
    describe('Pipfile.lock', parsePipfileLock, 'pip'),
  ];
  // Also check for requirements-*.txt variants
  for (const variant of ['requirements-dev.txt', 'requirements-prod.txt', 'requirements.lock']) {
    const candidate = describe(variant, parseRequirementsTxt, 'pip');
    if (existsSync(candidate.path)) lockfiles.push(candidate);
  }

  const presentLockfiles = lockfiles.filter(({ path }) => existsSync(path));
  if (presentLockfiles.length === 0) {
    return scannerResult('supply-chain-recheck', 'skipped', [], 0, Date.now() - startMs);
  }

  try {
    // Load top packages and allowlist for typosquat detection
    const [topPkgs, allowlist] = await Promise.all([loadTopPackages(), loadTyposquatAllowlist()]);
    const normalize = (n) => n.toLowerCase().replace(/[_.-]/g, '-');
    const npmTop = topPkgs.npm.map(normalize);
    const pypiTop = topPkgs.pypi.map(normalize);
    // Per-ecosystem typosquat settings: popular list, strict-match cutoff, label, allowlist.
    const typosquatConfig = {
      npm: { top: npmTop, cutoff: 200, label: 'npm', allowed: allowlist.npm },
      pip: { top: pypiTop, cutoff: 100, label: 'pip', allowed: allowlist.pypi },
    };

    // Parse all lockfiles
    const allDeps = [];
    for (const lockfile of presentLockfiles) {
      filesScanned += 1;
      const deps = await lockfile.parser(lockfile.path);

      // 1. Blocklist check
      allFindings.push(...checkBlocklists(deps, lockfile.name));

      // 3. Typosquat check
      const cfg = typosquatConfig[lockfile.ecosystem];
      if (cfg) {
        allFindings.push(
          ...checkTyposquatting(deps, cfg.top, cfg.cutoff, cfg.label, lockfile.name, cfg.allowed),
        );
      }

      // Remember which lockfile each dep came from for OSV re-tagging below.
      for (const d of deps) allDeps.push({ ...d, lockfile: lockfile.name });
    }

    // 2. OSV.dev batch check (all deps from all lockfiles at once)
    const osvDeps = allDeps.filter((d) => d.version);
    if (osvDeps.length > 0) {
      const osvResult = await checkOSV(osvDeps, 'lockfiles');
      if (osvResult.offline) {
        osvOffline = true;
        allFindings.push(finding({
          scanner: 'SCR',
          severity: SEVERITY.INFO,
          title: 'OSV.dev unreachable — CVE check skipped',
          description:
            'Could not reach the OSV.dev API. Blocklist and typosquat checks were performed, ' +
            'but known vulnerability (CVE) detection was skipped. Re-run when network is available.',
          owasp: 'LLM03',
          recommendation: 'Re-run the scanner when network connectivity is restored.',
        }));
      } else {
        // Re-tag findings with correct lockfile names
        for (const osvFinding of osvResult.findings) {
          // Evidence is expected to look like "<vuln id> — <name>@<version>".
          const parsedEvidence = osvFinding.evidence?.match(/^(\S+)\s*—\s*(\S+?)@/);
          const origin = parsedEvidence && allDeps.find((d) => d.name === parsedEvidence[2]);
          if (origin) osvFinding.file = origin.lockfile;
          allFindings.push(osvFinding);
        }
      }
    }

    const result = scannerResult('supply-chain-recheck', 'ok', allFindings, filesScanned, Date.now() - startMs);
    if (osvOffline) result.osv_offline = true;
    return result;
  } catch (err) {
    return scannerResult(
      'supply-chain-recheck', 'error', allFindings, filesScanned, Date.now() - startMs, err.message,
    );
  }
}