ktg-plugin-marketplace/plugins/llm-security/scanners/dep-auditor.mjs

// dep-auditor.mjs — Deterministic dependency security scanner
// Detects CVEs (npm/pip audit), typosquatting, malicious install scripts,
// and unpinned versions. Zero external dependencies — Node.js builtins only.
//
// OWASP coverage: LLM03 (Supply Chain)

import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { levenshtein } from './lib/string-utils.mjs';
import { readFile } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { existsSync } from 'node:fs';
import { execSync } from 'node:child_process';
import { fileURLToPath } from 'node:url';

// ---------------------------------------------------------------------------
// Top-package knowledge base loader
// ---------------------------------------------------------------------------

// ES modules do not provide __dirname, so derive this module's directory
// from import.meta.url. Knowledge files are resolved relative to it.
const __dirname = dirname(fileURLToPath(import.meta.url));

/**
 * Cache for the parsed top-packages list; null until loadTopPackages() runs.
 * @type {{ npm: string[], pypi: string[] } | null}
 */
let _topPackages = null;
/**
 * Cache for the typosquat allowlist; null until loadTyposquatAllowlist() runs.
 * @type {{ npm: Set<string>, pypi: Set<string> } | null}
 */
let _typosquatAllowlist = null;

/**
 * Load top-packages.json from the knowledge directory.
 * The result is cached in `_topPackages` after the first call.
 *
 * The parsed document is normalized so callers can always rely on both
 * `npm` and `pypi` being arrays of strings: a missing or non-array list
 * becomes `[]` and non-string entries are dropped. If the file cannot be
 * read or parsed, both lists are empty, so typosquatting detection has
 * nothing to compare against.
 *
 * @returns {Promise<{ npm: string[], pypi: string[] }>}
 */
async function loadTopPackages() {
  if (_topPackages) return _topPackages;

  // Callers invoke string methods on every entry, so keep only strings.
  const toNameList = (value) =>
    Array.isArray(value) ? value.filter((name) => typeof name === 'string') : [];

  const knowledgePath = join(__dirname, '..', 'knowledge', 'top-packages.json');
  try {
    const data = JSON.parse(await readFile(knowledgePath, 'utf8'));
    _topPackages = {
      npm: toNameList(data?.npm),
      pypi: toNameList(data?.pypi),
    };
  } catch {
    // Graceful fallback: empty lists — typosquatting detection skipped
    _topPackages = { npm: [], pypi: [] };
  }
  return _topPackages;
}

/**
 * Load typosquat-allowlist.json from the knowledge directory.
 * The result is cached in `_typosquatAllowlist` after the first call.
 *
 * Each listed name is lower-cased and every `_`, `.` or `-` character is
 * replaced by `-` before being stored. If the file cannot be read, parsed,
 * or processed, both sets are empty.
 *
 * @returns {Promise<{ npm: Set<string>, pypi: Set<string> }>}
 */
async function loadTyposquatAllowlist() {
  if (_typosquatAllowlist) return _typosquatAllowlist;

  const canonical = (name) => name.toLowerCase().replace(/[_.-]/g, '-');
  const asSet = (names) => new Set((names || []).map((name) => canonical(name)));

  const allowPath = join(__dirname, '..', 'knowledge', 'typosquat-allowlist.json');
  let loaded;
  try {
    const { npm, pypi } = JSON.parse(await readFile(allowPath, 'utf8'));
    loaded = { npm: asSet(npm), pypi: asSet(pypi) };
  } catch {
    // Any failure leaves the allowlist empty rather than aborting the scan.
    loaded = { npm: new Set(), pypi: new Set() };
  }
  _typosquatAllowlist = loaded;
  return loaded;
}

// ---------------------------------------------------------------------------
// File reading helpers
// ---------------------------------------------------------------------------

/**
 * Read a file as UTF-8 and parse it as JSON.
 * Any read or parse failure is swallowed and reported as null.
 * @param {string} absPath
 * @returns {Promise<object|null>} Parsed value, or null on error.
 */
async function readJson(absPath) {
  let parsed = null;
  try {
    parsed = JSON.parse(await readFile(absPath, 'utf8'));
  } catch {
    // Unreadable or malformed file: keep the null default.
  }
  return parsed;
}

/**
 * Read a UTF-8 text file and split it into lines on "\n".
 * A single trailing "\r" is removed from each line so CRLF files yield
 * clean lines. Returns an empty array if the file cannot be read.
 * @param {string} absPath
 * @returns {Promise<string[]>}
 */
async function readLines(absPath) {
  let text;
  try {
    text = await readFile(absPath, 'utf8');
  } catch {
    return [];
  }

  const lines = [];
  for (const segment of text.split('\n')) {
    lines.push(segment.endsWith('\r') ? segment.slice(0, -1) : segment);
  }
  return lines;
}

// ---------------------------------------------------------------------------
// Category 1: CVE Detection via npm/pip audit
// ---------------------------------------------------------------------------

/**
 * Translate an npm audit severity label into the matching SEVERITY constant.
 * 'critical', 'high' and 'moderate' map to CRITICAL, HIGH and MEDIUM;
 * 'low' and every other value map to LOW.
 */
function npmSeverityToOurs(npmSev) {
  if (npmSev === 'critical') return SEVERITY.CRITICAL;
  if (npmSev === 'high') return SEVERITY.HIGH;
  if (npmSev === 'moderate') return SEVERITY.MEDIUM;
  return SEVERITY.LOW;
}

/**
 * Run `npm audit --json` in targetPath and convert the report into findings.
 *
 * Failure modes that yield an empty result instead of an exception:
 * npm not installed, spawn failure, timeout, empty output, unparseable
 * output, or a report that is not a JSON object. A non-zero exit is
 * expected when vulnerabilities exist; the report is then read from the
 * thrown error's stdout.
 *
 * Only the v2 report shape (`{ vulnerabilities: { name: {...} } }`) is read.
 *
 * @param {string} targetPath  Directory containing package.json
 * @returns {object[]} findings
 */
function runNpmAudit(targetPath) {
  const findings = [];
  let raw;
  try {
    raw = execSync('npm audit --json', {
      cwd: targetPath,
      timeout: 30_000,
      // Audit reports for large dependency trees exceed execSync's default
      // output cap; a truncated report fails to parse and hides every finding.
      maxBuffer: 64 * 1024 * 1024,
      // stderr is discarded; only the JSON report on stdout matters.
      stdio: ['ignore', 'pipe', 'ignore'],
    }).toString();
  } catch (err) {
    // execSync throws on non-zero exit; the stdout is still on err.stdout
    raw = err.stdout ? err.stdout.toString() : null;
  }

  if (!raw || raw.trim().length === 0) return findings;

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return findings;
  }
  // `null` and primitives are valid JSON but carry no report.
  if (parsed === null || typeof parsed !== 'object') return findings;

  // npm audit v2 format: { vulnerabilities: { pkgName: { severity, via, ... } } }
  const vulns = parsed.vulnerabilities || {};
  for (const [pkgName, vuln] of Object.entries(vulns)) {
    if (vuln === null || typeof vuln !== 'object') continue;
    const severity = npmSeverityToOurs(vuln.severity);

    // Collect advisory IDs from the via chain. Entries may also be plain
    // strings (names of other packages), which carry no URL.
    const cveIds = [];
    if (Array.isArray(vuln.via)) {
      for (const v of vuln.via) {
        if (v && typeof v === 'object' && typeof v.url === 'string') {
          // Extract CVE or advisory ID from URL
          const match = v.url.match(/GHSA-[\w-]+|CVE-\d{4}-\d+/i);
          if (match && !cveIds.includes(match[0])) cveIds.push(match[0]);
        }
      }
    }

    const cveRef = cveIds.length > 0 ? ` (${cveIds.join(', ')})` : '';
    const fixAvailable = vuln.fixAvailable
      ? typeof vuln.fixAvailable === 'object'
        ? ` Fix: upgrade to ${vuln.fixAvailable.name}@${vuln.fixAvailable.version}.`
        : ' A fix is available — run `npm audit fix`.'
      : ' No automatic fix available — review manually.';

    findings.push(
      finding({
        scanner: 'DEP',
        severity,
        title: `Vulnerable npm dependency: ${pkgName}${cveRef}`,
        description:
          `npm audit reports a ${vuln.severity} severity vulnerability in "${pkgName}".` +
          (vuln.range ? ` Affected range: ${vuln.range}.` : '') +
          fixAvailable,
        file: 'package.json',
        evidence: cveIds.length > 0 ? cveIds.join(', ') : `${pkgName} @ ${vuln.range || 'unknown'}`,
        owasp: 'LLM03',
        recommendation:
          `Run \`npm audit fix\` or manually upgrade "${pkgName}" to a patched version. ` +
          'Review the advisory for workarounds if no fix is available.',
      }),
    );
  }

  return findings;
}

/**
 * Run pip-audit with JSON output in targetPath and convert the report into
 * findings.
 *
 * The auditing tool is installed as the `pip-audit` executable, so that
 * command is tried first. The `pip audit` spelling is kept as a fallback so
 * environments that provide it keep working. The first command that emits a
 * JSON object or array wins.
 *
 * Failure modes that yield an empty result instead of an exception: tool not
 * installed, spawn failure, timeout, empty or unparseable output.
 *
 * NOTE(review): no `-r requirements.txt` or project path is passed, so the
 * tool presumably audits the Python environment it runs in rather than the
 * target's declared requirements — confirm this is intended.
 *
 * @param {string} targetPath  Directory used as the working directory
 * @returns {object[]} findings
 */
function runPipAudit(targetPath) {
  const findings = [];
  const commands = ['pip-audit --format json', 'pip audit --format json'];

  let parsed = null;
  for (const command of commands) {
    let raw;
    try {
      raw = execSync(command, {
        cwd: targetPath,
        timeout: 30_000,
        // Avoid truncating large reports at execSync's default output cap.
        maxBuffer: 64 * 1024 * 1024,
        stdio: ['ignore', 'pipe', 'ignore'],
      }).toString();
    } catch (err) {
      // A non-zero exit still leaves any report on err.stdout.
      raw = err.stdout ? err.stdout.toString() : null;
    }

    if (!raw || raw.trim().length === 0) continue;

    let candidate;
    try {
      candidate = JSON.parse(raw);
    } catch {
      continue;
    }
    if (candidate !== null && typeof candidate === 'object') {
      parsed = candidate;
      break;
    }
  }

  if (parsed === null) return findings;

  // Report is either a bare array of packages or { dependencies: [...] };
  // each package looks like { name, version, vulns: [{ id, fix_versions, description }] }
  const listed = Array.isArray(parsed) ? parsed : parsed.dependencies;
  const packages = Array.isArray(listed) ? listed : [];
  for (const pkg of packages) {
    if (!pkg || !Array.isArray(pkg.vulns) || pkg.vulns.length === 0) continue;
    for (const vuln of pkg.vulns) {
      if (!vuln) continue;
      const fixes = Array.isArray(vuln.fix_versions) && vuln.fix_versions.length > 0
        ? ` Fix in version(s): ${vuln.fix_versions.join(', ')}.`
        : ' No fix version reported.';

      findings.push(
        finding({
          scanner: 'DEP',
          severity: SEVERITY.HIGH,  // the report fields read here carry no severity; default HIGH
          title: `Vulnerable Python dependency: ${pkg.name} (${vuln.id})`,
          description:
            `pip audit reports vulnerability ${vuln.id} in "${pkg.name}" v${pkg.version}.` +
            (vuln.description ? ` ${vuln.description}` : '') +
            fixes,
          file: 'requirements.txt',
          evidence: `${vuln.id} — ${pkg.name}@${pkg.version}`,
          owasp: 'LLM03',
          recommendation:
            `Upgrade "${pkg.name}" to a patched version.${fixes} ` +
            'Run `pip audit` after upgrading to verify resolution.',
        }),
      );
    }
  }

  return findings;
}

// ---------------------------------------------------------------------------
// Category 2: Typosquatting Detection
// ---------------------------------------------------------------------------

/**
 * Pull the declared package names out of requirements.txt lines.
 *
 * Blank lines, `#` comment lines and lines starting with `-` (options and
 * `-r` includes) are ignored. For every other line the leading run of name
 * characters is taken, lower-cased, and underscores are turned into hyphens.
 * Lines that do not begin with a letter or digit contribute nothing.
 *
 * @param {string[]} lines
 * @returns {string[]}
 */
function parseRequirementsTxt(lines) {
  const leadingName = /^([A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?)/;

  return [...lines].flatMap((line) => {
    const entry = line.trim();
    if (entry === '' || entry.startsWith('#') || entry.startsWith('-')) return [];

    const hit = leadingName.exec(entry);
    return hit ? [hit[1].toLowerCase().replace(/_/g, '-')] : [];
  });
}

/**
 * Check one declared package name against the top-packages list for typosquatting.
 *
 * A name is considered safe (returns null) when it is allowlisted, when it
 * exactly equals any entry of topList, or when no top package is within the
 * reporting thresholds. Otherwise the closest top package is reported:
 * edit distance 1 is always flagged HIGH; edit distance 2 is flagged MEDIUM
 * only when the closest package's index is below top200Cutoff.
 *
 * Candidates whose length differs from the declared name by more than 2 are
 * skipped without computing the distance, since they cannot be within 2 edits.
 *
 * @param {string} declaredName   - Normalized (lowercase, hyphens) declared package name
 * @param {string[]} topList      - Top package names (same normalization), most popular first
 * @param {number} top200Cutoff   - Index cutoff for "very popular" (top 200 for npm, top 100 for PyPI)
 * @param {string} ecosystem      - 'npm' or 'pypi'
 * @param {string} sourceFile     - 'package.json' or 'requirements.txt'
 * @param {Set<string>} [allowlist] - Known-legitimate names; optional
 * @returns {object|null}
 */
function checkTyposquatting(declaredName, topList, top200Cutoff, ecosystem, sourceFile, allowlist) {
  // Skip known legitimate packages. The allowlist loader in this file stores
  // names with every `_`, `.` and `-` mapped to `-`, while declared names
  // only have `_` mapped, so also look up the allowlist's own normalization;
  // otherwise allowlisted names containing `.` could never match.
  if (allowlist) {
    const allowKey = declaredName.replace(/[_.-]/g, '-');
    if (allowlist.has(declaredName) || allowlist.has(allowKey)) return null;
  }

  let closestDist = Infinity;
  let closestPkg = null;
  let closestIdx = Infinity;

  for (let i = 0; i < topList.length; i++) {
    const topPkg = topList[i];

    // Exact match — legitimate package, skip
    if (declaredName === topPkg) return null;

    // Pre-filter: skip if length difference > 2
    if (Math.abs(declaredName.length - topPkg.length) > 2) continue;

    const dist = levenshtein(declaredName, topPkg);

    // Strict comparison keeps the earliest (lowest-index) candidate on ties.
    if (dist < closestDist) {
      closestDist = dist;
      closestPkg = topPkg;
      closestIdx = i;
    }
  }

  if (closestPkg === null) return null;

  // Flag distance 1 always; distance 2 only if target is in top 200 (top200Cutoff)
  if (closestDist === 1) {
    return finding({
      scanner: 'DEP',
      severity: SEVERITY.HIGH,
      title: `Possible typosquatting: "${declaredName}" vs "${closestPkg}" (edit distance 1)`,
      description:
        `The declared ${ecosystem} package "${declaredName}" is 1 character away from the ` +
        `popular package "${closestPkg}". This is a strong typosquatting indicator. ` +
        `Typosquatting packages impersonate popular libraries to execute malicious install scripts.`,
      file: sourceFile,
      evidence: `"${declaredName}" → closest match "${closestPkg}" (Levenshtein distance: 1)`,
      owasp: 'LLM03',
      recommendation:
        `Verify that "${declaredName}" is the intended package. If you meant "${closestPkg}", ` +
        `correct the dependency name. If "${declaredName}" is intentional, add an inline comment ` +
        `confirming this to suppress future alerts.`,
    });
  }

  if (closestDist === 2 && closestIdx < top200Cutoff) {
    return finding({
      scanner: 'DEP',
      severity: SEVERITY.MEDIUM,
      title: `Potential typosquatting: "${declaredName}" vs "${closestPkg}" (edit distance 2)`,
      description:
        `The declared ${ecosystem} package "${declaredName}" is 2 characters away from the ` +
        `highly popular package "${closestPkg}" (top ${top200Cutoff} by downloads). ` +
        `While less certain than distance-1 matches, this warrants manual verification.`,
      file: sourceFile,
      evidence: `"${declaredName}" → closest match "${closestPkg}" (Levenshtein distance: 2)`,
      owasp: 'LLM03',
      recommendation:
        `Confirm "${declaredName}" is the correct and intended package name. ` +
        `Check the package's publish date, author, and download count on the registry.`,
    });
  }

  return null;
}

// ---------------------------------------------------------------------------
// Category 3: Malicious Install Scripts
// ---------------------------------------------------------------------------

/** Patterns in install script values that indicate network/exec behaviour. */
const MALICIOUS_SCRIPT_PATTERNS = [
  { pattern: /\bcurl\b/,          label: 'curl (network fetch)' },
  { pattern: /\bwget\b/,          label: 'wget (network fetch)' },
  { pattern: /\bfetch\b/,         label: 'fetch (network request)' },
  { pattern: /https?:\/\//,       label: 'HTTP URL' },
  { pattern: /\beval\b/,          label: 'eval (code execution)' },
  { pattern: /\bexec\b/,          label: 'exec (process execution)' },
  { pattern: /child_process/,     label: 'child_process (subprocess)' },
  { pattern: /net\.connect\b/,    label: 'net.connect (raw TCP)' },
  { pattern: /\bdgram\b/,         label: 'dgram (UDP socket)' },
];

/** npm lifecycle hooks that run automatically on install. */
const INSTALL_HOOKS = ['preinstall', 'install', 'postinstall'];

/**
 * Check package.json scripts for malicious install script patterns.
 *
 * Each hook listed in INSTALL_HOOKS whose script is a non-empty string is
 * tested against MALICIOUS_SCRIPT_PATTERNS; one HIGH finding is produced per
 * hook that matches at least one pattern. The evidence shows the script with
 * URLs replaced by `[URL]`, cut to 120 characters, with `...` appended only
 * when the redacted text was actually cut.
 *
 * @param {object} pkgJson  - Parsed package.json object
 * @returns {object[]}      - findings
 */
function checkInstallScripts(pkgJson) {
  const EVIDENCE_LIMIT = 120;
  const findings = [];
  const scripts = pkgJson.scripts || {};

  for (const hook of INSTALL_HOOKS) {
    const script = scripts[hook];
    if (!script || typeof script !== 'string') continue;

    const matched = MALICIOUS_SCRIPT_PATTERNS.filter(({ pattern }) => pattern.test(script));
    if (matched.length === 0) continue;

    const labels = matched.map(m => m.label).join(', ');
    // Redact any URLs in the evidence to avoid leaking sensitive paths in reports
    const redacted = script.replace(/https?:\/\/[^\s"']+/g, '[URL]');
    // Decide truncation on the redacted text: redaction can shrink a long
    // script below the limit, in which case nothing is cut and no ellipsis
    // belongs in the evidence.
    const shown = redacted.length > EVIDENCE_LIMIT
      ? `${redacted.slice(0, EVIDENCE_LIMIT)}...`
      : redacted;

    findings.push(
      finding({
        scanner: 'DEP',
        severity: SEVERITY.HIGH,
        title: `Suspicious npm install hook: scripts.${hook} contains network/exec patterns`,
        description:
          `The package.json "scripts.${hook}" field runs automatically during \`npm install\` ` +
          `and contains suspicious patterns: ${labels}. ` +
          `Malicious packages use install hooks to exfiltrate data, download payloads, or establish persistence.`,
        file: 'package.json',
        evidence: `scripts.${hook}: "${shown}"`,
        owasp: 'LLM03',
        recommendation:
          `Review the scripts.${hook} command carefully. If this package is a dependency ` +
          `(not your own), consider whether this behaviour is expected. Use \`npm install --ignore-scripts\` ` +
          `if install hooks are not needed. File a report at https://www.npmjs.com/support if malicious.`,
      }),
    );
  }

  return findings;
}

// ---------------------------------------------------------------------------
// Category 4: Unpinned Versions
// ---------------------------------------------------------------------------

/**
 * Flags for unpinned npm dependency specifiers: `*`, `latest`, a leading `x`,
 * and `>` / `>=` comparators. Whitespace between the comparator and the
 * version (e.g. `>= 1.0.0`) is accepted.
 */
const UNPINNED_NPM_RE = /^(\*|latest|x|>=?\s*\d)/;

/**
 * Check package.json dependencies for unpinned version specifiers.
 *
 * Scans `dependencies` and `devDependencies`. A specifier is reported (LOW)
 * when, after trimming, it is empty — npm treats an empty range like `*` —
 * or matches UNPINNED_NPM_RE. Non-string specifiers are ignored.
 *
 * @param {object} pkgJson
 * @returns {object[]}
 */
function checkUnpinnedNpm(pkgJson) {
  const findings = [];
  const depSections = [
    ['dependencies', pkgJson.dependencies],
    ['devDependencies', pkgJson.devDependencies],
  ];

  for (const [sectionName, deps] of depSections) {
    if (!deps || typeof deps !== 'object') continue;
    for (const [name, version] of Object.entries(deps)) {
      if (typeof version !== 'string') continue;

      const spec = version.trim();
      const isUnpinned = spec === '' || UNPINNED_NPM_RE.test(spec);
      if (!isUnpinned) continue;

      findings.push(
        finding({
          scanner: 'DEP',
          severity: SEVERITY.LOW,
          title: `Unpinned npm dependency: ${name}@${version}`,
          description:
            `The package "${name}" in ${sectionName} uses an unpinned version specifier "${version}". ` +
            `Unpinned dependencies can silently pull in a compromised version on the next install.`,
          file: 'package.json',
          evidence: `${sectionName}.${name}: "${version}"`,
          owasp: 'LLM03',
          recommendation:
            `Pin "${name}" to an exact version (e.g., "${name}": "x.y.z") or use a lockfile ` +
            `(\`package-lock.json\` or \`yarn.lock\`) and commit it. Run \`npm ci\` in CI instead of \`npm install\`.`,
        }),
      );
    }
  }

  return findings;
}

/**
 * Check requirements.txt lines for unpinned packages (missing == pin).
 *
 * Blank lines, `#` comment lines and lines starting with `-` are skipped.
 * For each remaining line only the requirement part is inspected: a trailing
 * inline comment (whitespace followed by `#`) and anything after the first
 * `;` (environment markers) are ignored, so operators appearing there cannot
 * make a floating requirement look pinned. Then:
 *   - contains `==`                → considered pinned, no finding
 *   - contains no operator at all  → "no version specifier" finding (LOW)
 *   - contains another operator    → "loosely pinned" finding (LOW)
 *
 * Reported line numbers are 1-based; evidence is the full trimmed line.
 *
 * @param {string[]} lines
 * @returns {object[]}
 */
function checkUnpinnedPypi(lines) {
  const findings = [];

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (!line || line.startsWith('#') || line.startsWith('-')) continue;

    // Strip inline comment and environment marker before looking for operators.
    const requirement = line.replace(/\s#.*$/, '').split(';')[0];
    if (/==/.test(requirement)) continue;

    const match = line.match(/^([A-Za-z0-9][A-Za-z0-9._-]*)/);
    const name = match ? match[1] : line;
    const hasSpecifier = /[><=~!]/.test(requirement);

    if (!hasSpecifier) {
      // No version at all
      findings.push(
        finding({
          scanner: 'DEP',
          severity: SEVERITY.LOW,
          title: `Unpinned Python dependency: ${name} (no version specifier)`,
          description:
            `"${name}" in requirements.txt has no version pin. ` +
            `Without pinning, \`pip install\` may resolve to a future compromised version.`,
          file: 'requirements.txt',
          line: i + 1,
          evidence: line,
          owasp: 'LLM03',
          recommendation:
            `Pin to an exact version: \`${name}==<version>\`. ` +
            `Use \`pip freeze > requirements.txt\` to capture current versions, ` +
            `or use \`pip-compile\` (pip-tools) for reproducible builds.`,
        }),
      );
    } else {
      // Has >= or ~= but no == — floating upper bound
      findings.push(
        finding({
          scanner: 'DEP',
          severity: SEVERITY.LOW,
          title: `Loosely pinned Python dependency: ${name}`,
          description:
            `"${name}" in requirements.txt uses a range specifier without a strict == pin. ` +
            `Range specifiers allow unexpected version upgrades that may introduce vulnerabilities.`,
          file: 'requirements.txt',
          line: i + 1,
          evidence: line,
          owasp: 'LLM03',
          recommendation:
            `Prefer exact version pinning (\`${name}==x.y.z\`) for reproducible installs. ` +
            `If you need flexibility, use a lockfile approach (\`pip-compile\`).`,
        }),
      );
    }
  }

  return findings;
}

// ---------------------------------------------------------------------------
// Main scanner export
// ---------------------------------------------------------------------------

/**
 * Scan targetPath for dependency security issues.
 *
 * Detection categories:
 *   1. CVE Detection via npm audit / pip audit         (CRITICAL / HIGH)
 *   2. Typosquatting against top-200 npm / top-100 PyPI (HIGH / MEDIUM)
 *   3. Malicious install scripts in package.json        (HIGH)
 *   4. Unpinned version specifiers                      (LOW)
 *
 * Ecosystems are detected by file presence in targetPath: package.json for
 * npm; requirements.txt, setup.py or pyproject.toml for PyPI. With neither
 * present the result status is 'skipped'. Any exception raised while
 * scanning yields status 'error' together with the findings collected so far.
 *
 * Top-package lists are read defensively: a missing or malformed list is
 * treated as empty so it only disables typosquatting checks instead of
 * aborting the whole scan.
 *
 * @param {string} targetPath   - Absolute root path being scanned
 * @param {object} discovery    - Unused (dep-auditor reads files by convention, not discovery list)
 * @returns {Promise<object>}   - scannerResult envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const findings = [];
  let filesScanned = 0;

  // Detect which ecosystems are present
  const pkgJsonPath     = join(targetPath, 'package.json');
  const requirementsTxt = join(targetPath, 'requirements.txt');
  const setupPy         = join(targetPath, 'setup.py');
  const pyprojectToml   = join(targetPath, 'pyproject.toml');

  const hasNpm          = existsSync(pkgJsonPath);
  const hasRequirements = existsSync(requirementsTxt);
  const hasPyproject    = existsSync(pyprojectToml);
  const hasPypi         = hasRequirements || hasPyproject || existsSync(setupPy);

  // Nothing to scan
  if (!hasNpm && !hasPypi) {
    return scannerResult('dep-auditor', 'skipped', [], 0, Date.now() - startMs);
  }

  // Same normalization for declared names and top-list entries.
  const normalizeName = (name) => name.toLowerCase().replace(/_/g, '-');
  // Tolerate a knowledge file that lacks a list or contains non-string entries.
  const toTopList = (list) =>
    Array.isArray(list)
      ? list.filter((name) => typeof name === 'string').map(normalizeName)
      : [];

  try {
    // -----------------------------------------------------------------------
    // npm ecosystem
    // -----------------------------------------------------------------------
    if (hasNpm) {
      filesScanned++;
      const pkgJson = await readJson(pkgJsonPath);

      if (pkgJson) {
        // 1a. CVE via npm audit
        findings.push(...runNpmAudit(targetPath));

        // 2a. Typosquatting — npm
        const [topPkgs, allowlist] = await Promise.all([loadTopPackages(), loadTyposquatAllowlist()]);
        const npmTop = toTopList(topPkgs?.npm);
        const allDeps = {
          ...pkgJson.dependencies,
          ...pkgJson.devDependencies,
        };
        for (const dep of Object.keys(allDeps)) {
          const f = checkTyposquatting(normalizeName(dep), npmTop, 200, 'npm', 'package.json', allowlist?.npm);
          if (f) findings.push(f);
        }

        // 3. Malicious install scripts
        findings.push(...checkInstallScripts(pkgJson));

        // 4a. Unpinned versions
        findings.push(...checkUnpinnedNpm(pkgJson));
      }
    }

    // -----------------------------------------------------------------------
    // PyPI ecosystem
    // -----------------------------------------------------------------------
    if (hasPypi) {
      // 1b. CVE via pip audit (only if requirements.txt or pyproject.toml present)
      if (hasRequirements || hasPyproject) {
        findings.push(...runPipAudit(targetPath));
      }

      // 2b. Typosquatting — PyPI (only if requirements.txt present)
      if (hasRequirements) {
        filesScanned++;
        const [reqLines, topPkgs, allowlist] = await Promise.all([
          readLines(requirementsTxt),
          loadTopPackages(),
          loadTyposquatAllowlist(),
        ]);
        const pypiTop = toTopList(topPkgs?.pypi);

        for (const dep of parseRequirementsTxt(reqLines)) {
          const f = checkTyposquatting(dep, pypiTop, 100, 'pypi', 'requirements.txt', allowlist?.pypi);
          if (f) findings.push(f);
        }

        // 4b. Unpinned versions
        findings.push(...checkUnpinnedPypi(reqLines));
      }
    }

    return scannerResult('dep-auditor', 'ok', findings, filesScanned, Date.now() - startMs);
  } catch (err) {
    return scannerResult(
      'dep-auditor',
      'error',
      findings,
      filesScanned,
      Date.now() - startMs,
      err.message,
    );
  }
}