// ktg-plugin-marketplace/plugins/llm-security/scanners/taint-tracer.mjs

// taint-tracer.mjs — Deterministic taint analysis: traces untrusted data from sources to dangerous sinks
// Zero dependencies (Node.js builtins only via lib helpers).
//
// LIMITATIONS (read before interpreting results):
//   ~70% recall, ~50-70% precision for medium findings.
//   - No scope awareness: a variable named `input` in one function taints all uses across the file.
//   - No cross-file tracing: taint does not propagate across module boundaries.
//   - No closure / callback analysis: reassignment inside closures is not tracked.
//   - No data-flow through arrays or object properties (e.g., `obj.field = userInput`).
//   - Sanitization suppression is keyword-based; adversarial code can evade it.
//   - Shell variable pattern ($VAR) is very broad in .sh/.bash/.zsh files — expect FPs.
//   - Same-line source+sink detection is approximate; unrelated code on the same line may trigger.
//
// References:
//   - OWASP LLM01 (Prompt Injection — injection sinks: eval, exec, SQL queries)
//   - OWASP LLM02 (Sensitive Info Disclosure — exfiltration sinks: fetch, .post, .send)
//   - skill-threat-patterns.md: toolchain manipulation, persistence patterns

import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';

// ---------------------------------------------------------------------------
// File extension filter — only scan code files, not config/docs
// JVM-language support (.kt, .kts, .groovy, .gradle, .scala) is required for
// JetBrains plugin scanning — plugin source lives in these languages.
// ---------------------------------------------------------------------------

// Shell dialects are kept as their own set because shell files additionally
// get the broad `$VAR` source pattern; the set is folded into CODE_EXTENSIONS
// below so the two lists cannot drift apart.
const SHELL_EXTENSIONS = new Set(['.sh', '.bash', '.zsh']);

// Every extension the scanner is willing to read. Entries are lowercase.
const CODE_EXTENSIONS = new Set([
  // JavaScript / TypeScript
  '.js',
  '.mjs',
  '.cjs',
  '.ts',
  '.mts',
  '.cts',
  '.jsx',
  '.tsx',
  // Python
  '.py',
  '.pyw',
  // Ruby / PHP
  '.rb',
  '.php',
  // Go / Rust
  '.go',
  '.rs',
  // Java / C#
  '.java',
  '.cs',
  // Other JVM languages (Kotlin, Groovy/Gradle, Scala)
  '.kt',
  '.kts',
  '.groovy',
  '.gradle',
  '.scala',
  // Shell
  ...SHELL_EXTENSIONS,
]);

// ---------------------------------------------------------------------------
// Source patterns — untrusted / externally controlled data origins
// ---------------------------------------------------------------------------

// NOTE: Shell variable pattern ($VAR) is intentionally only applied in SHELL_EXTENSIONS.
// Applying it to JS/TS would produce massive false-positive rates.
// Language-agnostic source table, written as [pattern, label] rows and
// expanded into { pattern, label } records.
// Row order is significant: when several sources match one line, the label of
// the first matching row is the one recorded for the tainted variable.
const SOURCES_COMMON = [
  // Node.js / JavaScript
  [/process\.env\[?/g, 'process.env'],
  [/process\.argv/g, 'process.argv'],
  [/req\.body/g, 'req.body'],
  [/req\.query/g, 'req.query'],
  [/req\.params/g, 'req.params'],
  [/req\.headers/g, 'req.headers'],
  [/request\.body/g, 'request.body'],
  [/request\.form/g, 'request.form'],
  [/tool_input/g, 'tool_input'],
  [/user_input/g, 'user_input'],
  [/\$ARGUMENTS/g, '$ARGUMENTS'],
  [/\bstdin\b/g, 'stdin'],
  // Python
  [/os\.environ/g, 'os.environ'],
  [/sys\.argv/g, 'sys.argv'],
  [/\binput\s*\(/g, 'input()'],
  [/request\.args/g, 'request.args'],
  [/request\.json/g, 'request.json'],
].map(([pattern, label]) => ({ pattern, label }));

// Shell-only source: any `$NAME` / `${NAME}` reference. Matches that name a
// well-known safe variable (see SHELL_SAFE_VARS) are discarded by the caller.
const SOURCE_SHELL = {
  pattern: /\$\{?\w+\}?/g,
  label: 'shell variable',
};

// Shell variables that are virtually always safe — used to suppress
// false positives from the broad SOURCE_SHELL pattern.
const SHELL_SAFE_VARS = new Set([
  // Bare form
  '$HOME',
  '$PATH',
  '$USER',
  '$PWD',
  '$SHELL',
  '$IFS',
  '$0',
  '$#',
  // Braced form
  '${HOME}',
  '${PATH}',
  '${USER}',
  '${PWD}',
  '${SHELL}',
]);

// ---------------------------------------------------------------------------
// Sink patterns — dangerous operations that could lead to injection/exfiltration
// ---------------------------------------------------------------------------

// Each sink carries a `risk` label and a preferred OWASP mapping:
//   injection  → LLM01
//   exfiltration → LLM02

// Sink table, written as [pattern, label, risk, owasp] rows and expanded into
// { pattern, label, risk, owasp } records. Row order determines the order in
// which sinks on a single line are reported.
const SINKS = [
  // Code / command execution (injection risk → LLM01)
  [/\beval\s*\(/g, 'eval()', 'code execution', 'LLM01'],
  [/\bexec\s*\(/g, 'exec()', 'command execution', 'LLM01'],
  [/\bexecSync\s*\(/g, 'execSync()', 'command execution', 'LLM01'],
  [/\bspawn\s*\(/g, 'spawn()', 'command execution', 'LLM01'],
  [/\bspawnSync\s*\(/g, 'spawnSync()', 'command execution', 'LLM01'],
  [/child_process/g, 'child_process', 'command execution', 'LLM01'],
  [/new\s+Function\s*\(/g, 'new Function()', 'code execution', 'LLM01'],
  [/\bsubprocess\./g, 'subprocess', 'command execution', 'LLM01'],
  [/os\.system\s*\(/g, 'os.system()', 'command execution', 'LLM01'],
  [/os\.popen\s*\(/g, 'os.popen()', 'command execution', 'LLM01'],

  // File system writes (could be used to persist injected content)
  [/writeFile\s*\(/g, 'writeFile()', 'file write', 'LLM01'],
  [/writeFileSync\s*\(/g, 'writeFileSync()', 'file write', 'LLM01'],
  [/\bappendFile/g, 'appendFile()', 'file write', 'LLM01'],
  [/createWriteStream/g, 'createWriteStream()', 'file write', 'LLM01'],
  [/open\s*\(.*['"]w/g, 'open(w)', 'file write', 'LLM01'],

  // Network / exfiltration (data leaving the process → LLM02)
  [/\bfetch\s*\(/g, 'fetch()', 'network request', 'LLM02'],
  [/\.send\s*\(/g, '.send()', 'data exfiltration', 'LLM02'],
  [/\.post\s*\(/g, '.post()', 'data exfiltration', 'LLM02'],
  [/XMLHttpRequest/g, 'XMLHttpRequest', 'network request', 'LLM02'],
  [/WebSocket/g, 'WebSocket', 'network connection', 'LLM02'],

  // Database (SQL injection → LLM01)
  [/\.query\s*\(/g, '.query()', 'SQL injection', 'LLM01'],
  [/\.execute\s*\(/g, '.execute()', 'SQL injection', 'LLM01'],
  [/\.raw\s*\(/g, '.raw()', 'raw query', 'LLM01'],

  // HTML / DOM injection (XSS → LLM01 in agentic browser contexts)
  [/innerHTML\s*=/g, 'innerHTML', 'XSS', 'LLM01'],
  [/document\.write\s*\(/g, 'document.write()', 'XSS', 'LLM01'],
  [/dangerouslySetInnerHTML/g, 'dangerouslySetInnerHTML', 'XSS', 'LLM01'],
].map(([pattern, label, risk, owasp]) => ({ pattern, label, risk, owasp }));

// ---------------------------------------------------------------------------
// Sanitization suppression keywords
// ---------------------------------------------------------------------------
// If any of these appear on a line between a source and a sink (inclusive),
// severity is downgraded by one level. This is a heuristic — skilled attackers
// can bypass it by naming variables after safe functions.

// Case-insensitive alternation of sanitizer keywords, assembled from a list so
// each keyword sits on its own line. The pattern is unanchored, so a keyword
// also matches as a substring of a longer identifier.
const SANITIZER_PATTERN = new RegExp(
  [
    'sanitize',
    'escape',
    'validate',
    'parseInt',
    String.raw`Number\s*\(`,
    String.raw`path\.resolve`,
    String.raw`path\.join`,
    'encodeURI',
    'encodeURIComponent',
    'DOMPurify',
    String.raw`\.strip\s*\(`,
    String.raw`\.clean\s*\(`,
    String.raw`\.filter\s*\(`,
    'whitelist',
    'allowlist',
  ].join('|'),
  'i'
);

// ---------------------------------------------------------------------------
// Severity ordering utilities
// ---------------------------------------------------------------------------

// Severity levels ranked from most to least severe; a larger index means a
// lower severity.
const SEVERITY_ORDER = ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'INFO'].map(
  (level) => SEVERITY[level]
);

/**
 * Step a severity one rank down the SEVERITY_ORDER ladder.
 *
 * The lowest rank stays where it is, and a value that does not appear in
 * SEVERITY_ORDER is handed back untouched.
 *
 * @param {string} sev - Severity to lower.
 * @returns {string} The next-lower severity, or `sev` itself when it is
 *   already the lowest rank or is not a known severity.
 */
function downgradeSeverity(sev) {
  const rank = SEVERITY_ORDER.indexOf(sev);
  if (rank === -1) return sev;

  const lowestRank = SEVERITY_ORDER.length - 1;
  return rank >= lowestRank ? SEVERITY_ORDER[lowestRank] : SEVERITY_ORDER[rank + 1];
}

// ---------------------------------------------------------------------------
// Variable name extraction helpers
// ---------------------------------------------------------------------------

/**
 * Extract the variable name(s) bound or assigned on a line.
 *
 * Recognised forms (purely textual, no parsing):
 *   const/let/var X = ...                  → X
 *   const/let/var { a, b: c, ...r } = ...  → a, c, r  (the bound names, not the keys)
 *   const/let/var [ a, b = 1 ] = ...       → a, b
 *   X = ...   at the start of the line     → X        (covers Python-style assignment)
 *
 * Every `const`/`let`/`var` declaration on the line is considered, not just
 * the first one. Comparisons (`==`, `===`) and a leading arrow-function
 * parameter (`x => ...`) are not treated as assignments.
 *
 * Known gaps: only the first declarator of a comma-separated declaration is
 * seen, destructuring patterns that span several lines are only read up to
 * the end of the line, and a comma inside a default-value expression splits
 * the pattern at the wrong place.
 *
 * Returns an empty array when nothing is found — the caller can still use the
 * line for same-line sink detection, it just has nothing to propagate.
 *
 * @param {string} line
 * @returns {string[]} variable names in order of appearance, without duplicates
 */
function extractAssignedVariable(line) {
  const names = [];
  const addName = (name) => {
    if (name && !names.includes(name)) names.push(name);
  };

  // Declarations: either a destructuring pattern (group 1 = text after the
  // opening `{` / `[` up to the closing bracket or end of line) or a plain
  // identifier (group 2).
  const declarationRe = /\b(?:const|let|var)\s+(?:[{[]([^}\]]*)|(\w+))/g;
  for (const [, pattern, simpleName] of line.matchAll(declarationRe)) {
    if (simpleName !== undefined) {
      addName(simpleName);
      continue;
    }
    for (const element of pattern.split(',')) {
      // Drop a default value (`a = 1`), then keep what follows the last `:`
      // so that `key: alias` yields `alias`, the name actually bound.
      const beforeDefault = element.split('=')[0];
      const target = beforeDefault.slice(beforeDefault.lastIndexOf(':') + 1);
      // Skips the `...` of a rest element and any nested opening bracket.
      const bound = target.match(/[A-Za-z_]\w*/);
      if (bound) addName(bound[0]);
    }
  }

  // Plain assignment at the start of the line: X = ...
  // `[^=>]` rejects `==` / `===` comparisons and `=>` arrow functions.
  const assignMatch = line.match(/^\s*(\w+)\s*=[^=>]/);
  if (assignMatch) addName(assignMatch[1]);

  return names;
}

// ---------------------------------------------------------------------------
// Shell file safety check
// ---------------------------------------------------------------------------

/**
 * Decide whether a matched shell variable token names one of the well-known
 * safe variables in SHELL_SAFE_VARS.
 *
 * The token is checked verbatim first. Failing that, it is reduced to the
 * bare `$NAME` form — a `{NAME...}` group is collapsed to its name, the first
 * `{` and the first `}` are removed, and a single leading `$` is normalised —
 * and checked again.
 *
 * @param {string} token - e.g. "$HOME" or "${USER}"
 * @returns {boolean} true when the token refers to a safe variable
 */
function isShellSafeVar(token) {
  if (SHELL_SAFE_VARS.has(token)) return true;

  // ${VAR:-default} → ${VAR}
  const withoutExpansion = token.replace(/\{(\w+)[^}]*\}/, '{$1}');
  // ${VAR} → $VAR (also handles an unclosed "${VAR")
  const withoutBraces = withoutExpansion.replace(/\{/, '').replace(/\}/, '');
  const name = withoutBraces.replace(/^\$/, '');

  return SHELL_SAFE_VARS.has(`$${name}`);
}

// ---------------------------------------------------------------------------
// Per-line source/sink detection
// ---------------------------------------------------------------------------

/**
 * Find every source pattern occurrence on a line.
 *
 * The common source table is always applied. For shell files the broad
 * shell-variable pattern is applied as well, with matches that name a safe
 * built-in variable discarded.
 *
 * Results are grouped by pattern in table order (not sorted by position), so
 * the first element belongs to the earliest matching table row.
 *
 * @param {string} line
 * @param {boolean} isShell
 * @returns {Array<{ label: string, position: number }>}
 */
function detectSources(line, isShell) {
  const candidates = isShell ? [...SOURCES_COMMON, SOURCE_SHELL] : SOURCES_COMMON;
  const found = [];

  for (const candidate of candidates) {
    const filterSafeVars = isShell && candidate === SOURCE_SHELL;

    // matchAll iterates over its own copy of the regex, so the shared
    // global-flag pattern never has its lastIndex advanced.
    for (const hit of line.matchAll(candidate.pattern)) {
      if (filterSafeVars && isShellSafeVar(hit[0])) continue;
      found.push({ label: candidate.label, position: hit.index });
    }
  }

  return found;
}

/**
 * Find every sink pattern occurrence on a line.
 *
 * Results are grouped by sink in SINKS table order; within one sink they are
 * in order of position on the line.
 *
 * @param {string} line
 * @returns {Array<{ label: string, risk: string, owasp: string, position: number }>}
 */
function detectSinks(line) {
  // matchAll iterates over its own copy of the regex, so the shared
  // global-flag pattern never has its lastIndex advanced.
  return SINKS.flatMap(({ pattern, label, risk, owasp }) =>
    Array.from(line.matchAll(pattern), (hit) => ({
      label,
      risk,
      owasp,
      position: hit.index,
    }))
  );
}

// ---------------------------------------------------------------------------
// Sanitization check in a line range
// ---------------------------------------------------------------------------

/**
 * Report whether any line in the inclusive, 0-based range [fromIdx, toIdx]
 * matches SANITIZER_PATTERN. The range is clamped to the bounds of `lines`;
 * an empty range yields false.
 *
 * @param {string[]} lines
 * @param {number} fromIdx - 0-based inclusive start
 * @param {number} toIdx   - 0-based inclusive end
 * @returns {boolean} true when a sanitization keyword was seen in the range
 */
function hasSanitizationBetween(lines, fromIdx, toIdx) {
  const last = Math.min(lines.length - 1, toIdx);
  let cursor = Math.max(0, fromIdx);

  while (cursor <= last) {
    if (SANITIZER_PATTERN.test(lines[cursor])) return true;
    cursor += 1;
  }
  return false;
}

// ---------------------------------------------------------------------------
// Proximity-based severity
// ---------------------------------------------------------------------------

/**
 * Translate the line distance between a source and a sink into a base severity.
 *
 *   distance 0        → CRITICAL
 *   distance ≤ 10     → HIGH
 *   distance ≤ 50     → MEDIUM
 *   anything larger   → LOW
 *
 * @param {number} distance - number of lines between source and sink (0 = same line)
 * @returns {string} one of the SEVERITY values
 */
function distanceToSeverity(distance) {
  if (distance === 0) return SEVERITY.CRITICAL;

  // Upper bound (inclusive) of each proximity band, nearest band first.
  const bands = [
    [10, SEVERITY.HIGH],
    [50, SEVERITY.MEDIUM],
  ];
  for (const [maxDistance, severity] of bands) {
    if (distance <= maxDistance) return severity;
  }
  return SEVERITY.LOW;
}

// ---------------------------------------------------------------------------
// Tainted variable tracking
// ---------------------------------------------------------------------------

/**
 * @typedef {{ name: string, sourceLine: number, sourceLabel: string }} TaintedVar
 */

// ---------------------------------------------------------------------------
// Per-file scan
// ---------------------------------------------------------------------------

/**
 * Run the 3-pass taint analysis on a single file.
 *
 * Pass 1 — Source Detection:  Find lines with source patterns, extract assigned variable names.
 * Pass 2 — Same-line Flow:    Source and sink on the same line → CRITICAL finding
 *                              (downgraded one level if a sanitizer keyword is on that line).
 * Pass 3 — Variable-to-Sink:  For each tainted variable, search subsequent lines for its name
 *                              appearing on the same line as a sink → severity by proximity
 *                              (downgraded one level if a sanitizer keyword appears anywhere
 *                              from the source line through the sink line).
 *
 * Passes 1 and 2 share one sweep over the file. Sink detection results are
 * memoised per line, so a line is matched against the sink table at most once
 * no matter how many tainted variables mention it.
 *
 * @param {string} content   - File text
 * @param {string} absPath   - Absolute path (accepted for interface compatibility; not read here)
 * @param {string} relPath   - Relative path (used for finding output and shell-file detection)
 * @returns {ReturnType<typeof import('./lib/output.mjs').finding>[]}
 */
function scanFileContent(content, absPath, relPath) {
  const lines = content.split('\n');
  const isShell = SHELL_EXTENSIONS.has(
    (relPath.match(/\.[^.]+$/) || [''])[0].toLowerCase()
  );
  const fileFindings = [];

  // Dedup key: prevent reporting the same source+sink pair multiple times
  const reportedPairs = new Set();

  // Lazily computed sink matches per line index. Only lines that are actually
  // inspected are ever run through the sink table.
  const sinksByLine = new Map();
  const sinksOnLine = (lineIdx) => {
    let sinks = sinksByLine.get(lineIdx);
    if (sinks === undefined) {
      sinks = detectSinks(lines[lineIdx]);
      sinksByLine.set(lineIdx, sinks);
    }
    return sinks;
  };

  // ---- Pass 1: Source Detection ----
  // Collect tainted variables and same-line sink candidates in a single sweep.

  /** @type {TaintedVar[]} */
  const taintedVars = [];

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const sourceMatches = detectSources(line, isShell);
    if (sourceMatches.length === 0) continue;

    // Record every variable assigned on this source line. The label of the
    // first detected source is used for all of them.
    for (const varName of extractAssignedVariable(line)) {
      // Skip single-character names: they would match far too much later on.
      if (varName.length < 2) continue;
      taintedVars.push({ name: varName, sourceLine: lineIdx, sourceLabel: sourceMatches[0].label });
    }

    // ---- Pass 2: Same-line Source + Sink ----
    const sinkMatches = sinksOnLine(lineIdx);
    if (sinkMatches.length === 0) continue;

    // Same-line: CRITICAL, downgraded once if a sanitizer keyword is on the line.
    // The result is identical for every source/sink pair on this line.
    const severity = hasSanitizationBetween(lines, lineIdx, lineIdx)
      ? downgradeSeverity(SEVERITY.CRITICAL)
      : SEVERITY.CRITICAL;

    for (const src of sourceMatches) {
      for (const sink of sinkMatches) {
        const pairKey = `sameline:${lineIdx}:${src.label}:${sink.label}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${src.label} flows directly to ${sink.label} (same line)`,
            description:
              `Untrusted data from source \`${src.label}\` appears on the same line as ` +
              `dangerous sink \`${sink.label}\` (${sink.risk}). ` +
              `Same-line flow is a strong indicator of unsanitized data reaching a dangerous operation.`,
            file: relPath,
            line: lineIdx + 1,
            evidence: `source \`${src.label}\` at line ${lineIdx + 1} flows to \`${sink.label}\` at line ${lineIdx + 1} (same-line)`,
            owasp: sink.owasp,
            recommendation:
              'Validate/sanitize data before passing to sink. Consider using parameterized queries, allowlists, or safe APIs.',
          })
        );
      }
    }
  }

  // ---- Pass 3: Variable-to-Sink ----
  // For each tainted variable, look at every line after its source line and
  // report when the line contains both
  //   (a) the variable name as a whole word, and
  //   (b) a sink pattern.
  // Co-occurrence on a line is the only check; whether the variable is really
  // an argument of the sink is not verified.

  for (const taintedVar of taintedVars) {
    // Word boundaries avoid substring hits (e.g. "cmd" inside "cmdLine").
    const varRe = new RegExp(`\\b${escapeRegex(taintedVar.name)}\\b`);

    for (let lineIdx = taintedVar.sourceLine + 1; lineIdx < lines.length; lineIdx++) {
      // Cheap single-regex check first; the sink table is only consulted for
      // lines that mention the variable.
      if (!varRe.test(lines[lineIdx])) continue;

      const sinkMatches = sinksOnLine(lineIdx);
      if (sinkMatches.length === 0) continue;

      // Distance, sanitization and severity depend only on the source and
      // sink line numbers, so compute them once per line rather than per sink.
      const distance = lineIdx - taintedVar.sourceLine;
      const sanitized = hasSanitizationBetween(lines, taintedVar.sourceLine, lineIdx);
      const baseSeverity = distanceToSeverity(distance);
      const severity = sanitized ? downgradeSeverity(baseSeverity) : baseSeverity;

      // The description must agree with the severity decision: only claim
      // that no sanitization was seen when that is actually the case.
      const sanitizationNote = sanitized
        ? 'A sanitization keyword was detected between source and sink, so severity was ' +
          'downgraded one level; confirm that it actually sanitizes this value.'
        : 'No recognized sanitization was detected between source and sink.';

      for (const sink of sinkMatches) {
        const pairKey = `var:${relPath}:${taintedVar.name}:${taintedVar.sourceLine}:${sink.label}:${lineIdx}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${taintedVar.sourceLabel} → ${taintedVar.name} → ${sink.label}`,
            description:
              `Variable \`${taintedVar.name}\` is assigned from untrusted source ` +
              `\`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `and flows into dangerous sink \`${sink.label}\` (${sink.risk}) ` +
              `at line ${lineIdx + 1} (${distance} line${distance === 1 ? '' : 's'} away). ` +
              sanitizationNote,
            file: relPath,
            line: lineIdx + 1,
            evidence:
              `source \`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `flows to \`${sink.label}\` at line ${lineIdx + 1} ` +
              `via variable \`${taintedVar.name}\``,
            owasp: sink.owasp,
            recommendation:
              'Validate/sanitize data before passing to sink. Consider using parameterized queries, allowlists, or safe APIs.',
          })
        );
      }
    }
  }

  return fileFindings;
}

// ---------------------------------------------------------------------------
// Utility: escape regex special characters in a variable name
// ---------------------------------------------------------------------------

/**
 * Backslash-escape every regex metacharacter in `str` so the result matches
 * the original text literally when embedded in a RegExp.
 *
 * @param {string} str - Literal text.
 * @returns {string} `str` with each of . * + ? ^ $ { } ( ) | [ ] \ preceded by a backslash.
 */
function escapeRegex(str) {
  const metachars = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metachars, (ch) => `\\${ch}`);
}

// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------

/**
 * Scan a target path for taint flows from untrusted sources to dangerous sinks.
 *
 * Only files whose extension is in CODE_EXTENSIONS are processed
 * (.js, .mjs, .cjs, .ts, .mts, .cts, .jsx, .tsx, .py, .pyw, .rb, .php, .go,
 * .rs, .java, .cs, .kt, .kts, .groovy, .gradle, .scala, .sh, .bash, .zsh).
 * The extension is compared case-insensitively. All other files in the
 * discovery set are skipped silently, as are files for which readTextFile
 * returns null.
 *
 * Any exception ends the scan early; findings collected up to that point are
 * returned in an 'error' result together with the error message.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root).
 *   Not read here; the file list comes from `discovery`.
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   Pre-computed file discovery result from the orchestrator.
 * @returns {Promise<object>} Scanner result envelope (see lib/output.mjs::scannerResult)
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      // Only scan code files. CODE_EXTENSIONS holds lowercase entries, so
      // normalise first; a missing ext simply fails the lookup.
      const ext = String(fileInfo.ext ?? '').toLowerCase();
      if (!CODE_EXTENSIONS.has(ext)) continue;

      const content = await readTextFile(fileInfo.absPath);

      // readTextFile returns null for binary files or unreadable paths
      if (content === null) continue;

      filesScanned++;

      // Append one by one: spreading a very large array into push() can
      // exceed the engine's argument-count limit.
      for (const fileFinding of scanFileContent(content, fileInfo.absPath, fileInfo.relPath)) {
        allFindings.push(fileFinding);
      }
    }

    return scannerResult('taint-tracer', 'ok', allFindings, filesScanned, Date.now() - startMs);
  } catch (err) {
    return scannerResult(
      'taint-tracer',
      'error',
      allFindings,
      filesScanned,
      Date.now() - startMs,
      String(err?.message || err)
    );
  }
}