// taint-tracer.mjs — Deterministic taint analysis: traces untrusted data from sources to dangerous sinks
// Zero dependencies (Node.js builtins only via lib helpers).
//
// LIMITATIONS (read before interpreting results):
//   ~70% recall, ~50-70% precision for medium findings.
//   - No scope awareness: a variable named `input` in one function taints all uses across the file.
//   - No cross-file tracing: taint does not propagate across module boundaries.
//   - No closure / callback analysis: reassignment inside closures is not tracked.
//   - No data-flow through arrays or object properties (e.g., `obj.field = userInput`).
//   - Sanitization suppression is keyword-based; adversarial code can evade it.
//   - Shell variable pattern ($VAR) is very broad in .sh/.bash/.zsh files — expect FPs.
//   - Same-line source+sink detection is approximate; unrelated code on the same line may trigger.
//
// References:
//   - OWASP LLM01 (Prompt Injection — injection sinks: eval, exec, SQL queries)
//   - OWASP LLM02 (Sensitive Info Disclosure — exfiltration sinks: fetch, .post, .send)
//   - skill-threat-patterns.md: toolchain manipulation, persistence patterns

import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';

// ---------------------------------------------------------------------------
// File extension filter — only scan code files, not config/docs
// JVM-language support (.kt, .kts, .groovy, .gradle, .scala) is required for
// JetBrains plugin scanning — plugin source lives in these languages.
// ---------------------------------------------------------------------------

const CODE_EXTENSIONS = new Set([
  '.js', '.mjs', '.cjs', '.ts', '.mts', '.cts', '.jsx', '.tsx',
  '.py', '.pyw',
  '.rb', '.php', '.go', '.rs', '.java', '.cs',
  '.kt', '.kts', '.groovy', '.gradle', '.scala',
  '.sh', '.bash', '.zsh',
]);

const SHELL_EXTENSIONS = new Set(['.sh', '.bash', '.zsh']);

// ---------------------------------------------------------------------------
// Source patterns — untrusted / externally controlled data origins
// ---------------------------------------------------------------------------
// NOTE: Shell variable pattern ($VAR) is intentionally only applied in SHELL_EXTENSIONS.
// Applying it to JS/TS would produce massive false-positive rates.

const SOURCES_COMMON = [
  // Node.js / JavaScript
  { pattern: /process\.env\[?/g, label: 'process.env' },
  { pattern: /process\.argv/g, label: 'process.argv' },
  { pattern: /req\.body/g, label: 'req.body' },
  { pattern: /req\.query/g, label: 'req.query' },
  { pattern: /req\.params/g, label: 'req.params' },
  { pattern: /req\.headers/g, label: 'req.headers' },
  { pattern: /request\.body/g, label: 'request.body' },
  { pattern: /request\.form/g, label: 'request.form' },
  { pattern: /tool_input/g, label: 'tool_input' },
  { pattern: /user_input/g, label: 'user_input' },
  { pattern: /\$ARGUMENTS/g, label: '$ARGUMENTS' },
  { pattern: /\bstdin\b/g, label: 'stdin' },
  // Python
  { pattern: /os\.environ/g, label: 'os.environ' },
  { pattern: /sys\.argv/g, label: 'sys.argv' },
  { pattern: /\binput\s*\(/g, label: 'input()' },
  { pattern: /request\.args/g, label: 'request.args' },
  { pattern: /request\.json/g, label: 'request.json' },
];

// Shell-only source: $VARIABLE references (excluding safe well-known vars)
const SOURCE_SHELL = { pattern: /\$\{?\w+\}?/g, label: 'shell variable' };

// Shell vars that are virtually always safe — suppress false positives
const SHELL_SAFE_VARS = new Set([
  '$HOME', '$PATH', '$USER', '$PWD', '$SHELL', '$IFS', '$0', '$#',
  '${HOME}', '${PATH}', '${USER}', '${PWD}', '${SHELL}',
]);

// ---------------------------------------------------------------------------
// Sink patterns — dangerous operations that could lead to injection/exfiltration
// ---------------------------------------------------------------------------
// Each sink carries a `risk` label and a preferred OWASP mapping:
//   injection    → LLM01
//   exfiltration → LLM02

const SINKS = [
  // Code / command execution (injection risk → LLM01)
  { pattern: /\beval\s*\(/g, label: 'eval()', risk: 'code execution', owasp: 'LLM01' },
  { pattern: /\bexec\s*\(/g, label: 'exec()', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /\bexecSync\s*\(/g, label: 'execSync()', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /\bspawn\s*\(/g, label: 'spawn()', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /\bspawnSync\s*\(/g, label: 'spawnSync()', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /child_process/g, label: 'child_process', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /new\s+Function\s*\(/g, label: 'new Function()', risk: 'code execution', owasp: 'LLM01' },
  { pattern: /\bsubprocess\./g, label: 'subprocess', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /os\.system\s*\(/g, label: 'os.system()', risk: 'command execution', owasp: 'LLM01' },
  { pattern: /os\.popen\s*\(/g, label: 'os.popen()', risk: 'command execution', owasp: 'LLM01' },
  // File system writes (could be used to persist injected content)
  { pattern: /writeFile\s*\(/g, label: 'writeFile()', risk: 'file write', owasp: 'LLM01' },
  { pattern: /writeFileSync\s*\(/g, label: 'writeFileSync()', risk: 'file write', owasp: 'LLM01' },
  { pattern: /\bappendFile/g, label: 'appendFile()', risk: 'file write', owasp: 'LLM01' },
  { pattern: /createWriteStream/g, label: 'createWriteStream()', risk: 'file write', owasp: 'LLM01' },
  { pattern: /open\s*\(.*['"]w/g, label: 'open(w)', risk: 'file write', owasp: 'LLM01' },
  // Network / exfiltration (data leaving the process → LLM02)
  { pattern: /\bfetch\s*\(/g, label: 'fetch()', risk: 'network request', owasp: 'LLM02' },
  { pattern: /\.send\s*\(/g, label: '.send()', risk: 'data exfiltration', owasp: 'LLM02' },
  { pattern: /\.post\s*\(/g, label: '.post()', risk: 'data exfiltration', owasp: 'LLM02' },
  { pattern: /XMLHttpRequest/g, label: 'XMLHttpRequest', risk: 'network request', owasp: 'LLM02' },
  { pattern: /WebSocket/g, label: 'WebSocket', risk: 'network connection', owasp: 'LLM02' },
  // Database (SQL injection → LLM01)
  { pattern: /\.query\s*\(/g, label: '.query()', risk: 'SQL injection', owasp: 'LLM01' },
  { pattern: /\.execute\s*\(/g, label: '.execute()', risk: 'SQL injection', owasp: 'LLM01' },
  { pattern: /\.raw\s*\(/g, label: '.raw()', risk: 'raw query', owasp: 'LLM01' },
  // HTML / DOM injection (XSS → LLM01 in agentic browser contexts)
  { pattern: /innerHTML\s*=/g, label: 'innerHTML', risk: 'XSS', owasp: 'LLM01' },
  { pattern: /document\.write\s*\(/g, label: 'document.write()', risk: 'XSS', owasp: 'LLM01' },
  { pattern: /dangerouslySetInnerHTML/g, label: 'dangerouslySetInnerHTML', risk: 'XSS', owasp: 'LLM01' },
];

// ---------------------------------------------------------------------------
// Sanitization suppression keywords
// ---------------------------------------------------------------------------
// If any of these appear on a line between a source and a sink (inclusive),
// severity is downgraded by one level. This is a heuristic — skilled attackers
// can bypass it by naming variables after safe functions.
// No `g` flag on purpose: `.test()` on this regex must stay stateless.

const SANITIZER_PATTERN =
  /sanitize|escape|validate|parseInt|Number\s*\(|path\.resolve|path\.join|encodeURI|encodeURIComponent|DOMPurify|\.strip\s*\(|\.clean\s*\(|\.filter\s*\(|whitelist|allowlist/i;

// ---------------------------------------------------------------------------
// Severity ordering utilities
// ---------------------------------------------------------------------------

// Most severe first; downgrading means moving one index to the right.
const SEVERITY_ORDER = [
  SEVERITY.CRITICAL,
  SEVERITY.HIGH,
  SEVERITY.MEDIUM,
  SEVERITY.LOW,
  SEVERITY.INFO,
];

/**
 * Return the severity one step lower than the given one.
 * INFO cannot be reduced further; a value not present in SEVERITY_ORDER is
 * returned unchanged.
 *
 * @param {string} sev
 * @returns {string}
 */
function downgradeSeverity(sev) {
  const idx = SEVERITY_ORDER.indexOf(sev);
  if (idx < 0) return sev;
  return SEVERITY_ORDER[Math.min(idx + 1, SEVERITY_ORDER.length - 1)];
}

// ---------------------------------------------------------------------------
// Variable name extraction helpers
// ---------------------------------------------------------------------------

/**
 * Find the index of the bracket that closes the one at `openIdx`.
 * Counts `{`, `[` and `(` together and ignores bracket characters inside
 * quoted strings, so default values such as `= {}` or `= "}"` do not end the
 * scan early.
 *
 * @param {string} text
 * @param {number} openIdx - index of the opening bracket
 * @returns {number} index of the matching closer, or -1 if the text ends first
 */
function findClosingBracket(text, openIdx) {
  let depth = 0;
  let quote = null;
  for (let i = openIdx; i < text.length; i++) {
    const ch = text[i];
    if (quote !== null) {
      if (ch === '\\') i++; // skip the escaped character
      else if (ch === quote) quote = null;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === '`') {
      quote = ch;
    } else if (ch === '{' || ch === '[' || ch === '(') {
      depth++;
    } else if (ch === '}' || ch === ']' || ch === ')') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Advance past a default-value expression. Scanning starts just after the `=`
 * and stops at the next comma that is not nested inside brackets or a string.
 *
 * @param {string} text
 * @param {number} from - index of the first character after `=`
 * @returns {number} index of the terminating comma, or text.length
 */
function skipDefaultValue(text, from) {
  let depth = 0;
  let quote = null;
  let i = from;
  for (; i < text.length; i++) {
    const ch = text[i];
    if (quote !== null) {
      if (ch === '\\') i++;
      else if (ch === quote) quote = null;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === '`') quote = ch;
    else if (ch === '{' || ch === '[' || ch === '(') depth++;
    else if (ch === '}' || ch === ']' || ch === ')') depth--;
    else if (ch === ',' && depth <= 0) break;
  }
  return i;
}

/**
 * Locate a `const|let|var` destructuring pattern that is directly followed by
 * an assignment `=` (not `==`) and return its text including outer brackets.
 *
 * @param {string} line
 * @returns {string|null} e.g. "{ a, b: { c } }", or null when none is found
 */
function findDestructuringPattern(line) {
  for (const m of line.matchAll(/\b(?:const|let|var)\s+(?=[{[])/g)) {
    const openIdx = m.index + m[0].length;
    const closeIdx = findClosingBracket(line, openIdx);
    if (closeIdx < 0) continue;
    // `for (const [k, v] of ...)` and similar have no `=` after the pattern.
    if (/^\s*=[^=]/.test(line.slice(closeIdx + 1))) {
      return line.slice(openIdx, closeIdx + 1);
    }
  }
  return null;
}

/**
 * Attempt to extract the variable name(s) being assigned on a source line.
 * Handles:
 *   const/let/var X =           (plain decl)
 *   X =                         (plain assignment at line start)
 *   const { x } =               (object destructuring)
 *   const { x, y } =            (multi-key)
 *   const { secret: alias } =   (renamed)
 *   const { a, ...spread } =    (object rest)
 *   const { a, b: { c } } =     (nested object)
 *   const { a = dflt } =        (defaults — the default expression is ignored)
 *   const [a, b] =              (array destructuring)
 *   const [first, ...rest] =    (array rest)
 *   const [a, [b, c]] =         (nested array)
 *
 * Not handled: Python-style annotations / YAML-ish `X: value` with no `=`,
 * compound assignment (`+=`), and patterns that span several lines.
 *
 * Implementation: regex plus a small bracket scanner, no full JS parser — the
 * goal is best-effort, not soundness. Untracked variables fall back to
 * same-line sink detection (no propagation).
 *
 * @param {string} line
 * @returns {string[]} variable names (may be empty)
 */
export function extractAssignedVariable(line) {
  const names = new Set();

  const destructPattern = findDestructuringPattern(line);
  if (destructPattern !== null) {
    extractDestructuredNames(destructPattern, names);
  } else {
    // Plain declaration: const/let/var X = ...
    const declMatch = line.match(/\b(?:const|let|var)\s+(\w+)\s*=/);
    if (declMatch) names.add(declMatch[1]);
  }

  // Plain assignment at line start: X = ... (no keyword).
  // `[^=>]` rejects `==`/`===` and arrow functions (`x => ...`), whose
  // parameter is not being assigned from anything on this line.
  const assignMatch = line.match(/^\s*(\w+)\s*=[^=>]/);
  if (assignMatch) names.add(assignMatch[1]);

  return [...names];
}

/**
 * Walk a destructuring pattern body (the `{...}` or `[...]` after the
 * `const`/`let`/`var` keyword and before `=`) and add every bound
 * identifier to `names`. Handles nested patterns, renamed keys, rest
 * elements and default values.
 *
 * Token walk — at each position consume one of:
 *   - `{ ... }` or `[ ... ]`  recurse into the nested pattern
 *   - `= default`             skip to the next top-level comma, so identifiers
 *                             and literals inside the default are never bound
 *   - quoted string           a quoted property key; skipped
 *   - `...`                   rest marker; the following ident is bound
 *   - `key:`                  property key; skipped, the target after `:` is
 *                             picked up on a later iteration
 *   - `ident`                 bound directly (shorthand or array element)
 *   - separators              `,`, `:`, whitespace
 * Every branch advances by at least one character, so the loop terminates.
 *
 * @param {string} pattern - The body including the outer brackets.
 * @param {Set} names - Set of strings; mutated.
 */
function extractDestructuredNames(pattern, names) {
  // Strip outer brackets so we focus on contents.
  const inner = pattern.slice(1, -1);

  let i = 0;
  while (i < inner.length) {
    const ch = inner[i];

    if (ch === '{' || ch === '[') {
      const closeIdx = findClosingBracket(inner, i);
      if (closeIdx < 0) {
        // Unclosed nested pattern: append a synthetic closer so the recursive
        // slice(1, -1) does not eat the last real character.
        const closer = ch === '{' ? '}' : ']';
        extractDestructuredNames(inner.slice(i) + closer, names);
        break;
      }
      extractDestructuredNames(inner.slice(i, closeIdx + 1), names);
      i = closeIdx + 1;
      continue;
    }

    if (ch === '=') {
      i = skipDefaultValue(inner, i + 1);
      continue;
    }

    if (ch === '"' || ch === "'" || ch === '`') {
      // Quoted property key such as { 'a-b': alias } — never a binding.
      const endQuote = inner.indexOf(ch, i + 1);
      i = endQuote < 0 ? inner.length : endQuote + 1;
      continue;
    }

    if (ch === ',' || ch === ':' || /\s/.test(ch)) {
      i++;
      continue;
    }

    if (inner.startsWith('...', i)) {
      i += 3;
      continue;
    }

    const idMatch = /^\w+/.exec(inner.slice(i));
    if (!idMatch) {
      i++;
      continue;
    }
    const ident = idMatch[0];
    const afterIdent = i + ident.length;

    // Look past whitespace: a following `:` means this ident is only a key.
    let k = afterIdent;
    while (k < inner.length && /\s/.test(inner[k])) k++;
    if (inner[k] === ':') {
      i = k + 1;
      continue;
    }

    names.add(ident);
    i = afterIdent;
  }
}

// ---------------------------------------------------------------------------
// Shell file safety check
// ---------------------------------------------------------------------------

/**
 * In shell files, check whether a matched shell variable token is a safe built-in.
 *
 * @param {string} token - e.g. "$HOME" or "${USER}"
 * @returns {boolean}
 */
function isShellSafeVar(token) {
  // Normalize: strip the part after the variable name in ${VAR:-default}
  // patterns, then drop the braces so "${HOME" / "${HOME}" both become "$HOME".
  const normalized = token
    .replace(/\{(\w+)[^}]*\}/, '{$1}')
    .replace(/\{/, '')
    .replace(/\}/, '');
  const bare = '$' + normalized.replace(/^\$/, '');
  return SHELL_SAFE_VARS.has(token) || SHELL_SAFE_VARS.has(bare);
}

// ---------------------------------------------------------------------------
// Per-line source/sink detection
// ---------------------------------------------------------------------------

/**
 * Check if a line contains a source pattern.
 * Returns all matches: { label, position }.
 * For shell files, the broad `$VAR` pattern is added and safe built-in
 * variables are skipped.
 *
 * @param {string} line
 * @param {boolean} isShell
 * @returns {{ label: string, position: number }[]}
 */
function detectSources(line, isShell) {
  const sources = isShell ? [...SOURCES_COMMON, SOURCE_SHELL] : SOURCES_COMMON;
  const matches = [];

  for (const src of sources) {
    // matchAll works on an internal copy of the regex, so the shared
    // /g pattern objects never carry lastIndex state between lines.
    for (const m of line.matchAll(src.pattern)) {
      if (src === SOURCE_SHELL && isShellSafeVar(m[0])) continue;
      matches.push({ label: src.label, position: m.index });
    }
  }

  return matches;
}

/**
 * Check if a line contains a sink pattern.
 * Returns all matches: { label, risk, owasp, position }.
 *
 * @param {string} line
 * @returns {{ label: string, risk: string, owasp: string, position: number }[]}
 */
function detectSinks(line) {
  const matches = [];

  for (const sink of SINKS) {
    for (const m of line.matchAll(sink.pattern)) {
      matches.push({
        label: sink.label,
        risk: sink.risk,
        owasp: sink.owasp,
        position: m.index,
      });
    }
  }

  return matches;
}

// ---------------------------------------------------------------------------
// Sanitization check in a line range
// ---------------------------------------------------------------------------

/**
 * Check whether any line in [fromIdx, toIdx] (0-indexed, inclusive) contains
 * a sanitization keyword. If so, caller should downgrade severity.
 * Out-of-range indices are clamped to the bounds of `lines`.
 *
 * @param {string[]} lines
 * @param {number} fromIdx - 0-based inclusive start
 * @param {number} toIdx - 0-based inclusive end
 * @returns {boolean}
 */
function hasSanitizationBetween(lines, fromIdx, toIdx) {
  const start = Math.max(0, fromIdx);
  const end = Math.min(lines.length - 1, toIdx);
  for (let i = start; i <= end; i++) {
    if (SANITIZER_PATTERN.test(lines[i])) return true;
  }
  return false;
}

// ---------------------------------------------------------------------------
// Proximity-based severity
// ---------------------------------------------------------------------------

/**
 * Map line distance between source and sink to a base severity.
 *   same line (dist 0) → CRITICAL
 *   within 10 lines    → HIGH
 *   within 50 lines    → MEDIUM
 *   beyond 50 lines    → LOW
 *
 * @param {number} distance - number of lines between source and sink (0 = same line)
 * @returns {string}
 */
function distanceToSeverity(distance) {
  if (distance === 0) return SEVERITY.CRITICAL;
  if (distance <= 10) return SEVERITY.HIGH;
  if (distance <= 50) return SEVERITY.MEDIUM;
  return SEVERITY.LOW;
}

// ---------------------------------------------------------------------------
// Tainted variable tracking
// ---------------------------------------------------------------------------

/**
 * @typedef {{ name: string, sourceLine: number, sourceLabel: string }} TaintedVar
 */

// Shared remediation text for every finding emitted by this scanner.
const TAINT_RECOMMENDATION =
  'Validate/sanitize data before passing to sink. Consider using parameterized queries, allowlists, or safe APIs.';

// ---------------------------------------------------------------------------
// Per-file scan
// ---------------------------------------------------------------------------

/**
 * Run the 3-pass taint analysis on a single file.
 *
 * Pass 1 — Source Detection: Find lines with source patterns, extract assigned variable names.
 * Pass 2 — Same-line Flow: Source and sink on the same line → CRITICAL finding.
 * Pass 3 — Variable-to-Sink: For each tainted variable, search subsequent lines for its name
 *           appearing on the same line as a sink → severity by proximity.
 *
 * In passes 2 and 3 a sanitization keyword in the relevant line range lowers
 * the severity by one level, and the finding description says so.
 *
 * @param {string} content - File text
 * @param {string} absPath - Absolute path (accepted for interface stability; currently unused)
 * @param {string} relPath - Relative path (for finding output and shell detection)
 * @returns {object[]} findings as built by `finding()` from lib/output.mjs
 */
function scanFileContent(content, absPath, relPath) {
  // Accept both LF and CRLF so lines never carry a trailing '\r'.
  const lines = content.split(/\r?\n/);
  const ext = (relPath.match(/\.[^.]+$/) || [''])[0].toLowerCase();
  const isShell = SHELL_EXTENSIONS.has(ext);

  const fileFindings = [];

  // Dedup key: prevent reporting the same source+sink pair multiple times
  const reportedPairs = new Set();

  // Sink detection per line is independent of which variable we are tracing,
  // so compute it at most once per line and reuse it across passes.
  const sinkCache = new Map();
  const sinksAt = (idx) => {
    let cached = sinkCache.get(idx);
    if (cached === undefined) {
      cached = detectSinks(lines[idx]);
      sinkCache.set(idx, cached);
    }
    return cached;
  };

  // ---- Pass 1: Source Detection ----
  // Collect tainted variables and same-line sink candidates in a single sweep.

  /** @type {TaintedVar[]} */
  const taintedVars = [];

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const sourceMatches = detectSources(line, isShell);
    if (sourceMatches.length === 0) continue;

    // Extract variable(s) being assigned on this source line
    for (const varName of extractAssignedVariable(line)) {
      // Skip very short names that would produce noise
      if (varName.length < 2) continue;
      taintedVars.push({
        name: varName,
        sourceLine: lineIdx,
        sourceLabel: sourceMatches[0].label,
      });
    }

    // ---- Pass 2: Same-line Source + Sink ----
    const sinkMatches = sinksAt(lineIdx);
    if (sinkMatches.length === 0) continue;

    const sanitizedHere = hasSanitizationBetween(lines, lineIdx, lineIdx);
    const humanLine = lineIdx + 1;

    for (const src of sourceMatches) {
      for (const sink of sinkMatches) {
        const pairKey = `sameline:${lineIdx}:${src.label}:${sink.label}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        // Same-line: CRITICAL, downgraded once if a sanitizer keyword is present
        const severity = sanitizedHere
          ? downgradeSeverity(SEVERITY.CRITICAL)
          : SEVERITY.CRITICAL;

        const closing = sanitizedHere
          ? 'A sanitization keyword appears on this line, so severity was downgraded one level; ' +
            'verify that it actually covers this value.'
          : 'Same-line flow is a strong indicator of unsanitized data reaching a dangerous operation.';

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${src.label} flows directly to ${sink.label} (same line)`,
            description:
              `Untrusted data from source \`${src.label}\` appears on the same line as ` +
              `dangerous sink \`${sink.label}\` (${sink.risk}). ` +
              closing,
            file: relPath,
            line: humanLine,
            evidence: `source \`${src.label}\` at line ${humanLine} flows to \`${sink.label}\` at line ${humanLine} (same-line)`,
            owasp: sink.owasp,
            recommendation: TAINT_RECOMMENDATION,
          })
        );
      }
    }
  }

  // ---- Pass 3: Variable-to-Sink ----
  // For each tainted variable, scan every line after the source line for:
  //   (a) the tainted variable name as a word token, AND
  //   (b) a sink pattern on the same line.
  // This catches the most common real-world shape: the variable passed as an
  // argument to a sink call.

  for (const taintedVar of taintedVars) {
    // Word-boundary match so "cmd" does not match "cmdLine".
    const varRe = new RegExp(`\\b${escapeRegex(taintedVar.name)}\\b`);

    for (let lineIdx = taintedVar.sourceLine + 1; lineIdx < lines.length; lineIdx++) {
      if (!varRe.test(lines[lineIdx])) continue;

      const sinkMatches = sinksAt(lineIdx);
      if (sinkMatches.length === 0) continue;

      const distance = lineIdx - taintedVar.sourceLine;
      // Sanitization suppression: scan lines from source through sink
      const sanitized = hasSanitizationBetween(lines, taintedVar.sourceLine, lineIdx);
      const baseSeverity = distanceToSeverity(distance);
      const severity = sanitized ? downgradeSeverity(baseSeverity) : baseSeverity;

      const closing = sanitized
        ? 'A sanitization keyword was found between source and sink, so severity was downgraded one level; ' +
          'verify that it actually covers this value.'
        : 'No recognized sanitization was detected between source and sink.';

      for (const sink of sinkMatches) {
        const pairKey = `var:${relPath}:${taintedVar.name}:${taintedVar.sourceLine}:${sink.label}:${lineIdx}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${taintedVar.sourceLabel} → ${taintedVar.name} → ${sink.label}`,
            description:
              `Variable \`${taintedVar.name}\` is assigned from untrusted source ` +
              `\`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `and flows into dangerous sink \`${sink.label}\` (${sink.risk}) ` +
              `at line ${lineIdx + 1} (${distance} line${distance === 1 ? '' : 's'} away). ` +
              closing,
            file: relPath,
            line: lineIdx + 1,
            evidence:
              `source \`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `flows to \`${sink.label}\` at line ${lineIdx + 1} ` +
              `via variable \`${taintedVar.name}\``,
            owasp: sink.owasp,
            recommendation: TAINT_RECOMMENDATION,
          })
        );
      }
    }
  }

  return fileFindings;
}

// ---------------------------------------------------------------------------
// Utility: escape regex special characters in a variable name
// ---------------------------------------------------------------------------

/**
 * Escape regex metacharacters in a literal string so it can be embedded in a RegExp.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------

/**
 * Scan a target path for taint flows from untrusted sources to dangerous sinks.
 *
 * Only processes code files, i.e. those whose extension is in CODE_EXTENSIONS
 * (.js, .mjs, .cjs, .ts, .mts, .cts, .jsx, .tsx, .py, .pyw, .rb, .php, .go,
 * .rs, .java, .cs, .kt, .kts, .groovy, .gradle, .scala, .sh, .bash, .zsh).
 * The extension is compared case-insensitively. All other files in the
 * discovery set are skipped silently.
 *
 * If anything throws mid-scan, the result has status 'error' and carries the
 * findings collected up to that point plus the error message.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root)
 * @param {{ files: { absPath: string, relPath: string, ext: string, size: number }[] }} discovery
 *   Pre-computed file discovery result from the orchestrator.
 * @returns {Promise} Resolves to the scanner result envelope (see lib/output.mjs::scannerResult)
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      // Only scan code files
      if (!CODE_EXTENSIONS.has(fileInfo.ext?.toLowerCase())) continue;

      const content = await readTextFile(fileInfo.absPath);
      // readTextFile returns null for binary files or unreadable paths
      if (content === null) continue;

      filesScanned++;
      allFindings.push(...scanFileContent(content, fileInfo.absPath, fileInfo.relPath));
    }

    return scannerResult('taint-tracer', 'ok', allFindings, filesScanned, Date.now() - startMs);
  } catch (err) {
    return scannerResult(
      'taint-tracer',
      'error',
      allFindings,
      filesScanned,
      Date.now() - startMs,
      String(err?.message || err)
    );
  }
}