ktg-plugin-marketplace/plugins/llm-security/scanners/taint-tracer.mjs

532 lines
22 KiB
JavaScript

// taint-tracer.mjs — Deterministic taint analysis: traces untrusted data from sources to dangerous sinks
// Zero dependencies (Node.js builtins only via lib helpers).
//
// LIMITATIONS (read before interpreting results):
// ~70% recall, ~50-70% precision for medium findings.
// - No scope awareness: a variable named `input` in one function taints all uses across the file.
// - No cross-file tracing: taint does not propagate across module boundaries.
// - No closure / callback analysis: reassignment inside closures is not tracked.
// - No data-flow through arrays or object properties (e.g., `obj.field = userInput`).
// - Sanitization suppression is keyword-based; adversarial code can evade it.
// - Shell variable pattern ($VAR) is very broad in .sh/.bash/.zsh files — expect FPs.
// - Same-line source+sink detection is approximate; unrelated code on the same line may trigger.
//
// References:
// - OWASP LLM01 (Prompt Injection — injection sinks: eval, exec, SQL queries)
// - OWASP LLM02 (Sensitive Info Disclosure — exfiltration sinks: fetch, .post, .send)
// - skill-threat-patterns.md: toolchain manipulation, persistence patterns
import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
// ---------------------------------------------------------------------------
// File extension filter — only scan code files, not config/docs
// JVM-language support (.kt, .kts, .groovy, .gradle, .scala) is required for
// JetBrains plugin scanning — plugin source lives in these languages.
// ---------------------------------------------------------------------------
/**
 * File extensions (leading dot, lowercase) that this scanner treats as code.
 * `scan` skips every discovered file whose `ext` is not a member of this set.
 * @type {Set<string>}
 */
const CODE_EXTENSIONS = new Set([
'.js', '.mjs', '.cjs',
'.ts', '.mts', '.cts',
'.jsx', '.tsx',
'.py', '.pyw',
'.rb', '.php',
'.go', '.rs',
'.java', '.cs',
'.kt', '.kts',
'.groovy', '.gradle',
'.scala',
'.sh', '.bash', '.zsh',
]);
/**
 * Subset of CODE_EXTENSIONS identifying shell scripts. `scanFileContent` checks
 * the file's extension against this set to decide whether the shell-only
 * source pattern is enabled for that file.
 * @type {Set<string>}
 */
const SHELL_EXTENSIONS = new Set(['.sh', '.bash', '.zsh']);
// ---------------------------------------------------------------------------
// Source patterns — untrusted / externally controlled data origins
// ---------------------------------------------------------------------------
// NOTE: Shell variable pattern ($VAR) is intentionally only applied in SHELL_EXTENSIONS.
// Applying it to JS/TS would produce massive false-positive rates.
/**
 * Source patterns applied to every scanned file, whatever its language.
 * Each entry pairs a global (`g`) regex with the label used in findings.
 *
 * Order matters: `detectSources` reports matches in table order, and
 * `scanFileContent` attributes a tainted variable to the first reported match,
 * so an earlier entry wins when several sources appear on one line.
 *
 * The regexes carry the `g` flag so that every occurrence on a line can be
 * enumerated; `detectSources` clones them before use so `lastIndex` state on
 * these shared objects is never advanced.
 *
 * @type {Array<{ pattern: RegExp, label: string }>}
 */
const SOURCES_COMMON = [
// Node.js / JavaScript
{ pattern: /process\.env\[?/g, label: 'process.env' },
{ pattern: /process\.argv/g, label: 'process.argv' },
{ pattern: /req\.body/g, label: 'req.body' },
{ pattern: /req\.query/g, label: 'req.query' },
{ pattern: /req\.params/g, label: 'req.params' },
{ pattern: /req\.headers/g, label: 'req.headers' },
{ pattern: /request\.body/g, label: 'request.body' },
{ pattern: /request\.form/g, label: 'request.form' },
{ pattern: /tool_input/g, label: 'tool_input' },
{ pattern: /user_input/g, label: 'user_input' },
{ pattern: /\$ARGUMENTS/g, label: '$ARGUMENTS' },
{ pattern: /\bstdin\b/g, label: 'stdin' },
// Python
{ pattern: /os\.environ/g, label: 'os.environ' },
{ pattern: /sys\.argv/g, label: 'sys.argv' },
{ pattern: /\binput\s*\(/g, label: 'input()' },
{ pattern: /request\.args/g, label: 'request.args' },
{ pattern: /request\.json/g, label: 'request.json' },
];
// Shell-only source: $VARIABLE references (excluding safe well-known vars).
// The regex matches `$NAME` and `${NAME}`. Because the closing brace is optional
// and `\w+` stops at the first non-word character, an expansion such as
// `${NAME:-default}` is matched only up to `${NAME`.
const SOURCE_SHELL = { pattern: /\$\{?\w+\}?/g, label: 'shell variable' };
// Shell vars that are virtually always safe — suppress false positives.
// Both the bare (`$HOME`) and braced (`${HOME}`) spellings are listed;
// `isShellSafeVar` additionally normalizes a token to its bare form before lookup.
const SHELL_SAFE_VARS = new Set([
'$HOME', '$PATH', '$USER', '$PWD', '$SHELL', '$IFS', '$0', '$#',
'${HOME}', '${PATH}', '${USER}', '${PWD}', '${SHELL}',
]);
// ---------------------------------------------------------------------------
// Sink patterns — dangerous operations that could lead to injection/exfiltration
// ---------------------------------------------------------------------------
// Each sink carries a `risk` label and a preferred OWASP mapping:
// injection → LLM01
// exfiltration → LLM02
/**
 * Sink patterns: operations that are dangerous when fed untrusted data.
 * Each entry has a global (`g`) regex, the `label` shown in findings, a short
 * `risk` description embedded in finding text, and the `owasp` category copied
 * onto the finding. `detectSinks` reports matches in table order.
 *
 * The patterns are purely lexical: they match the call or identifier text
 * wherever it appears on a line, with no check of what the name resolves to.
 *
 * @type {Array<{ pattern: RegExp, label: string, risk: string, owasp: string }>}
 */
const SINKS = [
// Code / command execution (injection risk → LLM01)
{ pattern: /\beval\s*\(/g, label: 'eval()', risk: 'code execution', owasp: 'LLM01' },
{ pattern: /\bexec\s*\(/g, label: 'exec()', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /\bexecSync\s*\(/g, label: 'execSync()', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /\bspawn\s*\(/g, label: 'spawn()', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /\bspawnSync\s*\(/g, label: 'spawnSync()', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /child_process/g, label: 'child_process', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /new\s+Function\s*\(/g, label: 'new Function()', risk: 'code execution', owasp: 'LLM01' },
{ pattern: /\bsubprocess\./g, label: 'subprocess', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /os\.system\s*\(/g, label: 'os.system()', risk: 'command execution', owasp: 'LLM01' },
{ pattern: /os\.popen\s*\(/g, label: 'os.popen()', risk: 'command execution', owasp: 'LLM01' },
// File system writes (could be used to persist injected content)
{ pattern: /writeFile\s*\(/g, label: 'writeFile()', risk: 'file write', owasp: 'LLM01' },
{ pattern: /writeFileSync\s*\(/g, label: 'writeFileSync()', risk: 'file write', owasp: 'LLM01' },
{ pattern: /\bappendFile/g, label: 'appendFile()', risk: 'file write', owasp: 'LLM01' },
{ pattern: /createWriteStream/g, label: 'createWriteStream()', risk: 'file write', owasp: 'LLM01' },
// Matches `open(` followed later on the line by a quote and `w` (e.g. a 'w' mode string).
{ pattern: /open\s*\(.*['"]w/g, label: 'open(w)', risk: 'file write', owasp: 'LLM01' },
// Network / exfiltration (data leaving the process → LLM02)
{ pattern: /\bfetch\s*\(/g, label: 'fetch()', risk: 'network request', owasp: 'LLM02' },
{ pattern: /\.send\s*\(/g, label: '.send()', risk: 'data exfiltration', owasp: 'LLM02' },
{ pattern: /\.post\s*\(/g, label: '.post()', risk: 'data exfiltration', owasp: 'LLM02' },
{ pattern: /XMLHttpRequest/g, label: 'XMLHttpRequest', risk: 'network request', owasp: 'LLM02' },
{ pattern: /WebSocket/g, label: 'WebSocket', risk: 'network connection', owasp: 'LLM02' },
// Database (SQL injection → LLM01)
{ pattern: /\.query\s*\(/g, label: '.query()', risk: 'SQL injection', owasp: 'LLM01' },
{ pattern: /\.execute\s*\(/g, label: '.execute()', risk: 'SQL injection', owasp: 'LLM01' },
{ pattern: /\.raw\s*\(/g, label: '.raw()', risk: 'raw query', owasp: 'LLM01' },
// HTML / DOM injection (XSS → LLM01 in agentic browser contexts)
{ pattern: /innerHTML\s*=/g, label: 'innerHTML', risk: 'XSS', owasp: 'LLM01' },
{ pattern: /document\.write\s*\(/g, label: 'document.write()', risk: 'XSS', owasp: 'LLM01' },
{ pattern: /dangerouslySetInnerHTML/g, label: 'dangerouslySetInnerHTML', risk: 'XSS', owasp: 'LLM01' },
];
// ---------------------------------------------------------------------------
// Sanitization suppression keywords
// ---------------------------------------------------------------------------
// If any of these appear on a line between a source and a sink (inclusive),
// severity is downgraded by one level. This is a heuristic — skilled attackers
// can bypass it by naming variables after safe functions.
/**
 * Keywords whose presence on a line is taken as evidence of sanitization.
 * Case-insensitive (`i`) and deliberately NOT global, so `.test()` keeps no
 * `lastIndex` state and the shared regex can be reused across calls.
 * The match is a plain substring test: the keyword may occur anywhere on the
 * line, including inside identifiers or comments.
 * @type {RegExp}
 */
const SANITIZER_PATTERN = /sanitize|escape|validate|parseInt|Number\s*\(|path\.resolve|path\.join|encodeURI|encodeURIComponent|DOMPurify|\.strip\s*\(|\.clean\s*\(|\.filter\s*\(|whitelist|allowlist/i;
// ---------------------------------------------------------------------------
// Severity ordering utilities
// ---------------------------------------------------------------------------
/**
 * Severity levels ordered from most severe (index 0) to least severe (last).
 * `downgradeSeverity` moves one index toward the end of this list.
 */
const SEVERITY_ORDER = [
SEVERITY.CRITICAL,
SEVERITY.HIGH,
SEVERITY.MEDIUM,
SEVERITY.LOW,
SEVERITY.INFO,
];
/**
 * Step a severity one rung down the SEVERITY_ORDER ladder.
 *
 * The last (least severe) entry is a floor: it maps to itself. A value that is
 * not present in SEVERITY_ORDER is handed back unchanged.
 *
 * @param {string} sev - Severity to lower
 * @returns {string} The next less-severe level, or `sev` itself when it is
 *   unknown or already the lowest level
 */
function downgradeSeverity(sev) {
  const position = SEVERITY_ORDER.indexOf(sev);
  if (position === -1) {
    return sev;
  }
  const floorIndex = SEVERITY_ORDER.length - 1;
  return position >= floorIndex
    ? SEVERITY_ORDER[floorIndex]
    : SEVERITY_ORDER[position + 1];
}
// ---------------------------------------------------------------------------
// Variable name extraction helpers
// ---------------------------------------------------------------------------
/**
 * Extract the variable name(s) bound or assigned on a source line.
 *
 * Recognized forms:
 *   const/let/var X ...            → X  (also `for (const X of ...)`)
 *   const/let/var { a, b: c, ...r } → a, c, r  (local bindings, not property keys)
 *   const/let/var [ a, b ]         → a, b
 *   X = <expr>                     → X  (plain assignment at the start of the line)
 *
 * Not recognized: arrow functions (`x => ...` is not an assignment), comparison
 * operators (`==`, `===`), member assignments (`obj.x = ...`), and destructuring
 * patterns that span several lines. Nested destructuring and default values
 * containing commas are only approximated.
 *
 * Returns an empty array if no assignment variable is found — the source
 * will still be tracked for same-line sink detection, but not propagated.
 *
 * @param {string} line
 * @returns {string[]} variable names in order of appearance, without duplicates (may be empty)
 */
function extractAssignedVariable(line) {
  const names = [];
  const addName = (name) => {
    if (name && !names.includes(name)) names.push(name);
  };

  // Pattern 1: declaration — either a destructuring pattern or a single identifier.
  const declMatch = line.match(/\b(?:const|let|var)\s+(?:([{[])([^}\]]*)|(\w+))/);
  if (declMatch) {
    if (declMatch[1]) {
      // Destructuring: every comma-separated element binds one local name.
      for (const element of declMatch[2].split(',')) {
        // Drop a default value (`a = 1`), then take the trailing identifier so
        // that `key: local` yields `local` and `...rest` yields `rest`.
        const binding = element.replace(/=.*$/, '').match(/\b([A-Za-z_]\w*)\s*$/);
        if (binding) addName(binding[1]);
      }
    } else {
      addName(declMatch[3]);
    }
  }

  // Pattern 2: plain assignment `X = ...` at the start of the line.
  // The character after `=` must not be `=` (comparison) or `>` (arrow function).
  const assignMatch = line.match(/^\s*(\w+)\s*=[^=>]/);
  if (assignMatch) addName(assignMatch[1]);

  return names;
}
// ---------------------------------------------------------------------------
// Shell file safety check
// ---------------------------------------------------------------------------
/**
 * Decide whether a matched shell-variable token refers to one of the
 * well-known variables listed in SHELL_SAFE_VARS.
 *
 * The token is first looked up verbatim. Failing that, it is reduced to its
 * bare `$NAME` spelling — `${NAME:-default}` collapses to `${NAME}`, the first
 * `{` and first `}` are dropped, and a leading `$` is ensured — and looked up
 * again.
 *
 * @param {string} token - e.g. "$HOME" or "${USER}"
 * @returns {boolean} true when the token names a safe built-in variable
 */
function isShellSafeVar(token) {
  if (SHELL_SAFE_VARS.has(token)) {
    return true;
  }
  // Collapse any modifier inside the braces, keeping only the variable name.
  const collapsed = token.replace(/\{(\w+)[^}]*\}/, '{$1}');
  const unbraced = collapsed.replace(/\{/, '').replace(/\}/, '');
  const bareForm = unbraced.startsWith('$') ? unbraced : '$' + unbraced;
  return SHELL_SAFE_VARS.has(bareForm);
}
// ---------------------------------------------------------------------------
// Per-line source/sink detection
// ---------------------------------------------------------------------------
/**
 * Find every untrusted-source pattern occurring on a line.
 *
 * All SOURCES_COMMON entries are tried in table order; for shell files the
 * SOURCE_SHELL entry is tried last, and tokens accepted by `isShellSafeVar`
 * are dropped. Results are grouped by pattern (table order), and within one
 * pattern ordered left to right.
 *
 * @param {string} line
 * @param {boolean} isShell - whether the file being scanned is a shell script
 * @returns {Array<{ label: string, position: number }>} one entry per match,
 *   `position` being the match's start index within `line`
 */
function detectSources(line, isShell) {
  const activeEntries = isShell ? [...SOURCES_COMMON, SOURCE_SHELL] : SOURCES_COMMON;
  const found = [];
  for (const entry of activeEntries) {
    const needsSafeVarCheck = isShell && entry === SOURCE_SHELL;
    // matchAll iterates over an internal copy of the regex, so the shared
    // global pattern objects never have their lastIndex advanced.
    for (const hit of line.matchAll(entry.pattern)) {
      if (needsSafeVarCheck && isShellSafeVar(hit[0])) {
        continue;
      }
      found.push({ label: entry.label, position: hit.index });
    }
  }
  return found;
}
/**
 * Find every dangerous-sink pattern occurring on a line.
 *
 * SINKS entries are tried in table order; results are grouped by entry and,
 * within one entry, ordered left to right.
 *
 * @param {string} line
 * @returns {Array<{ label: string, risk: string, owasp: string, position: number }>}
 *   one entry per match, `position` being the match's start index within `line`
 */
function detectSinks(line) {
  // matchAll iterates over an internal copy of each regex, so the shared
  // global pattern objects never have their lastIndex advanced.
  return SINKS.flatMap(({ pattern, label, risk, owasp }) =>
    Array.from(line.matchAll(pattern), (hit) => ({
      label,
      risk,
      owasp,
      position: hit.index,
    }))
  );
}
// ---------------------------------------------------------------------------
// Sanitization check in a line range
// ---------------------------------------------------------------------------
/**
 * Report whether a sanitization keyword (SANITIZER_PATTERN) occurs on any line
 * of the inclusive, 0-based range [fromIdx, toIdx].
 *
 * The range is clamped to the bounds of `lines`; an empty or inverted range
 * yields false.
 *
 * @param {string[]} lines
 * @param {number} fromIdx - 0-based inclusive start
 * @param {number} toIdx - 0-based inclusive end
 * @returns {boolean} true when the caller should downgrade severity
 */
function hasSanitizationBetween(lines, fromIdx, toIdx) {
  const first = Math.max(0, fromIdx);
  const last = Math.min(lines.length - 1, toIdx);
  // Guard explicitly: a negative `last` must not reach slice(), which would
  // interpret it as an offset from the end of the array.
  if (last < first) {
    return false;
  }
  return lines
    .slice(first, last + 1)
    .some((text) => SANITIZER_PATTERN.test(text));
}
// ---------------------------------------------------------------------------
// Proximity-based severity
// ---------------------------------------------------------------------------
/**
 * Translate the line distance between a source and a sink into a base severity.
 *
 *   distance 0 (same line) → CRITICAL
 *   distance ≤ 10          → HIGH
 *   distance ≤ 50          → MEDIUM
 *   anything else          → LOW
 *
 * @param {number} distance - number of lines between source and sink (0 = same line)
 * @returns {string} one of the SEVERITY values
 */
function distanceToSeverity(distance) {
  if (distance === 0) {
    return SEVERITY.CRITICAL;
  }
  // Upper bounds are checked in ascending order; the first one that holds wins.
  const tiers = [
    [10, SEVERITY.HIGH],
    [50, SEVERITY.MEDIUM],
  ];
  for (const [upperBound, level] of tiers) {
    if (distance <= upperBound) {
      return level;
    }
  }
  return SEVERITY.LOW;
}
// ---------------------------------------------------------------------------
// Tainted variable tracking
// ---------------------------------------------------------------------------
/**
* @typedef {{ name: string, sourceLine: number, sourceLabel: string }} TaintedVar
*/
// ---------------------------------------------------------------------------
// Per-file scan
// ---------------------------------------------------------------------------
/**
 * Run the taint analysis on the text of a single file.
 *
 * Pass 1 — Source detection: every line containing a source pattern contributes
 *          the variable name(s) assigned on that line to the tainted set.
 * Pass 2 — Same-line flow (performed in the same sweep as pass 1): a source and
 *          a sink on one line produce a CRITICAL finding.
 * Pass 3 — Variable-to-sink: for each tainted variable, every later line that
 *          contains both the variable name (as a whole word) and a sink
 *          produces a finding whose severity depends on the line distance.
 *
 * In passes 2 and 3 the severity is lowered one level when a sanitization
 * keyword occurs between source and sink (inclusive); the finding's
 * description states whether that happened.
 *
 * @param {string} content - File text
 * @param {string} absPath - Absolute path. Not read by this function; kept so
 *   the signature stays compatible with existing callers.
 * @param {string} relPath - Relative path (used in findings and to detect shell files)
 * @returns {ReturnType<typeof import('./lib/output.mjs').finding>[]}
 */
function scanFileContent(content, absPath, relPath) {
  const lines = content.split('\n');
  const isShell = SHELL_EXTENSIONS.has(
    (relPath.match(/\.[^.]+$/) || [''])[0].toLowerCase()
  );
  const fileFindings = [];
  // Dedup key: prevent reporting the same source+sink pair multiple times
  const reportedPairs = new Set();

  // Sink detection depends only on the line text, so compute it at most once
  // per line instead of once per (tainted variable, line) combination.
  const sinkCache = new Map();
  const sinksOnLine = (idx) => {
    let sinks = sinkCache.get(idx);
    if (sinks === undefined) {
      sinks = detectSinks(lines[idx]);
      sinkCache.set(idx, sinks);
    }
    return sinks;
  };

  const RECOMMENDATION =
    'Validate/sanitize data before passing to sink. Consider using parameterized queries, allowlists, or safe APIs.';

  // ---- Pass 1: Source Detection ----
  // Collect tainted variables and same-line sink candidates in a single sweep.
  /** @type {TaintedVar[]} */
  const taintedVars = [];
  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const sourceMatches = detectSources(line, isShell);
    if (sourceMatches.length === 0) continue;

    // Extract variable(s) being assigned on this source line
    for (const varName of extractAssignedVariable(line)) {
      // Skip single-character names: they would match far too many later lines
      if (varName.length < 2) continue;
      taintedVars.push({ name: varName, sourceLine: lineIdx, sourceLabel: sourceMatches[0].label });
    }

    // ---- Pass 2: Same-line Source + Sink ----
    const sinkMatches = sinksOnLine(lineIdx);
    for (const src of sourceMatches) {
      for (const sink of sinkMatches) {
        const pairKey = `sameline:${lineIdx}:${src.label}:${sink.label}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        // Same-line: CRITICAL, lowered one level if a sanitizer keyword shares the line
        const sanitized = hasSanitizationBetween(lines, lineIdx, lineIdx);
        const severity = sanitized ? downgradeSeverity(SEVERITY.CRITICAL) : SEVERITY.CRITICAL;
        const sanitizationNote = sanitized
          ? 'A sanitization keyword appears on the same line, so severity was downgraded by one level; ' +
            'verify that it actually covers this flow.'
          : 'Same-line flow is a strong indicator of unsanitized data reaching a dangerous operation.';

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${src.label} flows directly to ${sink.label} (same line)`,
            description:
              `Untrusted data from source \`${src.label}\` appears on the same line as ` +
              `dangerous sink \`${sink.label}\` (${sink.risk}). ` +
              sanitizationNote,
            file: relPath,
            line: lineIdx + 1,
            evidence: `source \`${src.label}\` at line ${lineIdx + 1} flows to \`${sink.label}\` at line ${lineIdx + 1} (same-line)`,
            owasp: sink.owasp,
            recommendation: RECOMMENDATION,
          })
        );
      }
    }
  }

  // ---- Pass 3: Variable-to-Sink ----
  // For each tainted variable, scan the lines after its source line for a line
  // that contains both the variable name as a word token and a sink pattern.
  for (const taintedVar of taintedVars) {
    // Word-boundary match so that "cmd" does not match inside "cmdLine".
    const varRe = new RegExp(`\\b${escapeRegex(taintedVar.name)}\\b`);
    for (let lineIdx = taintedVar.sourceLine + 1; lineIdx < lines.length; lineIdx++) {
      if (!varRe.test(lines[lineIdx])) continue;
      const sinkMatches = sinksOnLine(lineIdx);
      if (sinkMatches.length === 0) continue;

      const distance = lineIdx - taintedVar.sourceLine;
      for (const sink of sinkMatches) {
        const pairKey = `var:${relPath}:${taintedVar.name}:${taintedVar.sourceLine}:${sink.label}:${lineIdx}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        // Apply sanitization suppression: scan lines from source through sink
        const sanitized = hasSanitizationBetween(lines, taintedVar.sourceLine, lineIdx);
        const baseSeverity = distanceToSeverity(distance);
        const severity = sanitized ? downgradeSeverity(baseSeverity) : baseSeverity;
        const sanitizationNote = sanitized
          ? 'A sanitization keyword was detected between source and sink, so severity was downgraded ' +
            'by one level; verify that it actually covers this flow.'
          : 'No recognized sanitization was detected between source and sink.';

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${taintedVar.sourceLabel} → ${taintedVar.name} → ${sink.label}`,
            description:
              `Variable \`${taintedVar.name}\` is assigned from untrusted source ` +
              `\`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `and flows into dangerous sink \`${sink.label}\` (${sink.risk}) ` +
              `at line ${lineIdx + 1} (${distance} line${distance === 1 ? '' : 's'} away). ` +
              sanitizationNote,
            file: relPath,
            line: lineIdx + 1,
            evidence:
              `source \`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `flows to \`${sink.label}\` at line ${lineIdx + 1} ` +
              `via variable \`${taintedVar.name}\``,
            owasp: sink.owasp,
            recommendation: RECOMMENDATION,
          })
        );
      }
    }
  }
  return fileFindings;
}
// ---------------------------------------------------------------------------
// Utility: escape regex special characters in a variable name
// ---------------------------------------------------------------------------
/**
 * Backslash-escape every regex metacharacter in a literal string so that the
 * result can be embedded in a RegExp source and match the string verbatim.
 *
 * @param {string} str - literal text
 * @returns {string} `str` with each of . * + ? ^ $ { } ( ) | [ ] \ preceded by a backslash
 */
function escapeRegex(str) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaChars, (ch) => '\\' + ch);
}
// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------
/**
 * Scan the discovered files for taint flows from untrusted sources to dangerous sinks.
 *
 * Only files whose `ext` is a member of CODE_EXTENSIONS are processed; all
 * other files in the discovery set are skipped silently, as are files for
 * which `readTextFile` returns null. Files are read and scanned one at a time,
 * in discovery order.
 *
 * Any exception aborts the loop and yields an `'error'` result that still
 * carries the findings and file count accumulated up to that point.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root).
 *   Not read by this function: the file list comes from `discovery`.
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   Pre-computed file discovery result from the orchestrator.
 * @returns {Promise<object>} Scanner result envelope (see lib/output.mjs::scannerResult)
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;
  try {
    for (const fileInfo of discovery.files) {
      // Only scan code files
      if (!CODE_EXTENSIONS.has(fileInfo.ext)) continue;
      const content = await readTextFile(fileInfo.absPath);
      // readTextFile returns null for binary files or unreadable paths
      if (content === null) continue;
      filesScanned++;
      const fileFindings = scanFileContent(content, fileInfo.absPath, fileInfo.relPath);
      // Append one by one: push(...array) passes every element as a separate
      // call argument and throws RangeError once the array is large enough.
      for (const item of fileFindings) {
        allFindings.push(item);
      }
    }
    return scannerResult('taint-tracer', 'ok', allFindings, filesScanned, Date.now() - startMs);
  } catch (err) {
    return scannerResult(
      'taint-tracer',
      'error',
      allFindings,
      filesScanned,
      Date.now() - startMs,
      String(err?.message || err)
    );
  }
}