// taint-tracer.mjs — Deterministic taint analysis: traces untrusted data from sources to dangerous sinks
// Zero dependencies (Node.js builtins only via lib helpers).
//
// LIMITATIONS (read before interpreting results):
//   ~70% recall, ~50-70% precision for medium findings.
//   - No scope awareness: a variable named `input` in one function taints all uses across the file.
//   - No cross-file tracing: taint does not propagate across module boundaries.
//   - No closure / callback analysis: reassignment inside closures is not tracked.
//   - No data-flow through arrays or object properties (e.g., `obj.field = userInput`).
//   - Sanitization suppression is keyword-based; adversarial code can evade it.
//   - Shell variable pattern ($VAR) is very broad in .sh/.bash/.zsh files — expect FPs.
//   - Same-line source+sink detection is approximate; unrelated code on the same line may trigger.
//
// References:
//   - OWASP LLM01 (Prompt Injection — injection sinks: eval, exec, SQL queries)
//   - OWASP LLM02 (Sensitive Info Disclosure — exfiltration sinks: fetch, .post, .send)
//   - skill-threat-patterns.md: toolchain manipulation, persistence patterns

import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';

// ---------------------------------------------------------------------------
// File extension filter — only scan code files, not config/docs
// JVM-language support (.kt, .kts, .groovy, .gradle, .scala) is required for
// JetBrains plugin scanning — plugin source lives in these languages.
// ---------------------------------------------------------------------------

// Shell script extensions. Declared first so the full code-extension set
// below can reuse it instead of repeating the literals.
const SHELL_EXTENSIONS = new Set(['.sh', '.bash', '.zsh']);

// Every extension the tracer is willing to scan, grouped by language family.
const CODE_EXTENSIONS = new Set(
  [
    ['.js', '.mjs', '.cjs'],          // JavaScript
    ['.ts', '.mts', '.cts'],          // TypeScript
    ['.jsx', '.tsx'],                 // JSX / TSX
    ['.py', '.pyw'],                  // Python
    ['.rb', '.php'],                  // Ruby, PHP
    ['.go', '.rs'],                   // Go, Rust
    ['.java', '.cs'],                 // Java, C#
    ['.kt', '.kts'],                  // Kotlin
    ['.groovy', '.gradle'],           // Groovy / Gradle
    ['.scala'],                       // Scala
    [...SHELL_EXTENSIONS],            // Shell
  ].flat(),
);

// ---------------------------------------------------------------------------
// Source patterns — untrusted / externally controlled data origins
// ---------------------------------------------------------------------------

// Source patterns applied to every scanned file, whatever its language.
// The broad shell `$VAR` pattern is intentionally absent from this table: it
// is only applied to files with a shell extension, because matching it in
// JS/TS would produce massive false-positive rates.
const SOURCES_COMMON = [
  // Node.js / JavaScript
  [/process\.env\[?/g, 'process.env'],
  [/process\.argv/g, 'process.argv'],
  [/req\.body/g, 'req.body'],
  [/req\.query/g, 'req.query'],
  [/req\.params/g, 'req.params'],
  [/req\.headers/g, 'req.headers'],
  [/request\.body/g, 'request.body'],
  [/request\.form/g, 'request.form'],
  [/tool_input/g, 'tool_input'],
  [/user_input/g, 'user_input'],
  [/\$ARGUMENTS/g, '$ARGUMENTS'],
  [/\bstdin\b/g, 'stdin'],
  // Python
  [/os\.environ/g, 'os.environ'],
  [/sys\.argv/g, 'sys.argv'],
  [/\binput\s*\(/g, 'input()'],
  [/request\.args/g, 'request.args'],
  [/request\.json/g, 'request.json'],
].map(([pattern, label]) => ({ pattern, label }));

// Shell-only source: any `$NAME` / `${NAME}` reference. Well-known safe
// variables are filtered out afterwards via SHELL_SAFE_VARS.
const SOURCE_SHELL = {
  pattern: /\$\{?\w+\}?/g,
  label: 'shell variable',
};

// Shell variables that are virtually always safe; matching tokens are not
// reported as sources. Listed in bare (`$NAME`) and braced (`${NAME}`) form.
const SHELL_SAFE_VARS = new Set([
  ...['HOME', 'PATH', 'USER', 'PWD', 'SHELL', 'IFS', '0', '#'].map((name) => `$${name}`),
  ...['HOME', 'PATH', 'USER', 'PWD', 'SHELL'].map((name) => `\${${name}}`),
]);

// ---------------------------------------------------------------------------
// Sink patterns — dangerous operations that could lead to injection/exfiltration
// ---------------------------------------------------------------------------

// Each sink carries a `risk` label and a preferred OWASP mapping:
//   injection  → LLM01
//   exfiltration → LLM02

// Rows are [pattern, label, risk, owasp]; they are expanded into objects below.
const SINKS = [
  // Code / command execution (injection risk → LLM01)
  [/\beval\s*\(/g, 'eval()', 'code execution', 'LLM01'],
  [/\bexec\s*\(/g, 'exec()', 'command execution', 'LLM01'],
  [/\bexecSync\s*\(/g, 'execSync()', 'command execution', 'LLM01'],
  [/\bspawn\s*\(/g, 'spawn()', 'command execution', 'LLM01'],
  [/\bspawnSync\s*\(/g, 'spawnSync()', 'command execution', 'LLM01'],
  [/child_process/g, 'child_process', 'command execution', 'LLM01'],
  [/new\s+Function\s*\(/g, 'new Function()', 'code execution', 'LLM01'],
  [/\bsubprocess\./g, 'subprocess', 'command execution', 'LLM01'],
  [/os\.system\s*\(/g, 'os.system()', 'command execution', 'LLM01'],
  [/os\.popen\s*\(/g, 'os.popen()', 'command execution', 'LLM01'],
  // File system writes (could be used to persist injected content)
  [/writeFile\s*\(/g, 'writeFile()', 'file write', 'LLM01'],
  [/writeFileSync\s*\(/g, 'writeFileSync()', 'file write', 'LLM01'],
  [/\bappendFile/g, 'appendFile()', 'file write', 'LLM01'],
  [/createWriteStream/g, 'createWriteStream()', 'file write', 'LLM01'],
  [/open\s*\(.*['"]w/g, 'open(w)', 'file write', 'LLM01'],
  // Network / exfiltration (data leaving the process → LLM02)
  [/\bfetch\s*\(/g, 'fetch()', 'network request', 'LLM02'],
  [/\.send\s*\(/g, '.send()', 'data exfiltration', 'LLM02'],
  [/\.post\s*\(/g, '.post()', 'data exfiltration', 'LLM02'],
  [/XMLHttpRequest/g, 'XMLHttpRequest', 'network request', 'LLM02'],
  [/WebSocket/g, 'WebSocket', 'network connection', 'LLM02'],
  // Database (SQL injection → LLM01)
  [/\.query\s*\(/g, '.query()', 'SQL injection', 'LLM01'],
  [/\.execute\s*\(/g, '.execute()', 'SQL injection', 'LLM01'],
  [/\.raw\s*\(/g, '.raw()', 'raw query', 'LLM01'],
  // HTML / DOM injection (XSS → LLM01 in agentic browser contexts)
  [/innerHTML\s*=/g, 'innerHTML', 'XSS', 'LLM01'],
  [/document\.write\s*\(/g, 'document.write()', 'XSS', 'LLM01'],
  [/dangerouslySetInnerHTML/g, 'dangerouslySetInnerHTML', 'XSS', 'LLM01'],
].map(([pattern, label, risk, owasp]) => ({ pattern, label, risk, owasp }));

// ---------------------------------------------------------------------------
// Sanitization suppression keywords
// ---------------------------------------------------------------------------
// If any of these appear on a line between a source and a sink (inclusive),
// severity is downgraded by one level. This is a heuristic — skilled attackers
// can bypass it by naming variables after safe functions.

// One case-insensitive alternation assembled from the individual keyword
// fragments, so each sanitizer hint sits on its own entry.
const SANITIZER_PATTERN = new RegExp(
  [
    /sanitize/,
    /escape/,
    /validate/,
    /parseInt/,
    /Number\s*\(/,
    /path\.resolve/,
    /path\.join/,
    /encodeURI/,
    /encodeURIComponent/,
    /DOMPurify/,
    /\.strip\s*\(/,
    /\.clean\s*\(/,
    /\.filter\s*\(/,
    /whitelist/,
    /allowlist/,
  ]
    .map((fragment) => fragment.source)
    .join('|'),
  'i',
);

// ---------------------------------------------------------------------------
// Severity ordering utilities
// ---------------------------------------------------------------------------

// Severity levels from most to least severe; the index in this array is the
// rank used when stepping a finding down one level.
const SEVERITY_ORDER = ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'INFO'].map(
  (level) => SEVERITY[level],
);

/**
 * Step a severity down by one level along SEVERITY_ORDER.
 *
 * The lowest level stays where it is, and a value that is not in
 * SEVERITY_ORDER is handed back unchanged.
 *
 * @param {string} sev - Severity to lower.
 * @returns {string} The next-lower severity, or `sev` itself when it cannot be lowered.
 */
function downgradeSeverity(sev) {
  const rank = SEVERITY_ORDER.indexOf(sev);
  if (rank === -1) {
    return sev;
  }
  const lowestRank = SEVERITY_ORDER.length - 1;
  return rank >= lowestRank ? SEVERITY_ORDER[lowestRank] : SEVERITY_ORDER[rank + 1];
}

// ---------------------------------------------------------------------------
// Variable name extraction helpers
// ---------------------------------------------------------------------------

/**
 * Attempt to extract the variable name(s) being assigned on a source line.
 * Handles:
 *   const/let/var X = <source>                           (plain decl)
 *   const/let/var X: Type = <source>                     (type-annotated decl)
 *   X = <source>                                          (plain assignment at line start)
 *   const { x } = <source>                                (object destructuring)
 *   const { x, y } = <source>                             (multi-key)
 *   const { secret: alias } = <source>                    (renamed)
 *   const { a, ...spread } = <source>                     (object rest)
 *   const { a, b: { c } } = <source>                      (nested object)
 *   const [a, b] = <source>                               (array destructuring)
 *   const [first, ...rest] = <source>                     (array rest)
 *   const [a, [b, c]] = <source>                          (nested array)
 *
 * Not handled: `key: <source>` pairs (object literals, YAML-style lines),
 * member assignments such as `obj.field = <source>`, and type-annotated
 * destructuring declarations.
 *
 * Implementation: regex-based, no full JS parser — best-effort, not sound.
 * Lines that yield no name still take part in same-line sink detection in
 * the caller; they just do not propagate taint through a variable.
 *
 * @param {string} line - One line of source text.
 * @returns {string[]} De-duplicated variable names (may be empty).
 */
export function extractAssignedVariable(line) {
  const names = new Set();

  // Destructuring declaration: capture the `{...}` / `[...]` pattern on the
  // left of `=` so names can be extracted without reading past the assignment.
  // The trailing `[^=]` keeps `==` / `===` from being read as an assignment.
  const destructDecl = line.match(/\b(?:const|let|var)\s+([{[][\s\S]*?[\]}])\s*=[^=]/);
  if (destructDecl) {
    extractDestructuredNames(destructDecl[1], names);
  } else {
    // Plain identifier declaration. The optional `: Type` group lets
    // annotated declarations (`const token: string = ...`) match too.
    const declMatch = line.match(/\b(?:const|let|var)\s+(\w+)\s*(?::[^=]*)?=/);
    if (declMatch) {
      names.add(declMatch[1]);
    }
  }

  // Keyword-less assignment at the start of the line (`X = ...`).
  // `[^=>]` rejects comparisons (`==`, `===`) and arrow functions (`x => ...`),
  // whose parameter is not an assignment target.
  const assignMatch = line.match(/^\s*(\w+)\s*=[^=>]/);
  if (assignMatch) {
    names.add(assignMatch[1]);
  }

  return [...names];
}

/**
 * Walk a destructuring pattern body (the `{...}` or `[...]` after the
 * `const`/`let`/`var` keyword and before `=`) and add every bound
 * identifier to `names`. Handles nested patterns, renamed keys
 * (`key: alias`), rest elements (`...rest`), default values
 * (`x = fallback`) and computed keys (`[expr]: alias`).
 *
 * This is a small hand-written scanner, not a parser: identifiers are runs of
 * `\w` characters, and an unbalanced pattern is processed on a best-effort
 * basis rather than rejected.
 *
 * @param {string} pattern  The pattern including its outer brackets.
 * @param {Set<string>} names  Receives the bound names (mutated in place).
 */
function extractDestructuredNames(pattern, names) {
  // Strip the outer brackets so the walk only sees the pattern contents.
  const inner = pattern.slice(1, -1);
  // Sticky regex: matches an identifier exactly at `lastIndex`, which avoids
  // re-slicing the string for every token.
  const identRe = /\w+/y;

  let i = 0;
  while (i < inner.length) {
    const ch = inner[i];

    if (ch === '{' || ch === '[') {
      // Find the matching close bracket with a depth counter (handles nesting
      // of the same bracket kind).
      const close = ch === '{' ? '}' : ']';
      let depth = 1;
      let j = i + 1;
      while (j < inner.length && depth > 0) {
        if (inner[j] === ch) depth++;
        else if (inner[j] === close) depth--;
        j++;
      }

      // `[expr]:` is a computed property KEY, not a nested array pattern; the
      // binding is whatever follows the colon, so skip the key entirely.
      let afterGroup = j;
      while (afterGroup < inner.length && /\s/.test(inner[afterGroup])) afterGroup++;
      if (ch === '[' && inner[afterGroup] === ':') {
        i = afterGroup + 1;
        continue;
      }

      // Otherwise it is a nested pattern: recurse into it.
      extractDestructuredNames(inner.slice(i, j), names);
      i = j;
      continue;
    }

    if (ch === '=') {
      // Default value: nothing inside it is bound by the pattern, so jump to
      // the comma that ends this element (or to the end of the pattern).
      i = skipDefaultValue(inner, i + 1);
      continue;
    }

    if (ch === ',' || ch === ':' || /\s/.test(ch)) {
      i++;
      continue;
    }

    if (inner.startsWith('...', i)) {
      // Rest element: the identifier after the dots is picked up next.
      i += 3;
      continue;
    }

    identRe.lastIndex = i;
    const idMatch = identRe.exec(inner);
    if (idMatch === null) {
      // Not an identifier character (quote, `$`, ...): skip it.
      i++;
      continue;
    }
    const ident = idMatch[0];
    const next = i + ident.length;

    // Peek at the next significant character to classify the identifier.
    let k = next;
    while (k < inner.length && /\s/.test(inner[k])) k++;
    if (inner[k] === ':') {
      // `key: <binding>` — the key itself is not bound; continue after the
      // colon so the loop picks up the binding (identifier or nested pattern).
      i = k + 1;
      continue;
    }

    // Shorthand key, array element, or rest target: this identifier is bound.
    names.add(ident);
    i = next;
  }
}

/**
 * Return the index just past a default-value expression that starts at
 * `start`: the position of the first `,` outside brackets and quotes, or
 * `text.length` when there is none.
 *
 * @param {string} text  Pattern contents being scanned.
 * @param {number} start  Index of the first character after the `=`.
 * @returns {number} Index at which scanning of the pattern should resume.
 */
function skipDefaultValue(text, start) {
  let depth = 0;
  let quote = null;
  let pos = start;

  for (; pos < text.length; pos++) {
    const ch = text[pos];

    if (quote !== null) {
      if (ch === '\\') pos++; // skip the escaped character
      else if (ch === quote) quote = null;
      continue;
    }

    if (ch === '"' || ch === "'" || ch === '`') {
      quote = ch;
    } else if (ch === '(' || ch === '[' || ch === '{') {
      depth++;
    } else if (ch === ')' || ch === ']' || ch === '}') {
      depth = Math.max(0, depth - 1);
    } else if (ch === ',' && depth === 0) {
      break;
    }
  }

  return Math.min(pos, text.length);
}

// ---------------------------------------------------------------------------
// Shell file safety check
// ---------------------------------------------------------------------------

/**
 * Decide whether a matched shell variable token names one of the well-known
 * safe variables in SHELL_SAFE_VARS.
 *
 * The token is checked as-is and again in bare `$NAME` form, obtained by
 * dropping any `${NAME...}` modifier text and the first `{` / `}`.
 *
 * @param {string} token - e.g. "$HOME" or "${USER}"
 * @returns {boolean} True when the token (or its bare form) is a safe variable.
 */
function isShellSafeVar(token) {
  // `${VAR:-default}` → `${VAR}`
  const withoutModifier = token.replace(/\{(\w+)[^}]*\}/, '{$1}');
  // Drop the first opening and the first closing brace, if any.
  const withoutBraces = withoutModifier.replace(/\{/, '').replace(/\}/, '');
  // Re-attach exactly one leading `$`.
  const bare = `$${withoutBraces.replace(/^\$/, '')}`;

  return [token, bare].some((candidate) => SHELL_SAFE_VARS.has(candidate));
}

// ---------------------------------------------------------------------------
// Per-line source/sink detection
// ---------------------------------------------------------------------------

/**
 * Collect every source-pattern hit on a single line.
 *
 * All SOURCES_COMMON patterns are tried; for shell files the broad
 * SOURCE_SHELL pattern is tried as well, with hits on safe built-in
 * variables dropped. Results are grouped by pattern (in table order), and
 * within one pattern ordered by position.
 *
 * @param {string} line
 * @param {boolean} isShell - Whether the line comes from a shell file.
 * @returns {Array<{ label: string, position: number }>}
 */
function detectSources(line, isShell) {
  const candidates = isShell ? [...SOURCES_COMMON, SOURCE_SHELL] : SOURCES_COMMON;
  const hits = [];

  for (const candidate of candidates) {
    const filterSafeVars = isShell && candidate === SOURCE_SHELL;

    // matchAll works on an internal copy of the regex, so the shared
    // global-flag pattern objects never accumulate lastIndex state.
    for (const hit of line.matchAll(candidate.pattern)) {
      if (filterSafeVars && isShellSafeVar(hit[0])) continue;
      hits.push({ label: candidate.label, position: hit.index });
    }
  }

  return hits;
}

/**
 * Collect every sink-pattern hit on a single line.
 *
 * Results are grouped by sink (in SINKS table order), and within one sink
 * ordered by position.
 *
 * @param {string} line
 * @returns {Array<{ label: string, risk: string, owasp: string, position: number }>}
 */
function detectSinks(line) {
  // matchAll scans with an internal copy of each regex, so the shared
  // global-flag patterns in SINKS keep lastIndex at 0.
  return SINKS.flatMap(({ pattern, label, risk, owasp }) =>
    Array.from(line.matchAll(pattern), (hit) => ({
      label,
      risk,
      owasp,
      position: hit.index,
    })),
  );
}

// ---------------------------------------------------------------------------
// Sanitization check in a line range
// ---------------------------------------------------------------------------

/**
 * Report whether a sanitization keyword occurs on any line of an inclusive,
 * 0-based line range. The range is clamped to the bounds of `lines`; an empty
 * or inverted range yields false.
 *
 * @param {string[]} lines
 * @param {number} fromIdx - 0-based inclusive start
 * @param {number} toIdx   - 0-based inclusive end
 * @returns {boolean} True when at least one line in range matches SANITIZER_PATTERN.
 */
function hasSanitizationBetween(lines, fromIdx, toIdx) {
  const first = Math.max(0, fromIdx);
  const last = Math.min(lines.length - 1, toIdx);

  // Nothing to inspect when the clamped range is empty.
  if (!(first <= last)) {
    return false;
  }

  return lines.slice(first, last + 1).some((text) => SANITIZER_PATTERN.test(text));
}

// ---------------------------------------------------------------------------
// Proximity-based severity
// ---------------------------------------------------------------------------

/**
 * Translate the line distance between a source and a sink into a base
 * severity:
 *   distance 0            → CRITICAL
 *   distance up to 10     → HIGH
 *   distance up to 50     → MEDIUM
 *   anything larger       → LOW
 *
 * @param {number} distance - number of lines between source and sink (0 = same line)
 * @returns {string} One of the SEVERITY values.
 */
function distanceToSeverity(distance) {
  if (distance === 0) {
    return SEVERITY.CRITICAL;
  }

  // Upper bound (inclusive) of each band, nearest band first.
  const bands = [
    [10, SEVERITY.HIGH],
    [50, SEVERITY.MEDIUM],
  ];
  const band = bands.find(([limit]) => distance <= limit);
  return band ? band[1] : SEVERITY.LOW;
}

// ---------------------------------------------------------------------------
// Tainted variable tracking
// ---------------------------------------------------------------------------

/**
 * @typedef {{ name: string, sourceLine: number, sourceLabel: string }} TaintedVar
 */

// ---------------------------------------------------------------------------
// Per-file scan
// ---------------------------------------------------------------------------

/**
 * Run the taint analysis on the text of a single file.
 *
 * Pass 1 — Source Detection:  Find lines with source patterns and record the
 *                              variable names assigned on those lines.
 * Pass 2 — Same-line Flow:    A source and a sink on the same line produce a
 *                              CRITICAL finding (done in the same sweep as pass 1).
 * Pass 3 — Variable-to-Sink:  For each tainted variable, every later line that
 *                              contains both the variable name (as a whole word)
 *                              and a sink produces a finding whose severity
 *                              depends on the line distance.
 *
 * In passes 2 and 3 the severity is lowered one level when a sanitization
 * keyword appears in the relevant line range, and the finding description
 * says so.
 *
 * @param {string} content   - File text
 * @param {string} absPath   - Absolute path; not read by this function, kept
 *                             so the call signature stays unchanged
 * @param {string} relPath   - Relative path (used for the extension check and in findings)
 * @returns {ReturnType<typeof import('./lib/output.mjs').finding>[]}
 */
function scanFileContent(content, absPath, relPath) {
  const lines = content.split('\n');
  const extension = (relPath.match(/\.[^.]+$/) || [''])[0].toLowerCase();
  const isShell = SHELL_EXTENSIONS.has(extension);
  const fileFindings = [];

  // Dedup key: prevent reporting the same source+sink pair multiple times
  // (e.g. the same sink pattern matching twice on one line).
  const reportedPairs = new Set();

  // Sink detection depends only on the line text, so compute it at most once
  // per line instead of once per (tainted variable, line) combination.
  const sinkCache = new Map();
  const sinksOnLine = (lineIdx) => {
    let cached = sinkCache.get(lineIdx);
    if (cached === undefined) {
      cached = detectSinks(lines[lineIdx]);
      sinkCache.set(lineIdx, cached);
    }
    return cached;
  };

  const recommendation =
    'Validate/sanitize data before passing to sink. Consider using parameterized queries, allowlists, or safe APIs.';

  // ---- Pass 1: Source Detection (with pass 2 folded into the same sweep) ----

  /** @type {TaintedVar[]} */
  const taintedVars = [];

  for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
    const line = lines[lineIdx];
    const sourceMatches = detectSources(line, isShell);
    if (sourceMatches.length === 0) continue;

    // Record the variable(s) assigned on this source line. Only the first
    // source label on the line is attributed to them.
    for (const varName of extractAssignedVariable(line)) {
      // Single-character names match far too much text to be useful.
      if (varName.length < 2) continue;
      taintedVars.push({ name: varName, sourceLine: lineIdx, sourceLabel: sourceMatches[0].label });
    }

    // ---- Pass 2: Same-line Source + Sink ----
    const sinkMatches = sinksOnLine(lineIdx);
    if (sinkMatches.length === 0) continue;

    // Same-line flow is CRITICAL unless a sanitizer keyword is on the line.
    const sanitized = hasSanitizationBetween(lines, lineIdx, lineIdx);
    const severity = sanitized ? downgradeSeverity(SEVERITY.CRITICAL) : SEVERITY.CRITICAL;
    const sanitizationNote = sanitized
      ? 'A sanitization keyword was detected on this line, so severity was downgraded one level; ' +
        'verify that it actually covers this data.'
      : 'Same-line flow is a strong indicator of unsanitized data reaching a dangerous operation.';

    for (const src of sourceMatches) {
      for (const sink of sinkMatches) {
        const pairKey = `sameline:${lineIdx}:${src.label}:${sink.label}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${src.label} flows directly to ${sink.label} (same line)`,
            description:
              `Untrusted data from source \`${src.label}\` appears on the same line as ` +
              `dangerous sink \`${sink.label}\` (${sink.risk}). ` +
              sanitizationNote,
            file: relPath,
            line: lineIdx + 1,
            evidence: `source \`${src.label}\` at line ${lineIdx + 1} flows to \`${sink.label}\` at line ${lineIdx + 1} (same-line)`,
            owasp: sink.owasp,
            recommendation,
          })
        );
      }
    }
  }

  // ---- Pass 3: Variable-to-Sink ----
  // For each tainted variable, look at every line after its source line and
  // report those where the variable name (whole word) and a sink co-occur.

  for (const taintedVar of taintedVars) {
    // Word-boundary match so that e.g. "cmd" does not match "cmdLine".
    const varRe = new RegExp(`\\b${escapeRegex(taintedVar.name)}\\b`);

    for (let lineIdx = taintedVar.sourceLine + 1; lineIdx < lines.length; lineIdx++) {
      if (!varRe.test(lines[lineIdx])) continue;

      const sinkMatches = sinksOnLine(lineIdx);
      if (sinkMatches.length === 0) continue;

      const distance = lineIdx - taintedVar.sourceLine;

      // Sanitizer check covers the source line through the sink line.
      const sanitized = hasSanitizationBetween(lines, taintedVar.sourceLine, lineIdx);
      const baseSeverity = distanceToSeverity(distance);
      const severity = sanitized ? downgradeSeverity(baseSeverity) : baseSeverity;
      const sanitizationNote = sanitized
        ? 'A sanitization keyword was detected between source and sink, so severity was downgraded one level; ' +
          'verify that it actually covers this variable.'
        : 'No recognized sanitization was detected between source and sink.';

      for (const sink of sinkMatches) {
        const pairKey = `var:${relPath}:${taintedVar.name}:${taintedVar.sourceLine}:${sink.label}:${lineIdx}`;
        if (reportedPairs.has(pairKey)) continue;
        reportedPairs.add(pairKey);

        fileFindings.push(
          finding({
            scanner: 'TNT',
            severity,
            title: `Taint: ${taintedVar.sourceLabel} → ${taintedVar.name} → ${sink.label}`,
            description:
              `Variable \`${taintedVar.name}\` is assigned from untrusted source ` +
              `\`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `and flows into dangerous sink \`${sink.label}\` (${sink.risk}) ` +
              `at line ${lineIdx + 1} (${distance} line${distance === 1 ? '' : 's'} away). ` +
              sanitizationNote,
            file: relPath,
            line: lineIdx + 1,
            evidence:
              `source \`${taintedVar.sourceLabel}\` at line ${taintedVar.sourceLine + 1} ` +
              `flows to \`${sink.label}\` at line ${lineIdx + 1} ` +
              `via variable \`${taintedVar.name}\``,
            owasp: sink.owasp,
            recommendation,
          })
        );
      }
    }
  }

  return fileFindings;
}

// ---------------------------------------------------------------------------
// Utility: escape regex special characters in a variable name
// ---------------------------------------------------------------------------

/**
 * Backslash-escape every regex metacharacter in a string so the result can be
 * embedded in a RegExp source and match the original text literally.
 *
 * @param {string} str - Literal text.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, (metachar) => `\\${metachar}`);
}

// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------

/**
 * Scan the discovered files for taint flows from untrusted sources to
 * dangerous sinks.
 *
 * Only files whose `ext` is in CODE_EXTENSIONS are processed (.js, .mjs, .cjs,
 * .ts, .mts, .cts, .jsx, .tsx, .py, .pyw, .rb, .php, .go, .rs, .java, .cs,
 * .kt, .kts, .groovy, .gradle, .scala, .sh, .bash, .zsh). All other files in
 * the discovery set are skipped silently, as are files for which
 * `readTextFile` yields `null`.
 *
 * Any exception aborts the loop; the result then has status `error`, the
 * error message, and whatever findings were collected up to that point.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root).
 *   Not read by this function; the file list comes from `discovery`.
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   Pre-computed file discovery result from the orchestrator.
 * @returns {Promise<object>} Scanner result envelope (see lib/output.mjs::scannerResult)
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  try {
    for (const fileInfo of discovery.files) {
      // Only scan code files
      if (!CODE_EXTENSIONS.has(fileInfo.ext)) continue;

      const content = await readTextFile(fileInfo.absPath);

      // A null result means there is no text to scan for this file.
      if (content === null) continue;

      filesScanned++;

      // Append one by one rather than `push(...fileFindings)`: spreading a
      // very large array into a call can exceed the engine's argument limit
      // and throw a RangeError, which would abort the whole scan.
      for (const fileFinding of scanFileContent(content, fileInfo.absPath, fileInfo.relPath)) {
        allFindings.push(fileFinding);
      }
    }

    const durationMs = Date.now() - startMs;
    return scannerResult('taint-tracer', 'ok', allFindings, filesScanned, durationMs);
  } catch (err) {
    const durationMs = Date.now() - startMs;
    return scannerResult(
      'taint-tracer',
      'error',
      allFindings,
      filesScanned,
      durationMs,
      String(err?.message || err)
    );
  }
}