New sarif-formatter.mjs converts scan envelope to OASIS SARIF 2.1.0 standard. Maps severity to SARIF levels, findings to results with locations and rules. scan-orchestrator accepts --format sarif|json (default: json). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
283 lines
10 KiB
JavaScript
#!/usr/bin/env node
|
|
// scan-orchestrator.mjs — Entry point for deterministic deep-scan
|
|
// Single Node.js process. Imports all registered scanners, runs them sequentially,
|
|
// shares file discovery, outputs JSON envelope to stdout.
|
|
// Zero external dependencies.
|
|
|
|
import { resolve, join, dirname } from 'node:path';
|
|
import { existsSync, readFileSync, writeFileSync, appendFileSync } from 'node:fs';
|
|
import { fileURLToPath } from 'node:url';
|
|
import { tmpdir } from 'node:os';
|
|
import { discoverFiles } from './lib/file-discovery.mjs';
|
|
import { envelope, resetCounter } from './lib/output.mjs';
|
|
import { saveBaseline, diffAgainstBaseline, extractFindings } from './lib/diff-engine.mjs';
|
|
import { toSARIF } from './lib/sarif-formatter.mjs';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// .llm-security-ignore support
|
|
// Format: one rule per line. Blank lines and # comments ignored.
|
|
// SCANNER:glob — ignore findings from SCANNER matching file glob
|
|
// glob — ignore findings from ALL scanners matching file glob
|
|
// Globs use minimatch-style: * matches within path segment, ** across segments.
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * Parse `.llm-security-ignore` in the target directory into rule objects.
 *
 * Each non-blank, non-comment line becomes one rule:
 *   - `SCANNER:glob` (1–3 uppercase letters before the colon) scopes the
 *     glob to a single scanner prefix.
 *   - a bare glob applies to all scanners (`scanner: null`).
 *
 * @param {string} targetPath - Directory to look for the ignore file in.
 * @returns {Array<{scanner: ?string, pattern: string}>} Parsed rules;
 *   empty array when the file does not exist.
 */
function loadIgnoreRules(targetPath) {
  const ignorePath = join(targetPath, '.llm-security-ignore');
  if (!existsSync(ignorePath)) return [];
  const rules = [];
  for (const rawLine of readFileSync(ignorePath, 'utf8').split('\n')) {
    const entry = rawLine.trim();
    if (!entry || entry.startsWith('#')) continue;
    const sep = entry.indexOf(':');
    // A scanner scope is 1–3 uppercase letters immediately before the colon.
    const hasScannerScope =
      sep > 0 && sep <= 3 && /^[A-Z]+$/.test(entry.slice(0, sep));
    rules.push(
      hasScannerScope
        ? { scanner: entry.slice(0, sep), pattern: entry.slice(sep + 1) }
        : { scanner: null, pattern: entry }
    );
  }
  return rules;
}
|
|
|
|
/**
 * Compile a minimatch-style glob into an anchored RegExp.
 *
 * Rules: `**` matches across path segments (a `/` immediately after `**`
 * is absorbed so `**` + `/x` also matches a bare `x`), `*` matches within
 * a single segment, `?` matches exactly one non-slash character, and all
 * regex metacharacters are escaped to match literally.
 *
 * @param {string} glob - Glob pattern from an ignore rule.
 * @returns {RegExp} Full-match (anchored ^…$) regular expression.
 */
function globToRegex(glob) {
  const META = '.+^${}()|[]\\';
  let source = '^';
  for (let pos = 0; pos < glob.length; ) {
    const ch = glob[pos];
    if (ch === '*' && glob[pos + 1] === '*') {
      source += '.*';
      pos += 2;
      if (glob[pos] === '/') pos += 1; // absorb the slash in '**/'
    } else if (ch === '*') {
      source += '[^/]*';
      pos += 1;
    } else if (ch === '?') {
      source += '[^/]';
      pos += 1;
    } else {
      source += META.includes(ch) ? `\\${ch}` : ch;
      pos += 1;
    }
  }
  return new RegExp(source + '$');
}
|
|
|
|
/**
 * Remove findings matched by ignore rules, mutating `scannerResults` in place.
 *
 * A finding is suppressed when some rule's glob matches its `file` and the
 * rule is either unscoped (`scanner: null`) or scoped to the finding's
 * scanner prefix (the finding's own `scanner` field, or the first three
 * letters of the result key, upper-cased). Severity counts are rebuilt for
 * any result that lost findings.
 *
 * @param {Object<string, {findings: Array, counts: Object}>} scannerResults
 * @param {Array<{scanner: ?string, pattern: string}>} rules
 * @returns {number} Total number of findings suppressed across all scanners.
 */
function applyIgnoreRules(scannerResults, rules) {
  if (rules.length === 0) return 0;
  const compiled = rules.map(({ scanner, pattern }) => ({
    scanner,
    regex: globToRegex(pattern),
  }));
  let suppressed = 0;
  for (const [name, result] of Object.entries(scannerResults)) {
    const total = result.findings.length;
    result.findings = result.findings.filter((finding) => {
      const file = finding.file || '';
      const prefix = finding.scanner || name.toUpperCase().slice(0, 3);
      const matched = compiled.some(
        (rule) =>
          (!rule.scanner || rule.scanner === prefix) && rule.regex.test(file)
      );
      return !matched;
    });
    const removed = total - result.findings.length;
    suppressed += removed;
    // Severity counts only need rebuilding when something was dropped.
    if (removed > 0) {
      const counts = { critical: 0, high: 0, medium: 0, low: 0, info: 0 };
      for (const { severity } of result.findings) {
        counts[severity] = (counts[severity] || 0) + 1;
      }
      result.counts = counts;
    }
  }
  return suppressed;
}
|
|
|
|
// Import all scanners
|
|
import { scan as unicodeScan } from './unicode-scanner.mjs';
|
|
import { scan as entropyScan } from './entropy-scanner.mjs';
|
|
import { scan as permissionScan } from './permission-mapper.mjs';
|
|
import { scan as depScan } from './dep-auditor.mjs';
|
|
import { scan as taintScan } from './taint-tracer.mjs';
|
|
import { scan as gitScan } from './git-forensics.mjs';
|
|
import { scan as networkScan } from './network-mapper.mjs';
|
|
import { scan as memoryScan } from './memory-poisoning-scanner.mjs';
|
|
import { scan as supplyChainScan } from './supply-chain-recheck.mjs';
|
|
import { scan as tfaScan } from './toxic-flow-analyzer.mjs';
|
|
|
|
// Registry of all scanners, executed sequentially in this order.
// Each `fn` is awaited as fn(targetPath, discovery) — or
// fn(targetPath, discovery, priorResults) when `requiresPriorResults`
// is set, so such entries must stay after the scanners they consume.
const SCANNERS = [
  { name: 'unicode', fn: unicodeScan },
  { name: 'entropy', fn: entropyScan },
  { name: 'permission', fn: permissionScan },
  { name: 'dep', fn: depScan },
  { name: 'taint', fn: taintScan },
  { name: 'git', fn: gitScan },
  { name: 'network', fn: networkScan },
  { name: 'memory', fn: memoryScan },
  { name: 'supply-chain', fn: supplyChainScan },
  // Cross-scanner analysis: receives the accumulated `results` object.
  { name: 'toxic-flow', fn: tfaScan, requiresPriorResults: true },
];
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// CLI arg parsing — supports <target>, --log-file <path>, --output-file <path>,
// --format <json|sarif>, --baseline, --save-baseline
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * Parse process argv into orchestrator options.
 *
 * Value flags (`--log-file`, `--output-file`, `--format`) consume the next
 * argument only when one is present and non-empty; otherwise the flag token
 * itself falls through to positional handling, matching prior behavior.
 * The first non-flag token becomes the scan target; extras are ignored.
 *
 * @param {string[]} argv - Full argv (indices 0–1 are node + script path).
 * @returns {{target: ?string, logFile: ?string, outputFile: ?string,
 *            baseline: boolean, saveBaseline: boolean, format: string}}
 */
function parseArgs(argv) {
  const parsed = {
    target: null,
    logFile: null,
    outputFile: null,
    baseline: false,
    saveBaseline: false,
    format: 'json',
  };
  const valueFlags = new Map([
    ['--log-file', 'logFile'],
    ['--output-file', 'outputFile'],
    ['--format', 'format'],
  ]);
  const boolFlags = new Map([
    ['--baseline', 'baseline'],
    ['--save-baseline', 'saveBaseline'],
  ]);
  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    if (valueFlags.has(arg) && argv[i + 1]) {
      parsed[valueFlags.get(arg)] = argv[++i];
    } else if (boolFlags.has(arg)) {
      parsed[boolFlags.get(arg)] = true;
    } else if (!parsed.target) {
      parsed.target = arg;
    }
  }
  return parsed;
}
|
|
|
|
/**
 * Orchestrator entry point.
 *
 * Pipeline: parse args → validate target → discover files once → run every
 * scanner sequentially (crash-isolated) → apply ignore rules → build the
 * JSON envelope → optional baseline diff/save → emit JSON or SARIF to
 * stdout or --output-file → print a summary banner to stderr/log.
 *
 * Exit codes: 0 = clean, 1 = WARNING (also usage / missing-target errors),
 * 2 = BLOCK. Stdout carries ONLY the machine-readable payload; all human
 * progress output goes to stderr and the log file.
 */
async function main() {
  const args = parseArgs(process.argv);
  if (!args.target) {
    // NOTE(review): usage text omits --output-file, --format, --baseline,
    // --save-baseline — consider listing every supported flag.
    console.error('Usage: node scan-orchestrator.mjs <target-path> [--log-file <path>]');
    process.exit(1);
  }

  const targetPath = resolve(args.target);
  if (!existsSync(targetPath)) {
    console.error(`Target path does not exist: ${targetPath}`);
    process.exit(1);
  }

  // Set up cross-platform log file (writes to both stderr and file).
  // Defaults to a timestamped file in the OS temp dir when --log-file is absent.
  const logFilePath = args.logFile || join(tmpdir(), `llm-security-scan-${Date.now()}.log`);
  writeFileSync(logFilePath, ''); // create/truncate
  // Mirror each progress message to stderr AND the log file; stdout stays
  // reserved for the JSON/SARIF payload.
  function log(msg) {
    process.stderr.write(msg);
    appendFileSync(logFilePath, msg);
  }

  const totalStart = Date.now();

  // Shared file discovery — done once, passed to all scanners
  let discovery;
  try {
    discovery = await discoverFiles(targetPath);
    // Log discovery summary to stderr (stdout is reserved for JSON)
    log(
      `[deep-scan] Discovered ${discovery.files.length} files` +
        ` (${discovery.skipped} skipped${discovery.truncated ? ', TRUNCATED' : ''})\n`
    );
  } catch (err) {
    console.error(`File discovery failed: ${err.message}`);
    process.exit(1);
  }

  // Run each scanner sequentially, catching errors per-scanner.
  // Scanners with requiresPriorResults receive accumulated results as 3rd arg.
  const results = {};
  for (const { name, fn, requiresPriorResults } of SCANNERS) {
    resetCounter(); // Reset finding counter per scanner for clean IDs
    log(`[deep-scan] Running ${name} scanner...\n`);
    try {
      results[name] = requiresPriorResults
        ? await fn(targetPath, discovery, results)
        : await fn(targetPath, discovery);
      const r = results[name];
      log(
        `[deep-scan] ${name}: ${r.status} — ${r.findings.length} findings in ${r.duration_ms}ms\n`
      );
    } catch (err) {
      // A single scanner crash must not abort the run: substitute a
      // synthetic error result with the same shape as a successful scan.
      results[name] = {
        scanner: `${name}-scanner`,
        status: 'error',
        files_scanned: 0,
        duration_ms: 0,
        findings: [],
        counts: { critical: 0, high: 0, medium: 0, low: 0, info: 0 },
        error: err.message,
      };
      log(`[deep-scan] ${name}: ERROR — ${err.message}\n`);
    }
  }

  // Apply .llm-security-ignore rules (mutates `results` in place)
  const ignoreRules = loadIgnoreRules(targetPath);
  const suppressed = applyIgnoreRules(results, ignoreRules);
  if (suppressed > 0) {
    log(`[deep-scan] Suppressed ${suppressed} finding(s) via .llm-security-ignore\n`);
  }

  const totalDuration = Date.now() - totalStart;
  const output = envelope(targetPath, results, totalDuration);
  if (suppressed > 0) output.suppressed = suppressed;

  // Include log file path in JSON output (cross-platform — no shell redirect needed)
  output.log_file = logFilePath;

  // ---------------------------------------------------------------------------
  // Baseline diffing — compare against stored baseline and/or save new one.
  // Baselines live under <plugin-root>/reports/baselines, where plugin root
  // is two directories above this module file.
  // ---------------------------------------------------------------------------
  const pluginRoot = dirname(dirname(fileURLToPath(import.meta.url)));
  const baselinesDir = join(pluginRoot, 'reports', 'baselines');

  if (args.baseline) {
    const diff = diffAgainstBaseline(baselinesDir, targetPath, output);
    if (diff) {
      output.diff = diff;
      log(
        `[deep-scan] Baseline diff: ${diff.summary.new} new, ${diff.summary.resolved} resolved, ` +
          `${diff.summary.unchanged} unchanged, ${diff.summary.moved} moved ` +
          `(baseline from ${diff.summary.baseline_timestamp})\n`
      );
    } else {
      // No stored baseline: report null so callers can distinguish
      // "not compared" from "compared, no differences".
      log(`[deep-scan] No baseline found for this target. Use --save-baseline to create one.\n`);
      output.diff = null;
    }
  }

  if (args.saveBaseline) {
    const savedPath = saveBaseline(baselinesDir, targetPath, output);
    output.baseline_saved = savedPath;
    log(`[deep-scan] Baseline saved: ${savedPath}\n`);
  }

  // Output: SARIF or JSON, to file (--output-file) or stdout.
  // NOTE(review): any --format value other than 'sarif' silently falls
  // back to JSON — confirm this fallback is intended or validate upstream.
  const finalOutput = args.format === 'sarif' ? toSARIF(output) : output;
  const jsonStr = JSON.stringify(finalOutput, null, 2) + '\n';
  if (args.outputFile) {
    writeFileSync(args.outputFile, jsonStr);
    output.output_file = args.outputFile;
    // Stdout gets only the compact aggregate (keeps caller context small)
    process.stdout.write(JSON.stringify({ aggregate: output.aggregate, output_file: args.outputFile }) + '\n');
  } else {
    process.stdout.write(jsonStr);
  }

  // Summary banner to stderr + log file
  const agg = output.aggregate;
  log(
    `\n[deep-scan] === COMPLETE ===\n` +
      `[deep-scan] Verdict: ${agg.verdict} | Risk Score: ${agg.risk_score}/100\n` +
      `[deep-scan] Findings: ${agg.total_findings} total ` +
      `(${agg.counts.critical}C ${agg.counts.high}H ${agg.counts.medium}M ${agg.counts.low}L ${agg.counts.info}I)\n` +
      `[deep-scan] Scanners: ${agg.scanners_ok} ok, ${agg.scanners_error} error, ${agg.scanners_skipped} skipped\n` +
      `[deep-scan] Duration: ${totalDuration}ms\n`
  );

  // Exit code based on verdict: 2 blocks, 1 warns, 0 passes — lets CI
  // callers gate on the process exit status alone.
  if (agg.verdict === 'BLOCK') process.exit(2);
  if (agg.verdict === 'WARNING') process.exit(1);
  process.exit(0);
}
|
|
|
|
// Top-level invocation: report any unhandled rejection from main() to
// stderr and exit non-zero so CI callers see the failure. (main() normally
// terminates itself via process.exit with a verdict-based code.)
main().catch(err => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|