ktg-plugin-marketplace/plugins/llm-security/scanners/content-extractor.mjs

#!/usr/bin/env node
// content-extractor.mjs — Pre-extraction indirection layer for remote repo scanning
// Produces a structured JSON "evidence package" that LLM agents analyze
// instead of reading raw (potentially malicious) file content.
//
// Usage: node content-extractor.mjs <target-path> [--output-file <path>]
// Without --output-file the full evidence package is printed to stdout.

import { writeFileSync } from 'node:fs';
import { resolve, relative } from 'node:path';
import { discoverFiles, readTextFile } from './lib/file-discovery.mjs';
import { CRITICAL_PATTERNS, HIGH_PATTERNS } from './lib/injection-patterns.mjs';
import { normalizeForScan } from './lib/string-utils.mjs';
import { parseFrontmatter, classifyPluginFile } from './lib/yaml-frontmatter.mjs';

// ---------------------------------------------------------------------------
// Pattern sets for extraction passes
// ---------------------------------------------------------------------------

// Shell-command signatures. Each entry pairs a regex with the label recorded
// for its matches; the table is fed to extractFromText() for markdown, code
// files and CLAUDE.md. Every pattern is global and case-insensitive.
const SHELL_CMD_PATTERNS = [
  // Download piped straight into sh/bash.
  { pattern: /curl\s+[^|]*\|\s*(?:ba)?sh/gi, label: 'curl-pipe-to-shell' },
  { pattern: /wget\s+[^|]*\|\s*(?:ba)?sh/gi, label: 'wget-pipe-to-shell' },
  // curl with a single flag cluster made of f/s/S/L/o followed by an http(s) URL.
  { pattern: /curl\s+-[fsSLo]*\s+https?:\/\/\S+/gi, label: 'curl-download' },
  // The lookahead rejects a first token starting with -D, -d, -g or -E
  // (compared case-insensitively because of the `i` flag).
  { pattern: /npm\s+install\s+(?!-[DdgE])\S+/gi, label: 'npm-install' },
  { pattern: /pip3?\s+install\s+\S+/gi, label: 'pip-install' },
  { pattern: /yarn\s+add\s+\S+/gi, label: 'yarn-add' },
  // Only numeric (octal-style) modes are matched, not symbolic ones like +x.
  { pattern: /chmod\s+[0-7]+\s+\S+/gi, label: 'chmod' },
  { pattern: /sudo\s+\S+/gi, label: 'sudo' },
  { pattern: /eval\s*\(/gi, label: 'eval' },
  // echo '<payload>' | base64 -d | sh
  { pattern: /echo\s+["'][^"']*["']\s*\|\s*base64\s+-d\s*\|\s*(?:ba)?sh/gi, label: 'base64-decode-exec' },
  // NOTE(review): `[^\\]*` excludes only backslashes, so this can span
  // newlines between "gh api" and "/starred/" — confirm that is intended.
  { pattern: /gh\s+api\s+[^\\]*\/starred\//gi, label: 'gh-api-star' },
  { pattern: /gh\s+api\s+--method\s+(?:PUT|POST|DELETE)/gi, label: 'gh-api-mutation' },
];

// Credential-reference signatures (home-directory secret locations, key and
// certificate file names, secret-bearing environment variables). Each entry
// pairs a global regex with the label recorded for its matches; the table is
// fed to extractFromText() for every scanned file.
const CREDENTIAL_PATH_PATTERNS = [
  // Paths written with a literal `~/` prefix.
  { pattern: /~\/\.ssh\/\S*/g, label: 'ssh-dir' },
  { pattern: /~\/\.aws\/\S*/g, label: 'aws-dir' },
  { pattern: /~\/\.env\b/g, label: 'dotenv' },
  { pattern: /~\/\.npmrc\b/g, label: 'npmrc' },
  { pattern: /~\/\.netrc\b/g, label: 'netrc' },
  { pattern: /~\/\.gitconfig\b/g, label: 'gitconfig' },
  { pattern: /~\/\.gnupg\/\S*/g, label: 'gnupg-dir' },
  { pattern: /~\/Library\/Application\s+Support\/\S+/g, label: 'macos-app-support' },
  { pattern: /~\/\.ethereum\/\S*/g, label: 'ethereum-wallet' },
  // File names matched anywhere in the text, with no path prefix required.
  { pattern: /wallet\.dat/gi, label: 'wallet-file' },
  { pattern: /id_rsa|id_ed25519|id_ecdsa/g, label: 'ssh-key-file' },
  { pattern: /\.pem\b|\.key\b|\.p12\b|\.pfx\b/g, label: 'cert-key-file' },
  // Shell-style `$VAR` references to secret-bearing environment variables.
  { pattern: /\$AWS_SECRET\w*/gi, label: 'aws-secret-env' },
  { pattern: /\$AZURE_CLIENT_SECRET/gi, label: 'azure-secret-env' },
  { pattern: /\$GOOGLE_APPLICATION_CREDENTIALS/gi, label: 'gcp-creds-env' },
  { pattern: /\$(?:NPM_TOKEN|GITHUB_TOKEN|PYPI_TOKEN|ANTHROPIC_API_KEY)/gi, label: 'api-token-env' },
  // Any `process.env.NAME` or `process.env['NAME']` access, regardless of NAME.
  { pattern: /process\.env\s*(?:\.\s*\w+|\[\s*['"`]\w+['"`]\s*\])/g, label: 'process-env-access' },
];

// Persistence-mechanism signatures (cron, launchd, systemd, shell profiles,
// git hooks). Each entry pairs a global regex with the label recorded for its
// matches; the table is fed to extractFromText() for every scanned file, and
// any hit raises the deterministic risk level to at least "high".
const PERSISTENCE_PATTERNS = [
  { pattern: /crontab/gi, label: 'crontab' },
  { pattern: /\/etc\/cron\.d/gi, label: 'cron.d' },
  { pattern: /launchctl\s+load/gi, label: 'launchctl-load' },
  { pattern: /LaunchAgents/gi, label: 'LaunchAgents' },
  // launchd plist keys.
  { pattern: /RunAtLoad|StartInterval|KeepAlive/gi, label: 'plist-persistence' },
  { pattern: /systemctl\s+(?:enable|start)/gi, label: 'systemd' },
  { pattern: /ExecStart\s*=/gi, label: 'systemd-unit' },
  // Shell startup file names, matched anywhere (case-sensitive).
  { pattern: /\.zshrc|\.bashrc|\.bash_profile|\.profile|\.zprofile|\.zshenv/g, label: 'shell-profile' },
  { pattern: /\.git\/hooks\//g, label: 'git-hooks' },
  // Five whitespace-separated asterisks, i.e. an "every minute" cron expression.
  { pattern: /\*\s+\*\s+\*\s+\*\s+\*/g, label: 'cron-schedule' },
];

// Network-activity indicators: bare regexes without labels. main() only asks
// whether any of them matches a file (via .test()), so none carries the `g`
// flag and the shared instances stay stateless across files.
const NETWORK_CALL_PATTERNS = [
  /\bcurl\b/i, /\bwget\b/i, /\bfetch\s*\(/i, /\baxios\b/i,
  // Any http(s) URL counts as a network indicator, as do .post( / .send( calls.
  /https?:\/\/\S+/i, /\.post\s*\(/i, /\.send\s*\(/i,
  /XMLHttpRequest/i, /WebSocket/i,
];

// Markers that a source file registers MCP tools. extractMcpToolDescriptions()
// uses them only as a presence gate before it looks for description strings.
const MCP_TOOL_PATTERNS = [
  // server.tool("<name>" — group 1 is the quote character, group 2 the quoted text.
  /server\.tool\s*\(\s*(['"`])([\s\S]*?)\1/g,
  // Decorator forms.
  /@mcp\.tool/g,
  /@server\.tool/g,
];

// `description: "<text>"` or `description = "<text>"`. Group 1 is the opening
// quote (single, double or backtick) and group 2 is everything up to the next
// occurrence of that same quote character.
const MCP_DESC_PATTERN = /description\s*[:=]\s*(['"`])([\s\S]*?)\1/g;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Parse the CLI arguments (node binary and script path already removed).
 *
 * The first positional argument becomes the target; later positionals are
 * ignored. `--output-file <path>` sets the output file. A trailing
 * `--output-file` with no value is dropped instead of being mistaken for the
 * target path.
 *
 * @param {string[]} argv - Raw argument list.
 * @returns {{ target: string|null, outputFile: string|null }} Parsed options.
 */
function parseArgs(argv) {
  const args = { target: null, outputFile: null };
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '--output-file') {
      // Consume the value when there is one; a dangling flag must not fall
      // through to the positional branch and become the scan target.
      if (i + 1 < argv.length) {
        args.outputFile = argv[++i];
      }
      continue;
    }
    if (!args.target) {
      args.target = arg;
    }
  }
  return args;
}

/**
 * Strip injection patterns from text, return sanitized text + findings.
 *
 * Every CRITICAL/HIGH pattern is run against the raw text and, when
 * normalizeForScan() changed anything, against the normalized text as well.
 * Each matched substring is replaced (first remaining occurrence) in the
 * sanitized copy by a `[INJECTION-PATTERN-STRIPPED: <label>]` marker.
 *
 * A hit that shows up in both variants on the same line is reported once, not
 * twice. Hits that exist only in the normalized variant are still reported,
 * but can only be stripped if the same literal text occurs in the sanitized
 * copy, because replacement is done by matched text rather than by position.
 *
 * @param {string} text - Raw file content.
 * @param {string} file - Path recorded on each finding.
 * @returns {{ sanitized: string, findings: Array<{file: string, line: number, label: string, severity: string}> }}
 */
function stripInjection(text, file) {
  const findings = [];
  let sanitized = text;
  const normalized = normalizeForScan(text);
  // Scan the normalized form only when normalization actually changed something.
  const variants = normalized !== text ? [text, normalized] : [text];

  const allPatterns = [
    ...CRITICAL_PATTERNS.map(p => ({ ...p, severity: 'critical' })),
    ...HIGH_PATTERNS.map(p => ({ ...p, severity: 'high' })),
  ];

  for (const { pattern, label, severity } of allPatterns) {
    // Fresh global regex per pattern (some have /g, some don't) so the shared
    // pattern objects never carry lastIndex state out of this function.
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const globalPattern = new RegExp(pattern.source, flags);
    const marker = `[INJECTION-PATTERN-STRIPPED: ${label}]`;
    // line -> number of raw-text hits not yet paired with a normalized-text hit.
    const rawHitsByLine = new Map();

    variants.forEach((variant, variantIndex) => {
      globalPattern.lastIndex = 0;
      let match;
      while ((match = globalPattern.exec(variant)) !== null) {
        if (match[0] === '') {
          // A zero-width match never advances exec(); step manually to avoid
          // an endless loop, and report nothing for it.
          globalPattern.lastIndex++;
          continue;
        }
        const line = variant.substring(0, match.index).split('\n').length;
        // Function replacer: the marker is inserted verbatim even if the label
        // contains `$` sequences that String.prototype.replace would expand.
        sanitized = sanitized.replace(match[0], () => marker);

        if (variantIndex === 0) {
          rawHitsByLine.set(line, (rawHitsByLine.get(line) ?? 0) + 1);
        } else {
          const unpaired = rawHitsByLine.get(line) ?? 0;
          if (unpaired > 0) {
            // Same pattern, same line, already reported from the raw text.
            rawHitsByLine.set(line, unpaired - 1);
            continue;
          }
        }
        findings.push({ file, line, label, severity });
      }
    });
  }

  return { sanitized, findings };
}

/** 1-based line number of the character at `index` within `text`. */
function lineAt(text, index) {
  const prefix = text.substring(0, index);
  // One line to start with, plus one for every newline that precedes `index`.
  let lineNumber = 1;
  for (let pos = prefix.indexOf('\n'); pos !== -1; pos = prefix.indexOf('\n', pos + 1)) {
    lineNumber += 1;
  }
  return lineNumber;
}

/** Return the line containing `index`, cut to 200 chars plus '...' when longer. */
function contextSnippet(text, index) {
  // Line start: just past the last newline strictly before `index`.
  const lineStart = index > 0 ? text.lastIndexOf('\n', index - 1) + 1 : 0;
  // Line end: the next newline at or after the start, or end of text.
  const nextBreak = text.indexOf('\n', lineStart);
  const lineEnd = nextBreak === -1 ? text.length : nextBreak;
  const fullLine = text.slice(lineStart, lineEnd);
  if (fullLine.length > 200) {
    return fullLine.substring(0, 200) + '...';
  }
  return fullLine;
}

/** True when the path ends in .md or .mdx (any letter case). */
function isMd(relPath) {
  const markdownExtension = /\.mdx?$/i;
  return markdownExtension.test(relPath);
}

/** True when the path ends in one of the recognised source-code extensions (any letter case). */
function isCode(relPath) {
  const sourceExtension = /\.(js|mjs|cjs|ts|mts|cts|jsx|tsx|py|pyw|rb|go|rs|java|kt|cs|php)$/i;
  return sourceExtension.test(relPath);
}

/** True when the final path segment is CLAUDE.md (any letter case, `/` or `\` separators). */
function isClaudeMd(relPath) {
  const claudeFileName = /(?:^|\/|\\)CLAUDE\.md$/i;
  return claudeFileName.test(relPath);
}

// ---------------------------------------------------------------------------
// Extraction passes
// ---------------------------------------------------------------------------

/**
 * Run every `{ pattern, label }` entry against `text` and collect one record
 * per non-empty match.
 *
 * Patterns are re-created with the `g` flag so all occurrences are found and
 * the shared pattern objects keep no lastIndex state. Zero-width matches are
 * skipped; matchAll() steps past them, so a pattern that can match the empty
 * string cannot stall the scan.
 *
 * @param {string} text - Content to scan.
 * @param {Array<{pattern: RegExp, label: string}>} patterns - Pattern table.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<{file: string, line: number, label: string, match: string, context_snippet: string}>}
 *   `match` is cut to 120 chars plus '...' when longer.
 */
function extractFromText(text, patterns, file) {
  const results = [];
  for (const { pattern, label } of patterns) {
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    for (const hit of text.matchAll(new RegExp(pattern.source, flags))) {
      const matched = hit[0];
      if (matched === '') continue;
      results.push({
        file,
        line: lineAt(text, hit.index),
        label,
        match: matched.length > 120 ? matched.substring(0, 120) + '...' : matched,
        context_snippet: contextSnippet(text, hit.index),
      });
    }
  }
  return results;
}

/**
 * Collect the command lines found inside fenced code blocks that are either
 * unlabeled or labeled bash/sh/shell/zsh/console (case-insensitive).
 *
 * Every fenced block is matched as an opener/closer pair regardless of its
 * language, and non-shell blocks are then discarded. Pairing all fences keeps
 * the closing fence of e.g. a ```python block from being taken as the opener
 * of a new block, which would report the prose that follows as commands.
 *
 * @param {string} text - Markdown content.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<{file: string, line: number, command: string, context: string}>}
 *   `line` is the line of the block's opening fence (shared by every command
 *   in that block); `command` is cut to 200 chars plus '...' when longer.
 */
function extractShellFromCodeBlocks(text, file) {
  const results = [];
  const shellLanguages = new Set(['', 'bash', 'sh', 'shell', 'zsh', 'console']);
  // Group 1: info string on the opening fence; group 2: block body.
  const fenceRe = /```([^\n`]*)\n([\s\S]*?)```/g;

  for (const block of text.matchAll(fenceRe)) {
    // Only the first word of the info string names the language.
    const [language] = block[1].trim().toLowerCase().split(/\s+/);
    if (!shellLanguages.has(language)) continue;

    const blockLine = lineAt(text, block.index);
    for (const rawLine of block[2].split('\n')) {
      // Trim first so an indented "$ " prompt is stripped as well.
      const command = rawLine.trim().replace(/^\$\s*/, '');
      if (command.length <= 3) continue;
      results.push({
        file,
        line: blockLine,
        command: command.length > 200 ? command.substring(0, 200) + '...' : command,
        context: 'code_block',
      });
    }
  }
  return results;
}

/**
 * Collect `description: "..."` / `description = "..."` strings from a file
 * that shows at least one MCP tool marker, tagging each with any injection
 * labels found in the description text. Files without a marker yield [].
 *
 * @param {string} text - Source content.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<object>} One record per description; `description_text` is
 *   cut to 500 chars plus '...' when longer, `char_count` is the full length.
 */
function extractMcpToolDescriptions(text, file) {
  // Gate: test each marker through a fresh copy so no lastIndex state leaks.
  const mentionsMcp = MCP_TOOL_PATTERNS.some(
    ({ source, flags }) => new RegExp(source, flags).test(text),
  );
  if (!mentionsMcp) {
    return [];
  }

  const descriptionRe = new RegExp(MCP_DESC_PATTERN.source, MCP_DESC_PATTERN.flags);
  return Array.from(text.matchAll(descriptionRe), (hit) => {
    const descText = hit[2];
    const labels = scanDescForInjection(descText);
    const shown = descText.length > 500 ? descText.substring(0, 500) + '...' : descText;
    return {
      file,
      line: lineAt(text, hit.index),
      tool_name: null, // Tool name often on separate line
      description_text: shown,
      char_count: descText.length,
      injection_detected: labels.length > 0,
      injection_labels: labels,
    };
  });
}

/**
 * Return the labels of every CRITICAL/HIGH injection pattern that matches
 * `text`.
 *
 * Each pattern is tested through a fresh RegExp copy. The shared pattern
 * objects may carry the `g` flag, and calling .test() on them directly would
 * leave lastIndex set after a hit, so the next call could start mid-string
 * and miss a match.
 *
 * @param {string} text - Description text to check.
 * @returns {string[]} Labels of the matching patterns, in table order.
 */
function scanDescForInjection(text) {
  const labels = [];
  for (const { pattern, label } of [...CRITICAL_PATTERNS, ...HIGH_PATTERNS]) {
    const probe = new RegExp(pattern.source, pattern.flags);
    if (probe.test(text)) labels.push(label);
  }
  return labels;
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * CLI entry point: discover the files under the target path, run the
 * extraction passes over each readable file, derive a deterministic risk
 * verdict, and emit the resulting evidence package as JSON.
 *
 * With --output-file the full package is written to that file and a compact
 * one-line summary goes to stdout; otherwise the full package goes to stdout.
 * Exits with status 1 when no target path is given.
 */
async function main() {
  const startTime = Date.now();
  const { target, outputFile } = parseArgs(process.argv.slice(2));

  if (!target) {
    console.error('Usage: node content-extractor.mjs <target-path> --output-file <path>');
    process.exit(1);
  }

  const targetPath = resolve(target);
  const discovery = await discoverFiles(targetPath);
  const { files } = discovery;

  // Output containers
  const injectionFindings = [];
  const frontmatterInventory = [];
  const shellCommands = [];
  const credentialRefs = [];
  const persistenceSignals = [];
  const mcpToolDescriptions = [];
  const claudeMdAnalysis = [];
  const crossInstructionFlags = [];
  let filesWithInjections = 0;

  // Process each file
  for (const fileInfo of files) {
    const { absPath, relPath } = fileInfo;
    const content = await readTextFile(absPath);
    // Skip files for which readTextFile() returned a falsy value (this also
    // skips empty files). NOTE(review): presumably unreadable/binary files
    // come back as null — confirm against lib/file-discovery.mjs.
    if (!content) continue;

    // Pass 1: Injection strip
    // Later passes (3a onward) work on `sanitized`, not on the raw content.
    const { sanitized, findings: injFindings } = stripInjection(content, relPath);
    if (injFindings.length > 0) {
      injectionFindings.push(...injFindings);
      filesWithInjections++;
    }

    // Pass 2: Frontmatter (markdown files only)
    // Parsed from the raw `content`; only the description field is checked
    // for injection patterns.
    if (isMd(relPath)) {
      const fm = parseFrontmatter(content);
      if (fm) {
        const fileType = classifyPluginFile(relPath, fm);
        const tools = fm.allowed_tools || fm.tools || [];
        const desc = fm.description || '';
        const descInjection = scanDescForInjection(desc);
        frontmatterInventory.push({
          file: relPath,
          type: fileType,
          name: fm.name || null,
          model: fm.model || null,
          // A scalar tools value is wrapped so `tools` is always an array.
          tools: Array.isArray(tools) ? tools : [tools],
          description_snippet: desc.length > 200 ? desc.substring(0, 200) + '...' : desc,
          injection_in_frontmatter: descInjection.length > 0,
          // undefined keys are dropped by JSON.stringify, so clean entries
          // carry no injection_labels field at all.
          injection_labels: descInjection.length > 0 ? descInjection : undefined,
        });
      }
    }

    // Pass 3a: Shell commands (markdown — code blocks + prose patterns)
    // The prose patterns run over the whole file, fenced blocks included, so
    // a command inside a shell block can be listed under both contexts.
    if (isMd(relPath)) {
      shellCommands.push(...extractShellFromCodeBlocks(sanitized, relPath));
      const proseShell = extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath);
      for (const s of proseShell) {
        shellCommands.push({
          file: s.file, line: s.line,
          command: s.match,
          context: 'prose',
        });
      }
    }
    // Also extract from code files
    if (isCode(relPath)) {
      const codeShell = extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath);
      for (const s of codeShell) {
        shellCommands.push({
          file: s.file, line: s.line,
          command: s.match,
          context: 'source_code',
        });
      }
    }

    // Pass 3b: Credential paths
    const creds = extractFromText(sanitized, CREDENTIAL_PATH_PATTERNS, relPath);
    credentialRefs.push(...creds);

    // Pass 3c: Persistence
    const persistence = extractFromText(sanitized, PERSISTENCE_PATTERNS, relPath);
    persistenceSignals.push(...persistence);

    // Pass 4: MCP tool descriptions (code files only)
    if (isCode(relPath)) {
      mcpToolDescriptions.push(...extractMcpToolDescriptions(sanitized, relPath));
    }

    // Pass 5: CLAUDE.md special analysis
    // CLAUDE.md also passes isMd(), so its commands and credential refs are
    // already in the global lists; they are collected again here so the
    // per-file record is self-contained.
    if (isClaudeMd(relPath)) {
      const claudeShell = [
        ...extractShellFromCodeBlocks(sanitized, relPath),
        ...extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath).map(s => ({
          file: s.file, line: s.line, command: s.match, context: 'prose',
        })),
      ];
      const claudeCreds = extractFromText(sanitized, CREDENTIAL_PATH_PATTERNS, relPath);
      claudeMdAnalysis.push({
        file: relPath,
        // Only the sanitized text is embedded, capped at 5000 chars.
        sanitized_content: sanitized.length > 5000 ? sanitized.substring(0, 5000) + '\n[TRUNCATED]' : sanitized,
        shell_commands: claudeShell,
        credential_refs: claudeCreds,
        injection_findings: injFindings.filter(f => f.file === relPath),
      });
    }

    // Pass 6: Cross-instruction combination
    // Flags any file that contains both a credential reference and a network
    // indicator; only the first credential label is recorded.
    const hasCred = creds.length > 0;
    const hasNetwork = NETWORK_CALL_PATTERNS.some(p => p.test(sanitized));
    if (hasCred && hasNetwork) {
      crossInstructionFlags.push({
        file: relPath,
        combination: 'credential_access+network_call',
        credential_ref: creds[0]?.label || 'unknown',
        network_ref: 'network call detected in same file',
      });
    }
  }

  // Deterministic verdict
  // Precedence (first rule that applies wins):
  //   critical — a critical-severity injection, or a credential+network combo
  //   high     — any other injection finding, or any persistence signal
  //   medium   — any credential reference, or more than 5 shell commands
  //   low      — none of the above
  const hasInjection = injectionFindings.some(f => f.severity === 'critical');
  const hasPersistence = persistenceSignals.length > 0;
  const hasCredNetCombo = crossInstructionFlags.length > 0;
  let riskLevel = 'low';
  if (hasInjection || hasCredNetCombo) riskLevel = 'critical';
  else if (injectionFindings.length > 0 || hasPersistence) riskLevel = 'high';
  else if (credentialRefs.length > 0 || shellCommands.length > 5) riskLevel = 'medium';

  const result = {
    meta: {
      target: targetPath,
      timestamp: new Date().toISOString(),
      // Counts every discovered file, including those skipped above.
      files_scanned: files.length,
      files_with_injections: filesWithInjections,
      duration_ms: Date.now() - startTime,
    },
    injection_findings: injectionFindings,
    frontmatter_inventory: frontmatterInventory,
    shell_commands: shellCommands,
    credential_references: credentialRefs,
    persistence_signals: persistenceSignals,
    mcp_tool_descriptions: mcpToolDescriptions,
    claude_md_analysis: claudeMdAnalysis,
    cross_instruction_flags: crossInstructionFlags,
    deterministic_verdict: {
      // has_injection is true for any severity; has_critical_injection only
      // for critical-severity findings.
      has_injection: injectionFindings.length > 0,
      has_critical_injection: hasInjection,
      has_persistence: hasPersistence,
      has_credential_network_combo: hasCredNetCombo,
      risk_level: riskLevel,
    },
  };

  if (outputFile) {
    writeFileSync(outputFile, JSON.stringify(result, null, 2));
    // Compact summary to stdout
    const summary = {
      files_scanned: files.length,
      injection_findings: injectionFindings.length,
      shell_commands: shellCommands.length,
      credential_references: credentialRefs.length,
      persistence_signals: persistenceSignals.length,
      mcp_tool_descriptions: mcpToolDescriptions.length,
      claude_md_count: claudeMdAnalysis.length,
      cross_instruction_flags: crossInstructionFlags.length,
      risk_level: riskLevel,
    };
    process.stdout.write(JSON.stringify(summary) + '\n');
  } else {
    // No output file: the full evidence package goes to stdout instead.
    process.stdout.write(JSON.stringify(result, null, 2) + '\n');
  }
}

// Run the CLI; any rejection from main() is reported on stderr and turns into
// exit status 1.
main().catch((error) => {
  const { message } = error;
  console.error(`content-extractor: ${message}`);
  process.exit(1);
});