#!/usr/bin/env node
// content-extractor.mjs — Pre-extraction indirection layer for remote repo scanning.
//
// Produces a structured JSON "evidence package" that LLM agents analyze
// instead of reading raw (potentially malicious) file content.
//
// Usage: node content-extractor.mjs TARGET_PATH [--output-file OUTPUT_JSON]

import { writeFileSync } from 'node:fs';
import { resolve, relative } from 'node:path';
import { discoverFiles, readTextFile } from './lib/file-discovery.mjs';
import { CRITICAL_PATTERNS, HIGH_PATTERNS } from './lib/injection-patterns.mjs';
import { normalizeForScan } from './lib/string-utils.mjs';
import { parseFrontmatter, classifyPluginFile } from './lib/yaml-frontmatter.mjs';

// ---------------------------------------------------------------------------
// Pattern sets for extraction passes
// ---------------------------------------------------------------------------

/**
 * Shell-command signatures searched for in markdown and source files.
 * Each entry pairs a regex with the label reported for its matches.
 */
const SHELL_CMD_PATTERNS = [
  { pattern: /curl\s+[^|]*\|\s*(?:ba)?sh/gi, label: 'curl-pipe-to-shell' },
  { pattern: /wget\s+[^|]*\|\s*(?:ba)?sh/gi, label: 'wget-pipe-to-shell' },
  { pattern: /curl\s+-[fsSLo]*\s+https?:\/\/\S+/gi, label: 'curl-download' },
  { pattern: /npm\s+install\s+(?!-[DdgE])\S+/gi, label: 'npm-install' },
  { pattern: /pip3?\s+install\s+\S+/gi, label: 'pip-install' },
  { pattern: /yarn\s+add\s+\S+/gi, label: 'yarn-add' },
  { pattern: /chmod\s+[0-7]+\s+\S+/gi, label: 'chmod' },
  { pattern: /sudo\s+\S+/gi, label: 'sudo' },
  { pattern: /eval\s*\(/gi, label: 'eval' },
  {
    pattern: /echo\s+["'][^"']*["']\s*\|\s*base64\s+-d\s*\|\s*(?:ba)?sh/gi,
    label: 'base64-decode-exec',
  },
  { pattern: /gh\s+api\s+[^\\]*\/starred\//gi, label: 'gh-api-star' },
  { pattern: /gh\s+api\s+--method\s+(?:PUT|POST|DELETE)/gi, label: 'gh-api-mutation' },
];

/**
 * References to credential locations: home-directory secret stores, key and
 * certificate file names, secret-bearing environment variables, and
 * `process.env` lookups.
 */
const CREDENTIAL_PATH_PATTERNS = [
  { pattern: /~\/\.ssh\/\S*/g, label: 'ssh-dir' },
  { pattern: /~\/\.aws\/\S*/g, label: 'aws-dir' },
  { pattern: /~\/\.env\b/g, label: 'dotenv' },
  { pattern: /~\/\.npmrc\b/g, label: 'npmrc' },
  { pattern: /~\/\.netrc\b/g, label: 'netrc' },
  { pattern: /~\/\.gitconfig\b/g, label: 'gitconfig' },
  { pattern: /~\/\.gnupg\/\S*/g, label: 'gnupg-dir' },
  { pattern: /~\/Library\/Application\s+Support\/\S+/g, label: 'macos-app-support' },
  { pattern: /~\/\.ethereum\/\S*/g, label: 'ethereum-wallet' },
  { pattern: /wallet\.dat/gi, label: 'wallet-file' },
  { pattern: /id_rsa|id_ed25519|id_ecdsa/g, label: 'ssh-key-file' },
  { pattern: /\.pem\b|\.key\b|\.p12\b|\.pfx\b/g, label: 'cert-key-file' },
  { pattern: /\$AWS_SECRET\w*/gi, label: 'aws-secret-env' },
  { pattern: /\$AZURE_CLIENT_SECRET/gi, label: 'azure-secret-env' },
  { pattern: /\$GOOGLE_APPLICATION_CREDENTIALS/gi, label: 'gcp-creds-env' },
  {
    pattern: /\$(?:NPM_TOKEN|GITHUB_TOKEN|PYPI_TOKEN|ANTHROPIC_API_KEY)/gi,
    label: 'api-token-env',
  },
  {
    pattern: /process\.env\s*(?:\.\s*\w+|\[\s*['"`]\w+['"`]\s*\])/g,
    label: 'process-env-access',
  },
];

/**
 * Persistence mechanisms: cron, launchd, systemd, shell profiles, git hooks.
 */
const PERSISTENCE_PATTERNS = [
  { pattern: /crontab/gi, label: 'crontab' },
  { pattern: /\/etc\/cron\.d/gi, label: 'cron.d' },
  { pattern: /launchctl\s+load/gi, label: 'launchctl-load' },
  { pattern: /LaunchAgents/gi, label: 'LaunchAgents' },
  { pattern: /RunAtLoad|StartInterval|KeepAlive/gi, label: 'plist-persistence' },
  { pattern: /systemctl\s+(?:enable|start)/gi, label: 'systemd' },
  { pattern: /ExecStart\s*=/gi, label: 'systemd-unit' },
  {
    pattern: /\.zshrc|\.bashrc|\.bash_profile|\.profile|\.zprofile|\.zshenv/g,
    label: 'shell-profile',
  },
  { pattern: /\.git\/hooks\//g, label: 'git-hooks' },
  { pattern: /\*\s+\*\s+\*\s+\*\s+\*/g, label: 'cron-schedule' },
];

/**
 * Bare regexes (no labels, no `g` flag) used only as a yes/no check for
 * network activity in a file.
 */
const NETWORK_CALL_PATTERNS = [
  /\bcurl\b/i,
  /\bwget\b/i,
  /\bfetch\s*\(/i,
  /\baxios\b/i,
  /https?:\/\/\S+/i,
  /\.post\s*\(/i,
  /\.send\s*\(/i,
  /XMLHttpRequest/i,
  /WebSocket/i,
];

/** Markers indicating that a source file registers MCP tools. */
const MCP_TOOL_PATTERNS = [
  /server\.tool\s*\(\s*(['"`])([\s\S]*?)\1/g,
  /@mcp\.tool/g,
  /@server\.tool/g,
];

/**
 * Matches `description: "..."` / `description = '...'`; capture group 2 holds
 * the text between the matching quotes.
 */
const MCP_DESC_PATTERN = /description\s*[:=]\s*(['"`])([\s\S]*?)\1/g;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/** Fence info-string languages whose block contents are treated as shell commands. */
const SHELL_FENCE_LANGS = new Set(['bash', 'sh', 'shell', 'zsh', 'console']);

/**
 * Parse command-line arguments.
 *
 * The first argument that is not part of an `--output-file` option is taken as
 * the scan target; later positional arguments are ignored. Both
 * `--output-file PATH` and `--output-file=PATH` are accepted.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{ target: string|null, outputFile: string|null }}
 * @throws {Error} When `--output-file` is given without a path, so the flag
 *   itself is never mistaken for the target.
 */
function parseArgs(argv) {
  const flag = '--output-file';
  const args = { target: null, outputFile: null };

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === flag) {
      if (i + 1 >= argv.length) {
        throw new Error(`${flag} requires a path argument`);
      }
      args.outputFile = argv[++i];
    } else if (arg.startsWith(`${flag}=`)) {
      const value = arg.slice(flag.length + 1);
      if (value === '') {
        throw new Error(`${flag} requires a path argument`);
      }
      args.outputFile = value;
    } else if (!args.target) {
      args.target = arg;
    }
  }
  return args;
}

/**
 * Build a fresh global copy of a regex.
 *
 * A new instance keeps `lastIndex` state out of the shared pattern tables, and
 * the `g` flag is required by `String.prototype.matchAll`.
 *
 * @param {RegExp} pattern
 * @returns {RegExp}
 */
function toGlobalRegex(pattern) {
  const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
  return new RegExp(pattern.source, flags);
}

/**
 * Cut a string to `max` characters, appending `suffix` when it was shortened.
 *
 * @param {string} value
 * @param {number} max
 * @param {string} [suffix='...']
 * @returns {string}
 */
function truncate(value, max, suffix = '...') {
  return value.length > max ? value.substring(0, max) + suffix : value;
}

/**
 * Append every item to `target` one at a time.
 *
 * `target.push(...items)` passes each element as a call argument and throws a
 * RangeError for very large arrays; result sizes here are driven by the
 * scanned (untrusted) repository.
 *
 * @param {Array} target
 * @param {Iterable} items
 */
function appendAll(target, items) {
  for (const item of items) {
    target.push(item);
  }
}

/**
 * Strip injection patterns from text, returning sanitized text plus findings.
 *
 * Every CRITICAL/HIGH pattern is run against the raw text and, when it
 * differs, against the `normalizeForScan` output. A normalized-text hit is
 * reported only when it is not already accounted for by a raw-text hit of the
 * same pattern on the same line, so one occurrence is not listed twice.
 *
 * @param {string} text - Raw file content.
 * @param {string} file - Path recorded in each finding.
 * @returns {{ sanitized: string, findings: Array<{file: string, line: number, label: string, severity: string}> }}
 */
function stripInjection(text, file) {
  const findings = [];
  let sanitized = text;
  const normalized = normalizeForScan(text);
  const variants = normalized !== text ? [text, normalized] : [text];

  const allPatterns = [
    ...CRITICAL_PATTERNS.map((p) => ({ ...p, severity: 'critical' })),
    ...HIGH_PATTERNS.map((p) => ({ ...p, severity: 'high' })),
  ];

  for (const { pattern, label, severity } of allPatterns) {
    const placeholder = `[INJECTION-PATTERN-STRIPPED: ${label}]`;
    // line number -> raw-text hits not yet paired with a normalized-text hit
    const unpairedRawHits = new Map();

    for (const [variantIndex, variant] of variants.entries()) {
      const isRawText = variantIndex === 0;

      for (const match of variant.matchAll(toGlobalRegex(pattern))) {
        // An empty match carries nothing to strip or report.
        if (match[0] === '') continue;

        const line = lineAt(variant, match.index);
        if (isRawText) {
          unpairedRawHits.set(line, (unpairedRawHits.get(line) ?? 0) + 1);
        } else {
          const pending = unpairedRawHits.get(line) ?? 0;
          if (pending > 0) {
            // Same occurrence already reported from the raw text.
            unpairedRawHits.set(line, pending - 1);
            continue;
          }
        }

        findings.push({ file, line, label, severity });
        // Function replacer: the placeholder is inserted literally, without
        // `$&` / `$1` style expansion of the label text.
        sanitized = sanitized.replace(match[0], () => placeholder);
      }
    }
  }

  return { sanitized, findings };
}

/**
 * Extract the 1-based line number for a character index in text.
 *
 * @param {string} text
 * @param {number} index
 * @returns {number}
 */
function lineAt(text, index) {
  return text.substring(0, index).split('\n').length;
}

/**
 * Get the line containing `index` as a context snippet (max 200 chars plus
 * `...` when cut).
 *
 * Located with index searches rather than by splitting the whole text, since
 * this runs once per match.
 *
 * @param {string} text
 * @param {number} index
 * @returns {string}
 */
function contextSnippet(text, index) {
  const lineStart = index > 0 ? text.lastIndexOf('\n', index - 1) + 1 : 0;
  const nextBreak = text.indexOf('\n', index);
  const lineEnd = nextBreak === -1 ? text.length : nextBreak;
  return truncate(text.slice(lineStart, lineEnd), 200);
}

/** Check if file is markdown (`.md` / `.mdx`, case-insensitive). */
function isMd(relPath) {
  return /\.mdx?$/i.test(relPath);
}

/** Check if file is code, judged by extension. */
function isCode(relPath) {
  return /\.(js|mjs|cjs|ts|mts|cts|jsx|tsx|py|pyw|rb|go|rs|java|kt|cs|php)$/i.test(relPath);
}

/** Check if file is a CLAUDE.md (any directory, case-insensitive). */
function isClaudeMd(relPath) {
  return /(?:^|\/|\\)CLAUDE\.md$/i.test(relPath);
}

// ---------------------------------------------------------------------------
// Extraction passes
// ---------------------------------------------------------------------------

/**
 * Run a labelled pattern table over text and collect every match.
 *
 * @param {string} text
 * @param {Array<{pattern: RegExp, label: string}>} patterns
 * @param {string} file - Path recorded in each result.
 * @returns {Array<{file: string, line: number, label: string, match: string, context_snippet: string}>}
 *   `match` is cut to 120 characters.
 */
function extractFromText(text, patterns, file) {
  const results = [];
  for (const { pattern, label } of patterns) {
    // matchAll steps past zero-length matches, so no pattern can stall the scan.
    for (const match of text.matchAll(toGlobalRegex(pattern))) {
      if (match[0] === '') continue;
      results.push({
        file,
        line: lineAt(text, match.index),
        label,
        match: truncate(match[0], 120),
        context_snippet: contextSnippet(text, match.index),
      });
    }
  }
  return results;
}

/**
 * Collect command lines from fenced code blocks in markdown.
 *
 * Fences are tracked line by line so that the closing fence of a non-shell
 * block (for example a `js` block) is never taken for the opening fence of a
 * new one. Only blocks with no language or a language in SHELL_FENCE_LANGS
 * contribute commands. A fence left unclosed runs to the end of the text.
 *
 * @param {string} text
 * @param {string} file - Path recorded in each result.
 * @returns {Array<{file: string, line: number, command: string, context: string}>}
 *   `line` is the line of the block's opening fence; `command` has a leading
 *   `$` prompt removed and is cut to 200 characters. Commands of three
 *   characters or fewer are dropped.
 */
function extractShellFromCodeBlocks(text, file) {
  const results = [];
  let fenceLine = 0; // 1-based line of the open fence; 0 while outside a block
  let fenceLength = 0;
  let isShellBlock = false;

  const lines = text.split('\n');
  for (let i = 0; i < lines.length; i++) {
    const trimmed = lines[i].trim();

    if (fenceLine === 0) {
      // Opening fence: 3+ backticks, then an info string without backticks
      // (which rules out inline code spans written on a single line).
      const opener = /^(`{3,})([^`]*)$/.exec(trimmed);
      if (opener) {
        fenceLine = i + 1;
        fenceLength = opener[1].length;
        const lang = opener[2].trim().split(/\s+/)[0].toLowerCase();
        isShellBlock = lang === '' || SHELL_FENCE_LANGS.has(lang);
      }
      continue;
    }

    // Closing fence: backticks only, at least as many as the opener.
    if (/^`+$/.test(trimmed) && trimmed.length >= fenceLength) {
      fenceLine = 0;
      continue;
    }

    if (!isShellBlock) continue;

    const command = trimmed.replace(/^\$\s*/, '');
    if (command.length > 3) {
      results.push({
        file,
        line: fenceLine,
        command: truncate(command, 200),
        context: 'code_block',
      });
    }
  }
  return results;
}

/**
 * Extract `description` string values from files that register MCP tools.
 *
 * Returns nothing unless at least one MCP_TOOL_PATTERNS marker is present.
 *
 * @param {string} text
 * @param {string} file - Path recorded in each result.
 * @returns {Array<object>} One entry per description, with the text cut to 500
 *   characters, its full length, and any injection labels it triggers.
 */
function extractMcpToolDescriptions(text, file) {
  const registersTools = MCP_TOOL_PATTERNS.some((pattern) =>
    new RegExp(pattern.source, pattern.flags).test(text),
  );
  if (!registersTools) return [];

  const results = [];
  for (const match of text.matchAll(toGlobalRegex(MCP_DESC_PATTERN))) {
    const descText = match[2];
    const injectionLabels = scanDescForInjection(descText);
    results.push({
      file,
      line: lineAt(text, match.index),
      tool_name: null, // Tool name often on separate line
      description_text: truncate(descText, 500),
      char_count: descText.length,
      injection_detected: injectionLabels.length > 0,
      injection_labels: injectionLabels,
    });
  }
  return results;
}

/**
 * List the labels of all CRITICAL/HIGH injection patterns that match text.
 *
 * Each check runs on a fresh RegExp: calling `.test()` directly on a shared
 * pattern that carries the `g` flag would resume from its previous
 * `lastIndex` and could miss a match on the next call.
 *
 * @param {string} text
 * @returns {string[]}
 */
function scanDescForInjection(text) {
  const labels = [];
  for (const { pattern, label } of [...CRITICAL_PATTERNS, ...HIGH_PATTERNS]) {
    if (new RegExp(pattern.source, pattern.flags).test(text)) {
      labels.push(label);
    }
  }
  return labels;
}

/**
 * Build the frontmatter inventory entry for a markdown file.
 *
 * @param {string} relPath
 * @param {string} content - Raw (unsanitized) file content.
 * @returns {object|null} The entry, or null when `parseFrontmatter` yields nothing.
 */
function describeFrontmatter(relPath, content) {
  const fm = parseFrontmatter(content);
  if (!fm) return null;

  const fileType = classifyPluginFile(relPath, fm);
  const tools = fm.allowed_tools || fm.tools || [];
  // The value comes from the scanned repository; coerce it so a non-string
  // description cannot break the string operations below.
  const rawDesc = fm.description || '';
  const desc = typeof rawDesc === 'string' ? rawDesc : String(rawDesc);
  const descInjection = scanDescForInjection(desc);

  return {
    file: relPath,
    type: fileType,
    name: fm.name || null,
    model: fm.model || null,
    tools: Array.isArray(tools) ? tools : [tools],
    description_snippet: truncate(desc, 200),
    injection_in_frontmatter: descInjection.length > 0,
    injection_labels: descInjection.length > 0 ? descInjection : undefined,
  };
}

/**
 * Convert `extractFromText` results into shell-command entries.
 *
 * @param {Array<{file: string, line: number, match: string}>} matches
 * @param {string} context - Value stored in each entry's `context` field.
 * @returns {Array<{file: string, line: number, command: string, context: string}>}
 */
function toShellEntries(matches, context) {
  return matches.map(({ file, line, match }) => ({ file, line, command: match, context }));
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * Scan the target, assemble the evidence package, and emit it.
 *
 * With `--output-file` the full package is written to that file and a compact
 * one-line summary goes to stdout; otherwise the full package goes to stdout.
 * Exits with status 1 when no target is given.
 */
async function main() {
  const startTime = Date.now();
  const { target, outputFile } = parseArgs(process.argv.slice(2));

  if (!target) {
    console.error('Usage: node content-extractor.mjs TARGET_PATH [--output-file OUTPUT_JSON]');
    process.exit(1);
  }

  const targetPath = resolve(target);
  const { files } = await discoverFiles(targetPath);

  // Output containers
  const injectionFindings = [];
  const frontmatterInventory = [];
  const shellCommands = [];
  const credentialRefs = [];
  const persistenceSignals = [];
  const mcpToolDescriptions = [];
  const claudeMdAnalysis = [];
  const crossInstructionFlags = [];
  let filesWithInjections = 0;

  for (const { absPath, relPath } of files) {
    const content = await readTextFile(absPath);
    if (!content) continue;

    const markdown = isMd(relPath);
    const code = isCode(relPath);

    // Pass 1: Injection strip
    const { sanitized, findings: injFindings } = stripInjection(content, relPath);
    if (injFindings.length > 0) {
      appendAll(injectionFindings, injFindings);
      filesWithInjections++;
    }

    // Pass 2: Frontmatter (markdown files only)
    if (markdown) {
      const entry = describeFrontmatter(relPath, content);
      if (entry) frontmatterInventory.push(entry);
    }

    // Pass 3a: Shell commands. Markdown yields code-block lines plus pattern
    // hits labelled 'prose'; code files yield pattern hits labelled 'source_code'.
    const shellPatternHits =
      markdown || code ? extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath) : [];
    let markdownShell = [];
    if (markdown) {
      markdownShell = [
        ...extractShellFromCodeBlocks(sanitized, relPath),
        ...toShellEntries(shellPatternHits, 'prose'),
      ];
      appendAll(shellCommands, markdownShell);
    }
    if (code) {
      appendAll(shellCommands, toShellEntries(shellPatternHits, 'source_code'));
    }

    // Pass 3b: Credential paths
    const creds = extractFromText(sanitized, CREDENTIAL_PATH_PATTERNS, relPath);
    appendAll(credentialRefs, creds);

    // Pass 3c: Persistence
    appendAll(persistenceSignals, extractFromText(sanitized, PERSISTENCE_PATTERNS, relPath));

    // Pass 4: MCP tool descriptions (code files only)
    if (code) {
      appendAll(mcpToolDescriptions, extractMcpToolDescriptions(sanitized, relPath));
    }

    // Pass 5: CLAUDE.md special analysis. Any path accepted by isClaudeMd also
    // satisfies isMd, so the shell and credential results computed for this
    // file in passes 3a/3b are reused rather than extracted a second time.
    if (isClaudeMd(relPath)) {
      claudeMdAnalysis.push({
        file: relPath,
        sanitized_content: truncate(sanitized, 5000, '\n[TRUNCATED]'),
        shell_commands: markdownShell,
        credential_refs: creds,
        injection_findings: injFindings,
      });
    }

    // Pass 6: Cross-instruction combination (credentials + network in one file)
    const touchesNetwork = NETWORK_CALL_PATTERNS.some((p) => p.test(sanitized));
    if (creds.length > 0 && touchesNetwork) {
      crossInstructionFlags.push({
        file: relPath,
        combination: 'credential_access+network_call',
        credential_ref: creds[0]?.label || 'unknown',
        network_ref: 'network call detected in same file',
      });
    }
  }

  // Deterministic verdict
  const hasInjection = injectionFindings.some((f) => f.severity === 'critical');
  const hasPersistence = persistenceSignals.length > 0;
  const hasCredNetCombo = crossInstructionFlags.length > 0;

  let riskLevel = 'low';
  if (hasInjection || hasCredNetCombo) {
    riskLevel = 'critical';
  } else if (injectionFindings.length > 0 || hasPersistence) {
    riskLevel = 'high';
  } else if (credentialRefs.length > 0 || shellCommands.length > 5) {
    riskLevel = 'medium';
  }

  const result = {
    meta: {
      target: targetPath,
      timestamp: new Date().toISOString(),
      files_scanned: files.length,
      files_with_injections: filesWithInjections,
      duration_ms: Date.now() - startTime,
    },
    injection_findings: injectionFindings,
    frontmatter_inventory: frontmatterInventory,
    shell_commands: shellCommands,
    credential_references: credentialRefs,
    persistence_signals: persistenceSignals,
    mcp_tool_descriptions: mcpToolDescriptions,
    claude_md_analysis: claudeMdAnalysis,
    cross_instruction_flags: crossInstructionFlags,
    deterministic_verdict: {
      has_injection: injectionFindings.length > 0,
      has_critical_injection: hasInjection,
      has_persistence: hasPersistence,
      has_credential_network_combo: hasCredNetCombo,
      risk_level: riskLevel,
    },
  };

  if (outputFile) {
    writeFileSync(outputFile, JSON.stringify(result, null, 2));
    // Compact summary to stdout
    const summary = {
      files_scanned: files.length,
      injection_findings: injectionFindings.length,
      shell_commands: shellCommands.length,
      credential_references: credentialRefs.length,
      persistence_signals: persistenceSignals.length,
      mcp_tool_descriptions: mcpToolDescriptions.length,
      claude_md_count: claudeMdAnalysis.length,
      cross_instruction_flags: crossInstructionFlags.length,
      risk_level: riskLevel,
    };
    process.stdout.write(JSON.stringify(summary) + '\n');
  } else {
    process.stdout.write(JSON.stringify(result, null, 2) + '\n');
  }
}

main().catch((err) => {
  console.error(`content-extractor: ${err.message}`);
  process.exit(1);
});