ktg-plugin-marketplace/plugins/llm-security/scanners/content-extractor.mjs

423 lines
15 KiB
JavaScript

#!/usr/bin/env node
// content-extractor.mjs — Pre-extraction indirection layer for remote repo scanning
// Produces a structured JSON "evidence package" that LLM agents analyze
// instead of reading raw (potentially malicious) file content.
//
// Usage: node content-extractor.mjs <target-path> --output-file <path>
import { writeFileSync } from 'node:fs';
import { resolve, relative } from 'node:path';
import { discoverFiles, readTextFile } from './lib/file-discovery.mjs';
import { CRITICAL_PATTERNS, HIGH_PATTERNS } from './lib/injection-patterns.mjs';
import { normalizeForScan } from './lib/string-utils.mjs';
import { parseFrontmatter, classifyPluginFile } from './lib/yaml-frontmatter.mjs';
// ---------------------------------------------------------------------------
// Pattern sets for extraction passes
// ---------------------------------------------------------------------------
// Shell-command signatures. Each entry pairs a regex with a short label;
// main() runs this table through extractFromText() for markdown and code files.
const SHELL_CMD_PATTERNS = [
{ pattern: /curl\s+[^|]*\|\s*(?:ba)?sh/gi, label: 'curl-pipe-to-shell' },
{ pattern: /wget\s+[^|]*\|\s*(?:ba)?sh/gi, label: 'wget-pipe-to-shell' },
{ pattern: /curl\s+-[fsSLo]*\s+https?:\/\/\S+/gi, label: 'curl-download' },
// The negative lookahead skips installs whose first argument starts with -D, -d, -g or -E.
{ pattern: /npm\s+install\s+(?!-[DdgE])\S+/gi, label: 'npm-install' },
{ pattern: /pip3?\s+install\s+\S+/gi, label: 'pip-install' },
{ pattern: /yarn\s+add\s+\S+/gi, label: 'yarn-add' },
{ pattern: /chmod\s+[0-7]+\s+\S+/gi, label: 'chmod' },
{ pattern: /sudo\s+\S+/gi, label: 'sudo' },
{ pattern: /eval\s*\(/gi, label: 'eval' },
{ pattern: /echo\s+["'][^"']*["']\s*\|\s*base64\s+-d\s*\|\s*(?:ba)?sh/gi, label: 'base64-decode-exec' },
{ pattern: /gh\s+api\s+[^\\]*\/starred\//gi, label: 'gh-api-star' },
{ pattern: /gh\s+api\s+--method\s+(?:PUT|POST|DELETE)/gi, label: 'gh-api-mutation' },
];
// References to credential locations: home-directory paths, key/certificate
// file names, secret-bearing environment variables and process.env lookups.
// Same { pattern, label } shape as above; consumed by extractFromText().
const CREDENTIAL_PATH_PATTERNS = [
{ pattern: /~\/\.ssh\/\S*/g, label: 'ssh-dir' },
{ pattern: /~\/\.aws\/\S*/g, label: 'aws-dir' },
{ pattern: /~\/\.env\b/g, label: 'dotenv' },
{ pattern: /~\/\.npmrc\b/g, label: 'npmrc' },
{ pattern: /~\/\.netrc\b/g, label: 'netrc' },
{ pattern: /~\/\.gitconfig\b/g, label: 'gitconfig' },
{ pattern: /~\/\.gnupg\/\S*/g, label: 'gnupg-dir' },
{ pattern: /~\/Library\/Application\s+Support\/\S+/g, label: 'macos-app-support' },
{ pattern: /~\/\.ethereum\/\S*/g, label: 'ethereum-wallet' },
{ pattern: /wallet\.dat/gi, label: 'wallet-file' },
{ pattern: /id_rsa|id_ed25519|id_ecdsa/g, label: 'ssh-key-file' },
{ pattern: /\.pem\b|\.key\b|\.p12\b|\.pfx\b/g, label: 'cert-key-file' },
{ pattern: /\$AWS_SECRET\w*/gi, label: 'aws-secret-env' },
{ pattern: /\$AZURE_CLIENT_SECRET/gi, label: 'azure-secret-env' },
{ pattern: /\$GOOGLE_APPLICATION_CREDENTIALS/gi, label: 'gcp-creds-env' },
{ pattern: /\$(?:NPM_TOKEN|GITHUB_TOKEN|PYPI_TOKEN|ANTHROPIC_API_KEY)/gi, label: 'api-token-env' },
// Matches both process.env.NAME and process.env['NAME'] (any quote style).
{ pattern: /process\.env\s*(?:\.\s*\w+|\[\s*['"`]\w+['"`]\s*\])/g, label: 'process-env-access' },
];
// Persistence mechanisms: cron, launchd, systemd, shell profile files and git hooks.
// Same { pattern, label } shape; consumed by extractFromText().
const PERSISTENCE_PATTERNS = [
{ pattern: /crontab/gi, label: 'crontab' },
{ pattern: /\/etc\/cron\.d/gi, label: 'cron.d' },
{ pattern: /launchctl\s+load/gi, label: 'launchctl-load' },
{ pattern: /LaunchAgents/gi, label: 'LaunchAgents' },
{ pattern: /RunAtLoad|StartInterval|KeepAlive/gi, label: 'plist-persistence' },
{ pattern: /systemctl\s+(?:enable|start)/gi, label: 'systemd' },
{ pattern: /ExecStart\s*=/gi, label: 'systemd-unit' },
{ pattern: /\.zshrc|\.bashrc|\.bash_profile|\.profile|\.zprofile|\.zshenv/g, label: 'shell-profile' },
{ pattern: /\.git\/hooks\//g, label: 'git-hooks' },
// Five whitespace-separated asterisks, i.e. an "every minute" cron expression.
{ pattern: /\*\s+\*\s+\*\s+\*\s+\*/g, label: 'cron-schedule' },
];
// Bare regexes (no labels) used only as a yes/no "does this file talk to the
// network" probe. None carries the /g flag, so calling .test() on them
// directly does not leave lastIndex state behind.
const NETWORK_CALL_PATTERNS = [
/\bcurl\b/i, /\bwget\b/i, /\bfetch\s*\(/i, /\baxios\b/i,
/https?:\/\/\S+/i, /\.post\s*\(/i, /\.send\s*\(/i,
/XMLHttpRequest/i, /WebSocket/i,
];
// Markers indicating that a source file registers MCP tools. These carry /g,
// so extractMcpToolDescriptions() clones each one before testing.
const MCP_TOOL_PATTERNS = [
/server\.tool\s*\(\s*(['"`])([\s\S]*?)\1/g,
/@mcp\.tool/g,
/@server\.tool/g,
];
// Captures a quoted `description: "..."` / `description = '...'` value.
// Group 1 is the opening quote character, group 2 the text up to the next
// occurrence of that same quote.
const MCP_DESC_PATTERN = /description\s*[:=]\s*(['"`])([\s\S]*?)\1/g;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Parse the command-line arguments (without the node/script prefix).
 *
 * Recognised forms:
 *   <target-path>
 *   --output-file <path>
 *   --output-file=<path>
 *
 * The first argument that is not part of an --output-file option becomes the
 * target; later positional arguments are ignored.
 *
 * @param {string[]} argv - Arguments, typically process.argv.slice(2).
 * @returns {{ target: string|null, outputFile: string|null }}
 */
function parseArgs(argv) {
  const OUTPUT_FLAG = '--output-file';
  const args = { target: null, outputFile: null };

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];

    if (arg === OUTPUT_FLAG) {
      // Consume the following value when there is one. A trailing flag with
      // no value is dropped instead of being mistaken for the target path.
      if (i + 1 < argv.length) {
        args.outputFile = argv[++i];
      }
    } else if (arg.startsWith(`${OUTPUT_FLAG}=`)) {
      args.outputFile = arg.slice(OUTPUT_FLAG.length + 1);
    } else if (!args.target) {
      args.target = arg;
    }
  }

  return args;
}
/**
 * Strip injection patterns from text, return sanitized text + findings.
 *
 * Every CRITICAL/HIGH pattern is run against the raw text and, when
 * normalizeForScan() changes the text, against the normalized form as well.
 * Each matched string is replaced (first remaining occurrence) in the
 * sanitized copy by an `[INJECTION-PATTERN-STRIPPED: <label>]` marker.
 *
 * Limitation: a match that exists only in the normalized form is reported as
 * a finding, but its literal text may not be present in the raw text, in
 * which case nothing is replaced.
 *
 * @param {string} text - Raw file content.
 * @param {string} file - Path recorded on every finding.
 * @returns {{ sanitized: string, findings: Array<{file: string, line: number, label: string, severity: string}> }}
 */
function stripInjection(text, file) {
  const findings = [];
  let sanitized = text;
  const normalized = normalizeForScan(text);
  const variants = normalized !== text ? [text, normalized] : [text];
  const allPatterns = [
    ...CRITICAL_PATTERNS.map((p) => ({ ...p, severity: 'critical' })),
    ...HIGH_PATTERNS.map((p) => ({ ...p, severity: 'high' })),
  ];

  for (const { pattern, label, severity } of allPatterns) {
    // Work on a private global clone so the shared pattern's lastIndex is
    // never touched (some patterns have /g, some don't).
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const globalPattern = new RegExp(pattern.source, flags);
    const marker = `[INJECTION-PATTERN-STRIPPED: ${label}]`;
    // line -> number of raw-text hits not yet paired with a normalized hit.
    // Used to avoid reporting the same injection twice when it is visible in
    // both the raw and the normalized variant.
    const rawHitsPerLine = new Map();

    for (const variant of variants) {
      const isRaw = variant === text;
      globalPattern.lastIndex = 0;
      let match;
      while ((match = globalPattern.exec(variant)) !== null) {
        if (match[0] === '') {
          // A zero-length match never advances lastIndex; step manually so
          // the loop terminates, and do not report an empty hit.
          globalPattern.lastIndex += 1;
          continue;
        }

        const line = lineAt(variant, match.index);
        // Function replacer: `$&`, `$1`, ... inside the label must not be
        // interpreted as replacement patterns.
        sanitized = sanitized.replace(match[0], () => marker);

        if (isRaw) {
          rawHitsPerLine.set(line, (rawHitsPerLine.get(line) ?? 0) + 1);
        } else {
          const unpaired = rawHitsPerLine.get(line) ?? 0;
          if (unpaired > 0) {
            // Already reported from the raw text on this line.
            rawHitsPerLine.set(line, unpaired - 1);
            continue;
          }
        }
        findings.push({ file, line, label, severity });
      }
    }
  }

  return { sanitized, findings };
}
/**
 * Translate a character offset into a 1-based line number.
 *
 * @param {string} text - Full text being scanned.
 * @param {number} index - Offset of the match inside `text`.
 * @returns {number} One plus the number of newlines that precede `index`.
 */
function lineAt(text, index) {
  const prefix = text.substring(0, index);
  let newlineCount = 0;
  for (let pos = prefix.indexOf('\n'); pos !== -1; pos = prefix.indexOf('\n', pos + 1)) {
    newlineCount += 1;
  }
  return newlineCount + 1;
}
/**
 * Return the full line that contains `index`, capped at 200 characters
 * (longer lines are cut and suffixed with '...').
 *
 * @param {string} text - Full text being scanned.
 * @param {number} index - Offset of the match inside `text`.
 * @returns {string} The surrounding line, possibly truncated.
 */
function contextSnippet(text, index) {
  // The line starts right after the last newline that precedes the offset...
  const before = text.substring(0, index);
  const lineStart = before.lastIndexOf('\n') + 1;
  // ...and runs up to the next newline, or to the end of the text.
  const lineEnd = text.indexOf('\n', lineStart);
  const fullLine = lineEnd === -1 ? text.substring(lineStart) : text.substring(lineStart, lineEnd);

  if (fullLine.length > 200) {
    return fullLine.substring(0, 200) + '...';
  }
  return fullLine;
}
/**
 * Tell whether a path names a markdown file (.md or .mdx, any letter case).
 *
 * @param {string} relPath - File path to classify.
 * @returns {boolean}
 */
function isMd(relPath) {
  const markdownSuffix = /\.mdx?$/i;
  return markdownSuffix.test(relPath);
}
/**
 * Tell whether a path carries one of the recognised source-code extensions
 * (any letter case).
 *
 * @param {string} relPath - File path to classify.
 * @returns {boolean}
 */
function isCode(relPath) {
  const sourceSuffix = /\.(js|mjs|cjs|ts|mts|cts|jsx|tsx|py|pyw|rb|go|rs|java|kt|cs|php)$/i;
  return sourceSuffix.test(relPath);
}
/**
 * Tell whether a path's final segment is CLAUDE.md (any letter case),
 * accepting both '/' and '\' as path separators.
 *
 * @param {string} relPath - File path to classify.
 * @returns {boolean}
 */
function isClaudeMd(relPath) {
  const claudeFile = /(?:^|\/|\\)CLAUDE\.md$/i;
  return claudeFile.test(relPath);
}
// ---------------------------------------------------------------------------
// Extraction passes
// ---------------------------------------------------------------------------
/**
 * Run a table of labelled regexes over `text` and collect every match.
 *
 * @param {string} text - Content to scan.
 * @param {Array<{pattern: RegExp, label: string}>} patterns - Pattern table.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<{file: string, line: number, label: string, match: string, context_snippet: string}>}
 *   One record per match, in pattern order then match order. `match` is
 *   capped at 120 characters (suffix '...' when cut).
 */
function extractFromText(text, patterns, file) {
  const clip = (matched) => (matched.length > 120 ? matched.substring(0, 120) + '...' : matched);
  const hits = [];

  for (const entry of patterns) {
    // Scan with a private global clone so the table's regex keeps its state.
    const source = entry.pattern;
    const flags = source.flags.includes('g') ? source.flags : source.flags + 'g';
    const scanner = new RegExp(source.source, flags);

    for (let found = scanner.exec(text); found !== null; found = scanner.exec(text)) {
      const offset = found.index;
      hits.push({
        file,
        line: lineAt(text, offset),
        label: entry.label,
        match: clip(found[0]),
        context_snippet: contextSnippet(text, offset),
      });
    }
  }

  return hits;
}
/**
 * Collect the command lines found inside fenced code blocks that are either
 * unlabelled or labelled as a shell dialect (bash, sh, shell, zsh, console).
 *
 * Every fenced block is consumed as an open/close pair, whatever its
 * language, so the closing fence of e.g. a ```js block is never mistaken for
 * the opening fence of a following block. Each command is reported with the
 * line it actually sits on, not the line of the fence.
 *
 * @param {string} text - Markdown content.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<{file: string, line: number, command: string, context: string}>}
 *   One record per non-trivial line (more than 3 characters after removing a
 *   leading `$ ` prompt and trimming). `command` is capped at 200 characters.
 */
function extractShellFromCodeBlocks(text, file) {
  const shellLanguages = new Set(['bash', 'sh', 'shell', 'zsh', 'console']);
  // Group 1: info string after the opening fence; group 2: block body.
  const fenceRe = /```([^\n`]*)\n([\s\S]*?)```/g;
  const results = [];

  for (const block of text.matchAll(fenceRe)) {
    // Only the first word of the info string names the language
    // ("bash title=install" -> "bash").
    const language = block[1].trim().split(/\s+/)[0].toLowerCase();
    if (language !== '' && !shellLanguages.has(language)) {
      continue;
    }

    // The body starts on the line right after the opening fence.
    const firstBodyLine = lineAt(text, block.index) + 1;
    const bodyLines = block[2].split('\n');

    bodyLines.forEach((line, offset) => {
      const trimmed = line.replace(/^\$\s*/, '').trim();
      if (trimmed.length > 3) {
        results.push({
          file,
          line: firstBodyLine + offset,
          command: trimmed.length > 200 ? trimmed.substring(0, 200) + '...' : trimmed,
          context: 'code_block',
        });
      }
    });
  }

  return results;
}
/**
 * Pull `description` string values out of a source file that registers MCP
 * tools, and flag the ones that match an injection pattern.
 *
 * A file is only examined when at least one MCP_TOOL_PATTERNS regex matches;
 * otherwise an empty array is returned.
 *
 * @param {string} text - Source file content.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<object>} One record per description literal. `tool_name`
 *   is always null (the name is not resolved here), `description_text` is
 *   capped at 500 characters, `char_count` is the uncapped length.
 */
function extractMcpToolDescriptions(text, file) {
  // Gate: probe with fresh clones because the shared patterns carry /g.
  const registersTools = MCP_TOOL_PATTERNS.some(
    (probe) => new RegExp(probe.source, probe.flags).test(text),
  );
  if (!registersTools) {
    return [];
  }

  const descriptions = [];
  const descRe = new RegExp(MCP_DESC_PATTERN.source, MCP_DESC_PATTERN.flags);

  for (let hit = descRe.exec(text); hit !== null; hit = descRe.exec(text)) {
    const body = hit[2];
    const labels = scanDescForInjection(body);
    const preview = body.length > 500 ? body.substring(0, 500) + '...' : body;

    descriptions.push({
      file,
      line: lineAt(text, hit.index),
      tool_name: null, // Tool name often on separate line
      description_text: preview,
      char_count: body.length,
      injection_detected: labels.length > 0,
      injection_labels: labels,
    });
  }

  return descriptions;
}
/**
 * Return the labels of every CRITICAL/HIGH injection pattern that matches
 * a short piece of text (a frontmatter or tool description).
 *
 * @param {string} text - Description text to check.
 * @returns {string[]} Labels of the matching patterns, in table order.
 */
function scanDescForInjection(text) {
  const labels = [];
  const allPatterns = [...CRITICAL_PATTERNS, ...HIGH_PATTERNS];
  for (const { pattern, label } of allPatterns) {
    // The pattern objects are shared across calls. A /g (or /y) regex
    // remembers lastIndex after a successful test(), which would make the
    // next call start mid-string and miss matches. Reset before and after.
    pattern.lastIndex = 0;
    const matched = pattern.test(text);
    pattern.lastIndex = 0;
    if (matched) {
      labels.push(label);
    }
  }
  return labels;
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * CLI entry point: scan every discovered file under the target path and emit
 * a JSON "evidence package".
 *
 * Steps, in order:
 *  1. Parse arguments; print usage and exit with code 1 when no target is given.
 *  2. Discover files under the resolved target path.
 *  3. For each readable file run the extraction passes (injection strip,
 *     frontmatter, shell commands, credential references, persistence
 *     signals, MCP tool descriptions, CLAUDE.md analysis, credential+network
 *     combination).
 *  4. Derive a risk level from the collected signals.
 *  5. With --output-file: write the full result to that file and print a
 *     one-line JSON summary to stdout. Without it: print the full result to
 *     stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
const startTime = Date.now();
const { target, outputFile } = parseArgs(process.argv.slice(2));
if (!target) {
console.error('Usage: node content-extractor.mjs <target-path> --output-file <path>');
process.exit(1);
}
const targetPath = resolve(target);
const discovery = await discoverFiles(targetPath);
const { files } = discovery;
// Output containers
const injectionFindings = [];
const frontmatterInventory = [];
const shellCommands = [];
const credentialRefs = [];
const persistenceSignals = [];
const mcpToolDescriptions = [];
const claudeMdAnalysis = [];
const crossInstructionFlags = [];
let filesWithInjections = 0;
// Process each file
for (const fileInfo of files) {
const { absPath, relPath } = fileInfo;
const content = await readTextFile(absPath);
// Falsy content (including an empty string) means nothing to scan.
if (!content) continue;
// Pass 1: Injection strip
// All later passes except frontmatter parsing work on `sanitized`, not on the raw content.
const { sanitized, findings: injFindings } = stripInjection(content, relPath);
if (injFindings.length > 0) {
injectionFindings.push(...injFindings);
filesWithInjections++;
}
// Pass 2: Frontmatter (markdown files only)
// Parsed from the raw content, before injection markers were substituted.
if (isMd(relPath)) {
const fm = parseFrontmatter(content);
if (fm) {
const fileType = classifyPluginFile(relPath, fm);
const tools = fm.allowed_tools || fm.tools || [];
const desc = fm.description || '';
const descInjection = scanDescForInjection(desc);
frontmatterInventory.push({
file: relPath,
type: fileType,
name: fm.name || null,
model: fm.model || null,
tools: Array.isArray(tools) ? tools : [tools],
description_snippet: desc.length > 200 ? desc.substring(0, 200) + '...' : desc,
injection_in_frontmatter: descInjection.length > 0,
// undefined values are omitted by JSON.stringify, so the key only appears when there are labels.
injection_labels: descInjection.length > 0 ? descInjection : undefined,
});
}
}
// Pass 3a: Shell commands (markdown — code blocks + prose patterns)
// The prose scan covers the whole file, fenced blocks included, so a command
// inside a fence can appear once as 'code_block' and once as 'prose'.
if (isMd(relPath)) {
shellCommands.push(...extractShellFromCodeBlocks(sanitized, relPath));
const proseShell = extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath);
for (const s of proseShell) {
shellCommands.push({
file: s.file, line: s.line,
command: s.match,
context: 'prose',
});
}
}
// Also extract from code files
if (isCode(relPath)) {
const codeShell = extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath);
for (const s of codeShell) {
shellCommands.push({
file: s.file, line: s.line,
command: s.match,
context: 'source_code',
});
}
}
// Pass 3b: Credential paths
const creds = extractFromText(sanitized, CREDENTIAL_PATH_PATTERNS, relPath);
credentialRefs.push(...creds);
// Pass 3c: Persistence
const persistence = extractFromText(sanitized, PERSISTENCE_PATTERNS, relPath);
persistenceSignals.push(...persistence);
// Pass 4: MCP tool descriptions (code files only)
if (isCode(relPath)) {
mcpToolDescriptions.push(...extractMcpToolDescriptions(sanitized, relPath));
}
// Pass 5: CLAUDE.md special analysis
// Re-runs the shell and credential extraction so the per-file record is
// self-contained; sanitized content is capped at 5000 characters.
if (isClaudeMd(relPath)) {
const claudeShell = [
...extractShellFromCodeBlocks(sanitized, relPath),
...extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath).map(s => ({
file: s.file, line: s.line, command: s.match, context: 'prose',
})),
];
const claudeCreds = extractFromText(sanitized, CREDENTIAL_PATH_PATTERNS, relPath);
claudeMdAnalysis.push({
file: relPath,
sanitized_content: sanitized.length > 5000 ? sanitized.substring(0, 5000) + '\n[TRUNCATED]' : sanitized,
shell_commands: claudeShell,
credential_refs: claudeCreds,
injection_findings: injFindings.filter(f => f.file === relPath),
});
}
// Pass 6: Cross-instruction combination
// Flags a file that contains both a credential reference and any network-call marker.
const hasCred = creds.length > 0;
const hasNetwork = NETWORK_CALL_PATTERNS.some(p => p.test(sanitized));
if (hasCred && hasNetwork) {
crossInstructionFlags.push({
file: relPath,
combination: 'credential_access+network_call',
credential_ref: creds[0]?.label || 'unknown',
network_ref: 'network call detected in same file',
});
}
}
// Deterministic verdict
// critical: any critical-severity injection finding, or any credential+network combination
// high:     any injection finding at all, or any persistence signal
// medium:   any credential reference, or more than 5 shell commands
// low:      none of the above
const hasInjection = injectionFindings.some(f => f.severity === 'critical');
const hasPersistence = persistenceSignals.length > 0;
const hasCredNetCombo = crossInstructionFlags.length > 0;
let riskLevel = 'low';
if (hasInjection || hasCredNetCombo) riskLevel = 'critical';
else if (injectionFindings.length > 0 || hasPersistence) riskLevel = 'high';
else if (credentialRefs.length > 0 || shellCommands.length > 5) riskLevel = 'medium';
const result = {
meta: {
target: targetPath,
timestamp: new Date().toISOString(),
// Counts every discovered file, including ones skipped above for having no content.
files_scanned: files.length,
files_with_injections: filesWithInjections,
duration_ms: Date.now() - startTime,
},
injection_findings: injectionFindings,
frontmatter_inventory: frontmatterInventory,
shell_commands: shellCommands,
credential_references: credentialRefs,
persistence_signals: persistenceSignals,
mcp_tool_descriptions: mcpToolDescriptions,
claude_md_analysis: claudeMdAnalysis,
cross_instruction_flags: crossInstructionFlags,
deterministic_verdict: {
has_injection: injectionFindings.length > 0,
has_critical_injection: hasInjection,
has_persistence: hasPersistence,
has_credential_network_combo: hasCredNetCombo,
risk_level: riskLevel,
},
};
if (outputFile) {
writeFileSync(outputFile, JSON.stringify(result, null, 2));
// Compact summary to stdout
const summary = {
files_scanned: files.length,
injection_findings: injectionFindings.length,
shell_commands: shellCommands.length,
credential_references: credentialRefs.length,
persistence_signals: persistenceSignals.length,
mcp_tool_descriptions: mcpToolDescriptions.length,
claude_md_count: claudeMdAnalysis.length,
cross_instruction_flags: crossInstructionFlags.length,
risk_level: riskLevel,
};
process.stdout.write(JSON.stringify(summary) + '\n');
} else {
process.stdout.write(JSON.stringify(result, null, 2) + '\n');
}
}
// Run the CLI; any rejection from main() is reported on stderr with the
// tool name as prefix and turned into exit code 1.
main().catch((failure) => {
  const { message } = failure;
  console.error(`content-extractor: ${message}`);
  process.exit(1);
});