423 lines
15 KiB
JavaScript
423 lines
15 KiB
JavaScript
#!/usr/bin/env node
|
|
// content-extractor.mjs — Pre-extraction indirection layer for remote repo scanning
|
|
// Produces a structured JSON "evidence package" that LLM agents analyze
|
|
// instead of reading raw (potentially malicious) file content.
|
|
//
|
|
// Usage: node content-extractor.mjs <target-path> --output-file <path>
|
|
|
|
import { writeFileSync } from 'node:fs';
|
|
import { resolve, relative } from 'node:path';
|
|
import { discoverFiles, readTextFile } from './lib/file-discovery.mjs';
|
|
import { CRITICAL_PATTERNS, HIGH_PATTERNS } from './lib/injection-patterns.mjs';
|
|
import { normalizeForScan } from './lib/string-utils.mjs';
|
|
import { parseFrontmatter, classifyPluginFile } from './lib/yaml-frontmatter.mjs';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Pattern sets for extraction passes
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Shell-command signatures looked for in markdown prose and in source files.
// Written as [label, regex] pairs and expanded into the { pattern, label }
// objects that extractFromText() destructures.
const SHELL_CMD_PATTERNS = [
  // Remote script piped straight into a shell
  ['curl-pipe-to-shell', /curl\s+[^|]*\|\s*(?:ba)?sh/gi],
  ['wget-pipe-to-shell', /wget\s+[^|]*\|\s*(?:ba)?sh/gi],
  ['curl-download', /curl\s+-[fsSLo]*\s+https?:\/\/\S+/gi],

  // Package installation
  ['npm-install', /npm\s+install\s+(?!-[DdgE])\S+/gi],
  ['pip-install', /pip3?\s+install\s+\S+/gi],
  ['yarn-add', /yarn\s+add\s+\S+/gi],

  // Permission changes, privilege escalation, dynamic execution
  ['chmod', /chmod\s+[0-7]+\s+\S+/gi],
  ['sudo', /sudo\s+\S+/gi],
  ['eval', /eval\s*\(/gi],
  ['base64-decode-exec', /echo\s+["'][^"']*["']\s*\|\s*base64\s+-d\s*\|\s*(?:ba)?sh/gi],

  // GitHub CLI calls that touch starred repos or use a mutating HTTP method
  ['gh-api-star', /gh\s+api\s+[^\\]*\/starred\//gi],
  ['gh-api-mutation', /gh\s+api\s+--method\s+(?:PUT|POST|DELETE)/gi],
].map(([label, pattern]) => ({ pattern, label }));
|
|
|
|
// References to credential files, key material and secret-bearing environment
// variables. Written as [label, regex] pairs and expanded into the
// { pattern, label } objects that extractFromText() destructures.
const CREDENTIAL_PATH_PATTERNS = [
  // Home-directory dotfiles and directories
  ['ssh-dir', /~\/\.ssh\/\S*/g],
  ['aws-dir', /~\/\.aws\/\S*/g],
  ['dotenv', /~\/\.env\b/g],
  ['npmrc', /~\/\.npmrc\b/g],
  ['netrc', /~\/\.netrc\b/g],
  ['gitconfig', /~\/\.gitconfig\b/g],
  ['gnupg-dir', /~\/\.gnupg\/\S*/g],
  ['macos-app-support', /~\/Library\/Application\s+Support\/\S+/g],
  ['ethereum-wallet', /~\/\.ethereum\/\S*/g],

  // Wallet, key and certificate file names
  ['wallet-file', /wallet\.dat/gi],
  ['ssh-key-file', /id_rsa|id_ed25519|id_ecdsa/g],
  ['cert-key-file', /\.pem\b|\.key\b|\.p12\b|\.pfx\b/g],

  // Shell-style environment variable references
  ['aws-secret-env', /\$AWS_SECRET\w*/gi],
  ['azure-secret-env', /\$AZURE_CLIENT_SECRET/gi],
  ['gcp-creds-env', /\$GOOGLE_APPLICATION_CREDENTIALS/gi],
  ['api-token-env', /\$(?:NPM_TOKEN|GITHUB_TOKEN|PYPI_TOKEN|ANTHROPIC_API_KEY)/gi],

  // process.env.NAME or process.env['NAME'] access in JavaScript
  ['process-env-access', /process\.env\s*(?:\.\s*\w+|\[\s*['"`]\w+['"`]\s*\])/g],
].map(([label, pattern]) => ({ pattern, label }));
|
|
|
|
// Text signatures for scheduling and start-up mechanisms. Written as
// [label, regex] pairs and expanded into the { pattern, label } objects that
// extractFromText() destructures.
const PERSISTENCE_PATTERNS = [
  // cron
  ['crontab', /crontab/gi],
  ['cron.d', /\/etc\/cron\.d/gi],

  // launchd / property-list keys
  ['launchctl-load', /launchctl\s+load/gi],
  ['LaunchAgents', /LaunchAgents/gi],
  ['plist-persistence', /RunAtLoad|StartInterval|KeepAlive/gi],

  // systemd
  ['systemd', /systemctl\s+(?:enable|start)/gi],
  ['systemd-unit', /ExecStart\s*=/gi],

  // Shell start-up files and git hooks
  ['shell-profile', /\.zshrc|\.bashrc|\.bash_profile|\.profile|\.zprofile|\.zshenv/g],
  ['git-hooks', /\.git\/hooks\//g],

  // Five whitespace-separated asterisks, as in a cron schedule line
  ['cron-schedule', /\*\s+\*\s+\*\s+\*\s+\*/g],
].map(([label, pattern]) => ({ pattern, label }));
|
|
|
|
// Indicators that a file performs (or mentions) network traffic. None of
// these carry the /g flag, so calling .test() on them directly is stateless.
const NETWORK_CALL_PATTERNS = [
  // Command-line clients
  /\bcurl\b/i,
  /\bwget\b/i,

  // JavaScript HTTP helpers
  /\bfetch\s*\(/i,
  /\baxios\b/i,

  // Any literal http(s) URL
  /https?:\/\/\S+/i,

  // Method-call shapes: .post( and .send(
  /\.post\s*\(/i,
  /\.send\s*\(/i,

  // Browser / runtime networking objects
  /XMLHttpRequest/i,
  /WebSocket/i,
];
|
|
|
|
// Signals that a source file registers MCP tools.
const MCP_TOOL_PATTERNS = [
  // server.tool('<name>' ... with a single-, double- or backtick-quoted first argument
  /server\.tool\s*\(\s*(['"`])([\s\S]*?)\1/g,
  // Decorator-style registrations
  /@mcp\.tool/g,
  /@server\.tool/g,
];

// Captures a quoted value assigned to `description` with ":" or "=".
// Group 1 is the opening quote character, group 2 is the text up to the next
// occurrence of that same quote character.
const MCP_DESC_PATTERN = /description\s*[:=]\s*(['"`])([\s\S]*?)\1/g;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Parse the command-line arguments (without the `node` and script entries).
 *
 * Recognised forms:
 *   <target-path>                 first positional argument
 *   --output-file <path>          flag followed by its value
 *   --output-file=<path>          flag and value in a single token
 *
 * A `--output-file` flag with no value is consumed and ignored; it is never
 * treated as the scan target.
 *
 * @param {string[]} argv - Raw argument list.
 * @returns {{ target: string|null, outputFile: string|null }}
 */
function parseArgs(argv) {
  const OUTPUT_FLAG = '--output-file';
  const args = { target: null, outputFile: null };

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];

    if (arg === OUTPUT_FLAG) {
      // Only consume the next token when there is one; a dangling flag is dropped.
      if (i + 1 < argv.length) {
        args.outputFile = argv[++i];
      }
      continue;
    }

    if (typeof arg === 'string' && arg.startsWith(`${OUTPUT_FLAG}=`)) {
      const value = arg.slice(OUTPUT_FLAG.length + 1);
      if (value) {
        args.outputFile = value;
      }
      continue;
    }

    // First remaining positional wins; later positionals are ignored.
    if (!args.target) {
      args.target = arg;
    }
  }

  return args;
}
|
|
|
|
/**
 * Scan `text` for the CRITICAL and HIGH injection patterns and replace each
 * matched substring with an `[INJECTION-PATTERN-STRIPPED: <label>]` marker.
 *
 * Both the raw text and its normalizeForScan() form are scanned when they
 * differ; a finding is recorded for every match in either variant.
 *
 * NOTE(review): a match found only in the normalized variant is reported but
 * usually cannot be removed, because its matched substring does not occur
 * verbatim in the raw text. Stripping those would need an offset mapping from
 * normalizeForScan(), which is not visible from this file.
 *
 * @param {string} text - Raw file content.
 * @param {string} file - Path recorded on each finding.
 * @returns {{ sanitized: string, findings: Array<{file: string, line: number, label: string, severity: string}> }}
 */
function stripInjection(text, file) {
  const findings = [];
  let sanitized = text;

  const normalized = normalizeForScan(text);
  const variants = normalized !== text ? [text, normalized] : [text];

  const rules = [
    ...CRITICAL_PATTERNS.map((p) => ({ ...p, severity: 'critical' })),
    ...HIGH_PATTERNS.map((p) => ({ ...p, severity: 'high' })),
  ];

  for (const { pattern, label, severity } of rules) {
    // The shared pattern objects are never executed directly: some carry /g
    // (stateful lastIndex) and some do not, so a private global copy is built.
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const marker = `[INJECTION-PATTERN-STRIPPED: ${label}]`;

    for (const variant of variants) {
      // matchAll advances past zero-width matches, so a pattern that can match
      // the empty string cannot spin forever the way a bare exec() loop does.
      for (const match of variant.matchAll(new RegExp(pattern.source, flags))) {
        if (match[0] === '') continue; // nothing to report or strip

        const line = variant.substring(0, match.index).split('\n').length;
        findings.push({ file, line, label, severity });

        // Function replacer: the marker is inserted literally even if the
        // label contains `$&`, `$1` or similar replacement sequences.
        sanitized = sanitized.replace(match[0], () => marker);
      }
    }
  }

  return { sanitized, findings };
}
|
|
|
|
/**
 * 1-based line number of the character at `index` in `text`, computed as one
 * plus the number of newline characters that precede that index.
 */
function lineAt(text, index) {
  const before = text.substring(0, index);
  let lineNumber = 1;
  for (const ch of before) {
    if (ch === '\n') {
      lineNumber += 1;
    }
  }
  return lineNumber;
}
|
|
|
|
/**
 * Return the full line of `text` that contains `index`. Lines longer than
 * 200 characters are cut to 200 and suffixed with "...".
 */
function contextSnippet(text, index) {
  const MAX_SNIPPET = 200;

  const allLines = text.split('\n');
  const zeroBasedLine = text.substring(0, index).split('\n').length - 1;
  const current = allLines[zeroBasedLine] || '';

  if (current.length <= MAX_SNIPPET) {
    return current;
  }
  return current.substring(0, MAX_SNIPPET) + '...';
}
|
|
|
|
/** True when the path ends in `.md` or `.mdx` (case-insensitive). */
function isMd(relPath) {
  const markdownExtension = /\.mdx?$/i;
  return markdownExtension.exec(relPath) !== null;
}
|
|
|
|
/**
 * True when the path ends in one of the recognised source-code extensions
 * (JavaScript/TypeScript variants, Python, Ruby, Go, Rust, Java, Kotlin, C#,
 * PHP), compared case-insensitively.
 */
function isCode(relPath) {
  const codeExtension = /\.(js|mjs|cjs|ts|mts|cts|jsx|tsx|py|pyw|rb|go|rs|java|kt|cs|php)$/i;
  return codeExtension.exec(relPath) !== null;
}
|
|
|
|
/**
 * True when the final path segment is `CLAUDE.md` (case-insensitive). The
 * segment may be preceded by `/`, by `\`, or by nothing at all.
 */
function isClaudeMd(relPath) {
  const claudeFile = /(?:^|\/|\\)CLAUDE\.md$/i;
  return claudeFile.exec(relPath) !== null;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Extraction passes
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Run every `{ pattern, label }` rule over `text` and report each match.
 *
 * @param {string} text - Content to scan.
 * @param {Array<{pattern: RegExp, label: string}>} patterns - Rules to apply.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<{file: string, line: number, label: string, match: string, context_snippet: string}>}
 *   One entry per non-empty match, in rule order and then text order. `match`
 *   is cut to 120 characters plus "..." when longer.
 */
function extractFromText(text, patterns, file) {
  const MAX_MATCH_LENGTH = 120;
  const results = [];

  for (const { pattern, label } of patterns) {
    // Work on a private global copy so the caller's regex state is untouched.
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;

    // matchAll steps past zero-width matches, so a rule that can match the
    // empty string terminates instead of looping forever on the same index.
    for (const match of text.matchAll(new RegExp(pattern.source, flags))) {
      const matched = match[0];
      if (matched === '') continue;

      results.push({
        file,
        line: lineAt(text, match.index),
        label,
        match: matched.length > MAX_MATCH_LENGTH
          ? `${matched.substring(0, MAX_MATCH_LENGTH)}...`
          : matched,
        context_snippet: contextSnippet(text, match.index),
      });
    }
  }

  return results;
}
|
|
|
|
/**
 * Collect command lines from fenced markdown code blocks whose info string is
 * empty or names a shell (bash, sh, shell, zsh, console; case-insensitive).
 *
 * Fences are tracked line by line so that the closing fence of a non-shell
 * block (for example ```python) is never mistaken for the opening fence of a
 * new block. A block left unclosed at end of input is still scanned.
 *
 * @param {string} text - Markdown content.
 * @param {string} file - Path recorded on each result.
 * @returns {Array<{file: string, line: number, command: string, context: 'code_block'}>}
 *   One entry per line longer than 3 characters after stripping a leading
 *   "$" prompt. `line` is the 1-based line of the block's opening fence;
 *   `command` is cut to 200 characters plus "..." when longer.
 */
function extractShellFromCodeBlocks(text, file) {
  const SHELL_LANGS = new Set(['', 'bash', 'sh', 'shell', 'zsh', 'console']);
  // Three or more backticks, then an info string that contains no backtick.
  const OPENING_FENCE = /^\s*(`{3,})([^`]*)$/;
  const MAX_COMMAND_LENGTH = 200;

  const results = [];
  let open = null; // { ticks, isShell, line } while inside a fenced block

  text.split('\n').forEach((rawLine, idx) => {
    const line = rawLine.replace(/\r$/, '');

    if (open === null) {
      const opener = OPENING_FENCE.exec(line);
      if (opener) {
        const [lang = ''] = opener[2].trim().split(/\s+/);
        open = {
          ticks: opener[1].length,
          isShell: SHELL_LANGS.has(lang.toLowerCase()),
          line: idx + 1,
        };
      }
      return;
    }

    // A closing fence is a run of backticks at least as long as the opener.
    const bare = line.trim();
    if (bare.length >= open.ticks && /^`+$/.test(bare)) {
      open = null;
      return;
    }

    if (!open.isShell) return;

    const command = line.replace(/^\$\s*/, '').trim();
    if (command.length > 3) {
      results.push({
        file,
        line: open.line,
        command: command.length > MAX_COMMAND_LENGTH
          ? `${command.substring(0, MAX_COMMAND_LENGTH)}...`
          : command,
        context: 'code_block',
      });
    }
  });

  return results;
}
|
|
|
|
/**
 * Extract quoted `description` values from a source file that appears to
 * register MCP tools (any MCP_TOOL_PATTERNS entry matches).
 *
 * Injection is flagged in two ways:
 *  - scanDescForInjection() matches the description text directly, and
 *  - the description already contains an `[INJECTION-PATTERN-STRIPPED: <label>]`
 *    marker. main() passes sanitized text to this function, so an injection
 *    that sat inside a description has been replaced by that marker before
 *    this scan runs; without the marker check it would go unreported here.
 *
 * @param {string} text - File content (raw or sanitized).
 * @param {string} file - Path recorded on each result.
 * @returns {Array<object>} One entry per description, with `line`,
 *   `description_text` (cut to 500 chars plus "..."), `char_count`,
 *   `injection_detected` and `injection_labels`. `tool_name` is always null.
 */
function extractMcpToolDescriptions(text, file) {
  const MAX_DESCRIPTION_LENGTH = 500;
  const STRIPPED_MARKER = /\[INJECTION-PATTERN-STRIPPED: ([^\]]+)\]/g;

  // Fresh copies: the shared patterns carry /g and must not keep lastIndex state.
  const mentionsMcp = MCP_TOOL_PATTERNS.some(
    (pattern) => new RegExp(pattern.source, pattern.flags).test(text),
  );
  if (!mentionsMcp) return [];

  const descFlags = MCP_DESC_PATTERN.flags.includes('g')
    ? MCP_DESC_PATTERN.flags
    : `${MCP_DESC_PATTERN.flags}g`;
  const results = [];

  for (const match of text.matchAll(new RegExp(MCP_DESC_PATTERN.source, descFlags))) {
    const descText = match[2];

    const labels = new Set(scanDescForInjection(descText));
    for (const marker of descText.matchAll(STRIPPED_MARKER)) {
      labels.add(marker[1]);
    }
    const injection = [...labels];

    results.push({
      file,
      line: lineAt(text, match.index),
      tool_name: null, // Tool name often on separate line
      description_text: descText.length > MAX_DESCRIPTION_LENGTH
        ? `${descText.substring(0, MAX_DESCRIPTION_LENGTH)}...`
        : descText,
      char_count: descText.length,
      injection_detected: injection.length > 0,
      injection_labels: injection,
    });
  }

  return results;
}
|
|
|
|
/**
 * Return the labels of every CRITICAL or HIGH injection pattern that matches
 * `text`, critical patterns first.
 *
 * Each pattern is tested through a fresh RegExp copy. Some of the shared
 * patterns carry the /g flag, and calling .test() on a global regex advances
 * its lastIndex; reusing the shared object would make a later call start
 * mid-string and miss a match.
 *
 * @param {string} text - Text to check.
 * @returns {string[]} Labels of the matching patterns (empty when none match).
 */
function scanDescForInjection(text) {
  const labels = [];
  for (const { pattern, label } of [...CRITICAL_PATTERNS, ...HIGH_PATTERNS]) {
    const probe = new RegExp(pattern.source, pattern.flags);
    if (probe.test(text)) {
      labels.push(label);
    }
  }
  return labels;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Main
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * CLI entry point.
 *
 * Discovers files under the target path, runs the extraction passes over each
 * readable file, derives a deterministic risk verdict and emits the evidence
 * package as JSON. With `--output-file` the full package is written to that
 * file and a compact one-line summary goes to stdout; otherwise the full
 * package is printed to stdout.
 *
 * Exits with status 1 (after printing usage) when no target path was given.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const startTime = Date.now();
  const { target, outputFile } = parseArgs(process.argv.slice(2));

  if (!target) {
    console.error('Usage: node content-extractor.mjs <target-path> --output-file <path>');
    process.exit(1);
  }

  const targetPath = resolve(target);
  const { files } = await discoverFiles(targetPath);

  // Output containers
  const injectionFindings = [];
  const frontmatterInventory = [];
  const shellCommands = [];
  const credentialRefs = [];
  const persistenceSignals = [];
  const mcpToolDescriptions = [];
  const claudeMdAnalysis = [];
  const crossInstructionFlags = [];
  let filesWithInjections = 0;

  // Reshape an extractFromText() hit into a shell_commands entry.
  const toShellEntry = (hit, context) => ({
    file: hit.file,
    line: hit.line,
    command: hit.match,
    context,
  });

  for (const { absPath, relPath } of files) {
    const content = await readTextFile(absPath);
    if (!content) continue; // empty or unreadable content: nothing to extract

    const markdown = isMd(relPath);
    const sourceCode = isCode(relPath);

    // Pass 1: Injection strip
    const { sanitized, findings: injFindings } = stripInjection(content, relPath);
    if (injFindings.length > 0) {
      injectionFindings.push(...injFindings);
      filesWithInjections++;
    }

    // Pass 2: Frontmatter (markdown files only). Parsed from the raw content.
    if (markdown) {
      const fm = parseFrontmatter(content);
      if (fm) {
        const fileType = classifyPluginFile(relPath, fm);
        const tools = fm.allowed_tools || fm.tools || [];

        // The description comes from an untrusted file and need not be a
        // string (a list or number, for instance). Coerce it so .length and
        // .substring below cannot throw and abort the whole scan.
        let desc = '';
        if (typeof fm.description === 'string') {
          desc = fm.description;
        } else if (fm.description) {
          desc = String(fm.description);
        }

        const descInjection = scanDescForInjection(desc);
        frontmatterInventory.push({
          file: relPath,
          type: fileType,
          name: fm.name || null,
          model: fm.model || null,
          tools: Array.isArray(tools) ? tools : [tools],
          description_snippet: desc.length > 200 ? desc.substring(0, 200) + '...' : desc,
          injection_in_frontmatter: descInjection.length > 0,
          injection_labels: descInjection.length > 0 ? descInjection : undefined,
        });
      }
    }

    // Pass 3a: Shell commands (markdown — code blocks + prose patterns)
    let markdownShell = [];
    if (markdown) {
      markdownShell = [
        ...extractShellFromCodeBlocks(sanitized, relPath),
        ...extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath).map((hit) => toShellEntry(hit, 'prose')),
      ];
      shellCommands.push(...markdownShell);
    }
    // Also extract from code files
    if (sourceCode) {
      const codeShell = extractFromText(sanitized, SHELL_CMD_PATTERNS, relPath);
      shellCommands.push(...codeShell.map((hit) => toShellEntry(hit, 'source_code')));
    }

    // Pass 3b: Credential paths
    const creds = extractFromText(sanitized, CREDENTIAL_PATH_PATTERNS, relPath);
    credentialRefs.push(...creds);

    // Pass 3c: Persistence
    const persistence = extractFromText(sanitized, PERSISTENCE_PATTERNS, relPath);
    persistenceSignals.push(...persistence);

    // Pass 4: MCP tool descriptions (code files only)
    if (sourceCode) {
      mcpToolDescriptions.push(...extractMcpToolDescriptions(sanitized, relPath));
    }

    // Pass 5: CLAUDE.md special analysis. A CLAUDE.md path always ends in
    // ".md", so the markdown shell entries and the credential hits computed
    // above are exactly what this pass needs; they are reused, not recomputed.
    if (isClaudeMd(relPath)) {
      claudeMdAnalysis.push({
        file: relPath,
        sanitized_content: sanitized.length > 5000 ? sanitized.substring(0, 5000) + '\n[TRUNCATED]' : sanitized,
        shell_commands: markdownShell,
        credential_refs: creds,
        injection_findings: injFindings, // every finding here already belongs to this file
      });
    }

    // Pass 6: Cross-instruction combination
    const hasCred = creds.length > 0;
    const hasNetwork = NETWORK_CALL_PATTERNS.some((p) => p.test(sanitized));
    if (hasCred && hasNetwork) {
      crossInstructionFlags.push({
        file: relPath,
        combination: 'credential_access+network_call',
        credential_ref: creds[0]?.label || 'unknown',
        network_ref: 'network call detected in same file',
      });
    }
  }

  // Deterministic verdict
  const hasInjection = injectionFindings.some((f) => f.severity === 'critical');
  const hasPersistence = persistenceSignals.length > 0;
  const hasCredNetCombo = crossInstructionFlags.length > 0;

  let riskLevel = 'low';
  if (hasInjection || hasCredNetCombo) {
    riskLevel = 'critical';
  } else if (injectionFindings.length > 0 || hasPersistence) {
    riskLevel = 'high';
  } else if (credentialRefs.length > 0 || shellCommands.length > 5) {
    riskLevel = 'medium';
  }

  const result = {
    meta: {
      target: targetPath,
      timestamp: new Date().toISOString(),
      files_scanned: files.length,
      files_with_injections: filesWithInjections,
      duration_ms: Date.now() - startTime,
    },
    injection_findings: injectionFindings,
    frontmatter_inventory: frontmatterInventory,
    shell_commands: shellCommands,
    credential_references: credentialRefs,
    persistence_signals: persistenceSignals,
    mcp_tool_descriptions: mcpToolDescriptions,
    claude_md_analysis: claudeMdAnalysis,
    cross_instruction_flags: crossInstructionFlags,
    deterministic_verdict: {
      has_injection: injectionFindings.length > 0,
      has_critical_injection: hasInjection,
      has_persistence: hasPersistence,
      has_credential_network_combo: hasCredNetCombo,
      risk_level: riskLevel,
    },
  };

  if (outputFile) {
    writeFileSync(outputFile, JSON.stringify(result, null, 2));

    // Compact summary to stdout
    const summary = {
      files_scanned: files.length,
      injection_findings: injectionFindings.length,
      shell_commands: shellCommands.length,
      credential_references: credentialRefs.length,
      persistence_signals: persistenceSignals.length,
      mcp_tool_descriptions: mcpToolDescriptions.length,
      claude_md_count: claudeMdAnalysis.length,
      cross_instruction_flags: crossInstructionFlags.length,
      risk_level: riskLevel,
    };
    process.stdout.write(JSON.stringify(summary) + '\n');
  } else {
    process.stdout.write(JSON.stringify(result, null, 2) + '\n');
  }
}
|
|
|
|
// Start the scan. Any rejection from main() is reported on stderr with the
// tool name as prefix and turned into a non-zero exit status.
const reportFatalError = (error) => {
  console.error(`content-extractor: ${error.message}`);
  process.exit(1);
};

main().catch(reportFatalError);
|