// network-mapper.mjs — Discovers and classifies all outbound URLs and network references // Zero dependencies (Node.js builtins only via lib helpers + node:dns). // // Rationale: Malicious skills and MCP servers frequently phone home to attacker-controlled // infrastructure — data exfiltration webhooks, tunneling services, URL shorteners that // redirect to C2 endpoints, or hardcoded IP addresses that bypass DNS/cert validation. // This scanner catalogs every network reference and flags anything suspicious. // // References: // - OWASP LLM02 (Sensitive Information Disclosure — exfiltration endpoints) // - OWASP LLM03 (Supply Chain — third-party network dependencies) // - MCPTox research: rug-pull via domain reassignment after install // - Pillar Security: MCP tool description injection with exfiltration callbacks import { readTextFile } from './lib/file-discovery.mjs'; import { finding, scannerResult } from './lib/output.mjs'; import { SEVERITY } from './lib/severity.mjs'; import { redact } from './lib/string-utils.mjs'; import dns from 'node:dns'; import { promisify } from 'node:util'; // --------------------------------------------------------------------------- // DNS helpers // --------------------------------------------------------------------------- const resolve4 = promisify(dns.resolve4); // --------------------------------------------------------------------------- // URL extraction patterns // --------------------------------------------------------------------------- /** Standard http/https URLs including query strings and fragments. */ const URL_REGEX = /https?:\/\/[^\s'"<>\]\)}{,]+/g; /** IP-based URLs — numeric host in http/https scheme. */ const IP_URL_REGEX = /https?:\/\/(\d{1,3}\.){3}\d{1,3}(?:[:/][^\s'"<>\]\)}{,]*)?/g; /** Bare IP addresses in source code, only matched when near network-related keywords. */ const BARE_IP_REGEX = /(? p.test(ip)); } // --------------------------------------------------------------------------- // URL normalization helpers // --------------------------------------------------------------------------- /** * Extract the effective domain from a URL string, stripping port and path. * Returns null if the URL cannot be parsed. */ function extractDomain(rawUrl) { try { const u = new URL(rawUrl); return u.hostname.toLowerCase().replace(/\.$/, ''); // strip trailing dot } catch { // Fallback: strip scheme and extract up to first / : ? # const m = rawUrl.match(/^https?:\/\/([^/:?#]+)/i); return m ? m[1].toLowerCase() : null; } } /** * Check whether a hostname is purely numeric (IPv4 address). */ function isIpAddress(host) { return /^(\d{1,3}\.){3}\d{1,3}$/.test(host); } /** * Validate that each octet of an IPv4 string is 0-255. */ function isValidIpv4(host) { const parts = host.split('.'); if (parts.length !== 4) return false; return parts.every((p) => { const n = Number(p); return Number.isInteger(n) && n >= 0 && n <= 255; }); } // --------------------------------------------------------------------------- // DNS resolution with timeout // --------------------------------------------------------------------------- const DNS_TIMEOUT_MS = 3000; const DNS_MAX_LOOKUPS = 50; /** * Attempt to resolve a domain to IPv4, with a hard timeout. * Returns { resolved: boolean, addresses: string[], lowTtl: boolean } or null on timeout/error. */ async function resolveDomain(domain) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), DNS_TIMEOUT_MS); try { // node:dns resolve4 does not natively support AbortController — we race with a // timeout promise instead. const raceResult = await Promise.race([ resolve4(domain), new Promise((_, reject) => setTimeout(() => reject(new Error('dns_timeout')), DNS_TIMEOUT_MS) ), ]); clearTimeout(timer); // Check for suspiciously low TTL (infrastructure churn indicator — common in rug-pulls). // node:dns.resolve4 with options is available from Node >=18. let lowTtl = false; try { const withTtl = await Promise.race([ dns.promises.resolve4(domain, { ttl: true }), new Promise((_, reject) => setTimeout(() => reject(new Error('dns_timeout')), DNS_TIMEOUT_MS) ), ]); if (Array.isArray(withTtl)) { lowTtl = withTtl.some((r) => typeof r === 'object' && r.ttl < 60); } } catch { // TTL check failed — non-fatal, ignore } return { resolved: true, addresses: raceResult, lowTtl }; } catch { clearTimeout(timer); return { resolved: false, addresses: [], lowTtl: false }; } } // --------------------------------------------------------------------------- // Per-file scanning // --------------------------------------------------------------------------- /** * Scan a single file for URLs and bare IP references. * * @param {string} content - File text content * @param {string} relPath - Relative file path for finding output * @returns {{ urlOccurrences: Map, * ipUrlOccurrences: Map, * bareIpOccurrences: Map }} */ function scanFileContent(content, relPath) { const urlOccurrences = new Map(); // normalized URL → [{relPath, line}] const ipUrlOccurrences = new Map(); // ip-based URL → [{relPath, line}] const bareIpOccurrences = new Map(); // bare IP → [{relPath, line}] const lines = content.split('\n'); for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) { const line = lines[lineIdx]; const lineNo = lineIdx + 1; // --- Extract standard http/https URLs --- const urlMatches = [...line.matchAll(URL_REGEX)]; for (const m of urlMatches) { const rawUrl = m[0].replace(/[.,;:!?]+$/, ''); // strip trailing punctuation const domain = extractDomain(rawUrl); if (!domain) continue; if (isIpAddress(domain)) { // Record as IP-based URL const key = rawUrl; if (!ipUrlOccurrences.has(key)) ipUrlOccurrences.set(key, []); ipUrlOccurrences.get(key).push({ relPath, line: lineNo }); } else { // Record as domain-based URL const key = rawUrl; if (!urlOccurrences.has(key)) urlOccurrences.set(key, []); urlOccurrences.get(key).push({ relPath, line: lineNo }); } } // --- Extract bare IP addresses (only when near network-context keywords) --- if (NETWORK_KEYWORDS.test(line)) { const ipMatches = [...line.matchAll(BARE_IP_REGEX)]; for (const m of ipMatches) { const ip = m[0]; if (!isValidIpv4(ip)) continue; // Skip IPs already captured as part of a URL in this line if (urlMatches.some((u) => u[0].includes(ip))) continue; const key = ip; if (!bareIpOccurrences.has(key)) bareIpOccurrences.set(key, []); bareIpOccurrences.get(key).push({ relPath, line: lineNo }); } } } return { urlOccurrences, ipUrlOccurrences, bareIpOccurrences }; } // --------------------------------------------------------------------------- // Merge occurrence maps across files // --------------------------------------------------------------------------- function mergeOccurrences(target, source) { for (const [key, locs] of source) { if (!target.has(key)) { target.set(key, [...locs]); } else { target.get(key).push(...locs); } } } // --------------------------------------------------------------------------- // Evidence formatter // --------------------------------------------------------------------------- /** * Build a compact evidence string from an occurrence list. * Shows up to 3 file+line references to keep findings readable. */ function formatLocations(occurrences) { const unique = []; const seenFiles = new Set(); for (const loc of occurrences) { const key = `${loc.relPath}:${loc.line}`; if (!seenFiles.has(key)) { seenFiles.add(key); unique.push(loc); } } const shown = unique.slice(0, 3); const overflow = unique.length - shown.length; const parts = shown.map((l) => `${l.relPath}:${l.line}`); if (overflow > 0) parts.push(`+${overflow} more`); return parts.join(', '); } // --------------------------------------------------------------------------- // Public scanner entry point // --------------------------------------------------------------------------- /** * Scan a target path for outbound URLs and network references. * * @param {string} targetPath - Absolute path to scan (file or directory root) * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery * Pre-computed file discovery result from the orchestrator. * @returns {Promise} Scanner result envelope */ export async function scan(targetPath, discovery) { const startMs = Date.now(); const allFindings = []; let filesScanned = 0; // Aggregate occurrence maps across all files const allUrlOccurrences = new Map(); // rawUrl → [{relPath, line}] const allIpUrlOccurrences = new Map(); // rawUrl → [{relPath, line}] const allBareIpOccurrences = new Map(); // ip → [{relPath, line}] try { // --- Phase 1: File scanning --- for (const fileInfo of discovery.files) { const content = await readTextFile(fileInfo.absPath); if (content === null) continue; filesScanned++; const { urlOccurrences, ipUrlOccurrences, bareIpOccurrences } = scanFileContent(content, fileInfo.relPath); mergeOccurrences(allUrlOccurrences, urlOccurrences); mergeOccurrences(allIpUrlOccurrences, ipUrlOccurrences); mergeOccurrences(allBareIpOccurrences, bareIpOccurrences); } // --- Phase 2: Domain deduplication and classification --- // Collect unique domains from standard URLs, keyed by domain → [rawUrls] const domainToUrls = new Map(); for (const rawUrl of allUrlOccurrences.keys()) { const domain = extractDomain(rawUrl); if (!domain) continue; if (!domainToUrls.has(domain)) domainToUrls.set(domain, []); domainToUrls.get(domain).push(rawUrl); } // --- Phase 3: DNS resolution for suspicious + unknown domains (optional) --- let dnsLookupCount = 0; const dnsResults = new Map(); // domain → { resolved, addresses, lowTtl } const suspiciousAndUnknown = [...domainToUrls.keys()].filter( (d) => !TRUSTED_DOMAINS.has(d) && !isIpAddress(d) ); for (const domain of suspiciousAndUnknown) { if (dnsLookupCount >= DNS_MAX_LOOKUPS) break; dnsLookupCount++; const result = await resolveDomain(domain); dnsResults.set(domain, result); } // --- Phase 4: Generate findings for domain-based URLs --- for (const [domain, rawUrls] of domainToUrls) { // Skip trusted domains entirely if (TRUSTED_DOMAINS.has(domain)) continue; // Gather all occurrence locations for this domain const allLocs = rawUrls.flatMap((u) => allUrlOccurrences.get(u) || []); const locationStr = formatLocations(allLocs); // Choose a representative URL for evidence (shortest/cleanest) const repUrl = rawUrls.sort((a, b) => a.length - b.length)[0]; const repUrlRedacted = redact(repUrl, 60, 0); const dnsInfo = dnsResults.get(domain); const dnsNote = dnsInfo ? dnsInfo.resolved ? dnsInfo.lowTtl ? ` DNS resolved (LOW TTL <60s — suspicious infrastructure churn).` : ` DNS resolved to: ${dnsInfo.addresses.slice(0, 3).join(', ')}.` : ` DNS: NXDOMAIN or unreachable.` : ''; if (SUSPICIOUS_DOMAINS.has(domain)) { // HIGH: known exfiltration/tunneling/shortener domain allFindings.push( finding({ scanner: 'NET', severity: SEVERITY.HIGH, title: `Suspicious network endpoint: ${domain}`, description: `Domain "${domain}" is known to be used for data exfiltration, webhook interception, ` + `tunneling (bypasses corporate egress filtering), URL shortening (masks final destination), ` + `or ephemeral file sharing. Its presence in plugin/skill code is a strong indicator of ` + `malicious intent or accidental exfiltration risk.${dnsNote}`, file: allLocs[0]?.relPath || null, line: allLocs[0]?.line || null, evidence: `${repUrlRedacted} | found at: ${locationStr}`, owasp: 'LLM02', recommendation: 'This domain is commonly used for data exfiltration or tunneling. ' + 'Verify this URL is necessary and intended. If this is test code, move it to ' + 'a properly isolated test fixture. If it is production code, remove it.', }) ); } else { // INFO: unknown domain — catalog for review, no automatic blocking const lowTtlNote = dnsInfo?.resolved && dnsInfo?.lowTtl ? ' Low DNS TTL detected — possible domain reassignment risk (rug-pull vector).' : ''; allFindings.push( finding({ scanner: 'NET', severity: SEVERITY.INFO, title: `Unknown external domain: ${domain}`, description: `Domain "${domain}" is referenced in the codebase but is not on the trusted allowlist. ` + `This may be a legitimate third-party dependency, or it may be an unexpected outbound call. ` + `Review all network references to verify they are necessary and intentional.${dnsNote}${lowTtlNote}`, file: allLocs[0]?.relPath || null, line: allLocs[0]?.line || null, evidence: `${repUrlRedacted} | found at: ${locationStr}`, owasp: 'LLM03', recommendation: 'Verify this external domain is a known, trusted dependency. ' + 'Document its purpose if it is legitimate.', }) ); } } // --- Phase 5: IP-based URL findings --- for (const [rawUrl, locs] of allIpUrlOccurrences) { const domain = extractDomain(rawUrl); if (!domain) continue; if (!isValidIpv4(domain)) continue; // Skip loopback/private — these are MEDIUM, not HIGH const isPrivate = isPrivateOrLoopback(domain); const severity = isPrivate ? SEVERITY.MEDIUM : SEVERITY.HIGH; const locationStr = formatLocations(locs); const urlRedacted = redact(rawUrl, 60, 0); allFindings.push( finding({ scanner: 'NET', severity, title: `IP-based URL: ${domain}`, description: isPrivate ? `URL "${urlRedacted}" uses a private/loopback IP address instead of a domain name. ` + `While likely targeting a local service, hardcoded private IPs reduce portability ` + `and can indicate development-time infrastructure left in production code.` : `URL "${urlRedacted}" uses a public IP address instead of a domain name. ` + `IP-based URLs bypass DNS-based security controls, certificate transparency, ` + `and many proxy/firewall filtering mechanisms. This is a common technique used ` + `by malware to connect to C2 infrastructure that avoids domain reputation checks.`, file: locs[0]?.relPath || null, line: locs[0]?.line || null, evidence: `${urlRedacted} | found at: ${locationStr}`, owasp: isPrivate ? 'LLM03' : 'LLM02', recommendation: isPrivate ? 'Replace hardcoded private IP with a configurable hostname or environment variable.' : 'IP-based URLs bypass DNS and certificate validation. Use a domain name instead.', }) ); } // --- Phase 6: Bare IP findings --- for (const [ip, locs] of allBareIpOccurrences) { if (!isValidIpv4(ip)) continue; if (isPrivateOrLoopback(ip)) continue; // Low signal for bare private IPs — skip const locationStr = formatLocations(locs); allFindings.push( finding({ scanner: 'NET', severity: SEVERITY.MEDIUM, title: `Bare public IP address in network context: ${ip}`, description: `A public IP address "${ip}" appears near network-related code (fetch, http, connect, etc.) ` + `without being part of a URL. This may indicate a hardcoded server address that bypasses ` + `DNS resolution and certificate validation controls.`, file: locs[0]?.relPath || null, line: locs[0]?.line || null, evidence: `IP: ${ip} | found at: ${locationStr}`, owasp: 'LLM02', recommendation: 'IP-based URLs bypass DNS and certificate validation. Use a domain name instead.', }) ); } const durationMs = Date.now() - startMs; return scannerResult('network-mapper', 'ok', allFindings, filesScanned, durationMs); } catch (err) { const durationMs = Date.now() - startMs; return scannerResult( 'network-mapper', 'error', allFindings, filesScanned, durationMs, String(err?.message || err) ); } }