// network-mapper.mjs — Discovers and classifies all outbound URLs and network references
// Zero dependencies (Node.js builtins only via lib helpers + node:dns).
//
// Rationale: Malicious skills and MCP servers frequently phone home to attacker-controlled
// infrastructure — data exfiltration webhooks, tunneling services, URL shorteners that
// redirect to C2 endpoints, or hardcoded IP addresses that bypass DNS/cert validation.
// This scanner catalogs every network reference and flags anything suspicious.
//
// References:
//   - OWASP LLM02 (Sensitive Information Disclosure — exfiltration endpoints)
//   - OWASP LLM03 (Supply Chain — third-party network dependencies)
//   - MCPTox research: rug-pull via domain reassignment after install
//   - Pillar Security: MCP tool description injection with exfiltration callbacks

import { readTextFile } from './lib/file-discovery.mjs';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { redact } from './lib/string-utils.mjs';
import dns from 'node:dns';
import { promisify } from 'node:util';

// ---------------------------------------------------------------------------
// DNS helpers
// ---------------------------------------------------------------------------

// Promise-returning form of the callback-style dns.resolve4, so lookups can be awaited.
const resolve4 = promisify(dns.resolve4);

// ---------------------------------------------------------------------------
// URL extraction patterns
// ---------------------------------------------------------------------------

/**
 * Standard http/https URLs including query strings and fragments.
 * A match runs from the scheme up to (not including) the first whitespace, quote,
 * angle bracket, `]`, `)`, `}`, `{` or `,`, so a URL embedded in markdown, JSON or
 * a string literal ends at its delimiter. Trailing sentence punctuation is NOT
 * excluded by this pattern.
 */
const URL_REGEX = /https?:\/\/[^\s'"<>\]\)}{,]+/g;

/**
 * IP-based URLs — numeric dotted-quad host in http/https scheme, optionally followed
 * by a port or path. Octet ranges are not validated by the pattern.
 * NOTE(review): no other reference to this constant is visible in this file — confirm
 * whether it is still needed before relying on or removing it.
 */
const IP_URL_REGEX = /https?:\/\/(\d{1,3}\.){3}\d{1,3}(?:[:/][^\s'"<>\]\)}{,]*)?/g;

/**
 * Bare dotted-quad IP addresses in source code. The lookbehind/lookahead keep the
 * match from starting or ending in the middle of a longer run of digits.
 * The pattern itself checks neither octet ranges nor network context; callers are
 * expected to apply NETWORK_KEYWORDS and an octet-range check on top of it.
 */
const BARE_IP_REGEX = /(?<!\d)(\d{1,3}\.){3}\d{1,3}(?!\d)/g;

/**
 * Network-context keywords that make a bare IP worth reporting.
 * Case-insensitive, whole-word match (`\b` on both sides), so identifiers such as
 * `baseUrl` or `server_url` do not count as a hit for `url`.
 */
const NETWORK_KEYWORDS = /\b(?:fetch|http|https|connect|socket|tcp|udp|url|endpoint|host|addr|address|server|client|request|xhr|axios|got|superagent|node-fetch|undici)\b/i;

// ---------------------------------------------------------------------------
// Domain classification sets
// ---------------------------------------------------------------------------

/**
 * Trusted host names — documentation sites, package registries, major cloud providers,
 * reserved example domains and local addresses. No finding is generated for these.
 *
 * Entries are lower-case and are matched exactly against the extracted host name:
 * a parent entry does NOT implicitly trust its subdomains, which is intentional for
 * multi-tenant suffixes such as `azurewebsites.net` or `amazonaws.com`.
 */
const TRUSTED_DOMAINS = new Set([
  // Source forges and package registries
  'github.com', 'api.github.com', 'raw.githubusercontent.com', 'gist.github.com',
  'gitlab.com', 'bitbucket.org',
  'npmjs.org', 'www.npmjs.com', 'registry.npmjs.org',
  'pypi.org', 'files.pythonhosted.org',
  'crates.io', 'static.crates.io',
  'pkg.go.dev', 'proxy.golang.org',
  'rubygems.org', 'packagist.org',
  'nuget.org', 'api.nuget.org',

  // Microsoft ecosystem
  'microsoft.com', 'learn.microsoft.com', 'aka.ms', 'azure.com',
  'azurewebsites.net', 'azurestaticapps.net',
  'dev.azure.com', 'management.azure.com',
  'login.microsoftonline.com', 'graph.microsoft.com',
  'schemas.microsoft.com',
  'outlook.com', 'office.com', 'office365.com',

  // AI providers (primary)
  'anthropic.com', 'api.anthropic.com',
  'openai.com', 'api.openai.com',
  'huggingface.co', 'api-inference.huggingface.co',

  // Google / GCP
  'google.com', 'googleapis.com', 'gstatic.com', 'googleusercontent.com',
  'cloud.google.com',

  // AWS
  'amazonaws.com', 'aws.amazon.com', 'awsstatic.com',

  // Standards and documentation
  'stackoverflow.com',
  'developer.mozilla.org', 'mdn.io',
  'wikipedia.org', 'en.wikipedia.org',
  'www.w3.org', 'w3.org',
  'json-schema.org',
  'spdx.org',
  'creativecommons.org',
  'owasp.org',
  'ietf.org', 'rfc-editor.org', 'tools.ietf.org',
  'ecma-international.org',

  // CI/CD and devtools
  'travis-ci.com', 'travis-ci.org',
  'circleci.com',
  'codecov.io', 'coveralls.io',
  'snyk.io',
  'semver.org',
  'shields.io', 'img.shields.io',  // badge URLs in README

  // Reserved example domains (RFC 2606 section 3) plus the reserved "localhost" name.
  // "test.com" is deliberately NOT listed: RFC 2606 reserves the ".test" TLD, whereas
  // test.com is an ordinary registered domain and must not be trusted implicitly.
  'example.com', 'example.org', 'example.net',
  'localhost',

  // Local addresses (loopback and wildcard bind)
  '127.0.0.1', '0.0.0.0', '::1',
]);

/**
 * Suspicious domains known to be used for data exfiltration, webhook interception,
 * tunneling, URL shortening (which can redirect to C2), or paste-bin style exfiltration.
 *
 * Consulted by `scan` when classifying domain-based URLs: a hit produces a finding
 * with severity HIGH. Domains on TRUSTED_DOMAINS are skipped before this set is
 * looked at, so an entry here only takes effect for hosts that are not allowlisted.
 *
 * All entries are lower-case; `extractDomain` lower-cases host names before they are
 * compared against this set.
 *
 * Severity: HIGH
 */
const SUSPICIOUS_DOMAINS = new Set([
  // Webhook inspection / interception services
  'webhook.site', 'webhookinbox.com',
  'requestbin.com', 'requestbin.net', 'requestbin.org',
  'pipedream.net',
  'hookbin.com',
  'beeceptor.com',
  'requestcatcher.com',
  'smee.io',
  'hookdeck.com',

  // HTTP tunneling / ngrok-alikes
  'ngrok.io', 'ngrok.app', 'ngrok-free.app', 'ngrok.com',
  'serveo.net',
  'localtunnel.me',
  'localhost.run',
  'bore.pub',
  'telebit.cloud',
  'zrok.io',
  'pagekite.me',

  // Paste / ephemeral file sharing (exfiltration vectors)
  'pastebin.com', 'pastebin.pl',
  'paste.ee',
  'hastebin.com',
  'dpaste.org', 'dpaste.com',
  'sprunge.us',
  'ix.io',
  'clbin.com',

  // Ephemeral file hosting
  'transfer.sh',
  'file.io',
  'filedropper.com',
  'filebin.net',
  'tmpfiles.org',
  'temp.sh',

  // URL shorteners (can mask final destination)
  'bit.ly',
  'tinyurl.com',
  'is.gd',
  't.co',
  'goo.gl',
  'ow.ly',
  'buff.ly',
  'rebrand.ly',
  'shorturl.at',
  'cutt.ly',
  'tiny.cc',

  // Chat platform webhooks (legitimate, but suspicious in plugin code)
  'discord.gg',
  'discord.com',       // discord.com/api/webhooks is a common exfil target
  'slack.com',         // slack.com/api with a bot token in code is suspicious
  'telegram.org', 'api.telegram.org',
]);

// ---------------------------------------------------------------------------
// Local / loopback address detection
// ---------------------------------------------------------------------------

/**
 * Shapes of addresses that are considered local rather than public: loopback,
 * wildcard bind, the three RFC 1918 private ranges, link-local, and IPv6 loopback.
 * These are shape checks only — octets are matched with `\d+` and are not
 * range-validated here.
 */
const LOOPBACK_PATTERNS = [
  /^127\.\d+\.\d+\.\d+$/,    // loopback: 127.x.x.x
  /^0\.0\.0\.0$/,             // wildcard bind address
  /^10\.\d+\.\d+\.\d+$/,     // private, RFC 1918 (10/8)
  /^192\.168\.\d+\.\d+$/,    // private, RFC 1918 (192.168/16)
  /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/, // private, RFC 1918 (172.16/12)
  /^169\.254\.\d+\.\d+$/,    // link-local
  /^::1$/,                    // IPv6 loopback
];

/**
 * Tell whether an address string falls into one of the LOOPBACK_PATTERNS ranges.
 *
 * @param {string} ip - Address text to classify
 * @returns {boolean} true as soon as any pattern matches, false otherwise
 */
function isPrivateOrLoopback(ip) {
  for (const pattern of LOOPBACK_PATTERNS) {
    if (pattern.test(ip)) {
      return true;
    }
  }
  return false;
}

// ---------------------------------------------------------------------------
// URL normalization helpers
// ---------------------------------------------------------------------------

/**
 * Extract the effective host from a URL string, stripping scheme, userinfo, port and path.
 *
 * The host is lower-cased, a single trailing dot is removed, and the square brackets
 * around an IPv6 literal are dropped so that e.g. `http://[::1]:8080/` yields `::1`.
 *
 * When `new URL()` rejects the input, a best-effort fallback reads the authority part
 * of an http/https URL by hand. This also recovers the address from a bracketed IPv6
 * host whose closing `]` was cut off by the caller (e.g. `http://[::1`).
 *
 * @param {string} rawUrl - Candidate URL text
 * @returns {string|null} Normalized host, or null if no host can be determined
 */
function extractDomain(rawUrl) {
  if (typeof rawUrl !== 'string') return null;

  let host;
  try {
    host = new URL(rawUrl).hostname;
  } catch {
    // Fallback: take the authority (everything up to the first / ? #).
    const m = rawUrl.match(/^https?:\/\/([^/?#]+)/i);
    if (!m) return null;

    // Drop any "user:password@" prefix so credentials are never mistaken for the host.
    const authority = m[1].slice(m[1].lastIndexOf('@') + 1);

    if (authority.startsWith('[')) {
      // Bracketed IPv6 literal; tolerate a missing closing bracket.
      const end = authority.indexOf(']');
      host = end === -1 ? authority.slice(1) : authority.slice(1, end);
    } else {
      // Plain host: anything after the first colon is the port.
      host = authority.split(':')[0];
    }
  }

  const normalized = host
    .toLowerCase()
    .replace(/^\[|\]$/g, '') // IPv6 brackets as serialized by URL#hostname
    .replace(/\.$/, '');     // strip trailing dot

  return normalized || null;
}

/**
 * Shape test for a dotted-quad IPv4 host: four groups of one to three digits
 * separated by dots, and nothing else. Octet ranges are not checked here.
 *
 * @param {string} host - Host name to test
 * @returns {boolean} true when the host looks like an IPv4 address
 */
function isIpAddress(host) {
  const dottedQuad = /^(\d{1,3}\.){3}\d{1,3}$/;
  const looksNumeric = dottedQuad.test(host);
  return looksNumeric;
}

/**
 * Validate that a string is a dotted-quad IPv4 address whose four octets are each
 * written as one to three decimal digits with a value of 0-255.
 *
 * Each octet is checked textually before it is converted, because `Number()` alone
 * accepts the empty string (as 0) as well as forms such as "1e2", "0x10" or " 1",
 * which would otherwise let "1..2.3" or "1e2.0.0.1" pass as valid addresses.
 *
 * @param {string} host - Candidate address
 * @returns {boolean} true only for a well-formed IPv4 address; false for any
 *   malformed or non-string input
 */
function isValidIpv4(host) {
  if (typeof host !== 'string') return false;

  const octets = host.split('.');
  if (octets.length !== 4) return false;

  return octets.every((octet) => /^\d{1,3}$/.test(octet) && Number(octet) <= 255);
}

// ---------------------------------------------------------------------------
// DNS resolution with timeout
// ---------------------------------------------------------------------------

/** Hard upper bound, in milliseconds, on how long a single DNS lookup may take. */
const DNS_TIMEOUT_MS = 3000;
/** Maximum number of domains resolved per scan. */
const DNS_MAX_LOOKUPS = 50;

/**
 * Attempt to resolve a domain to IPv4, with a hard timeout.
 *
 * A single TTL-bearing query supplies both the address list and the low-TTL signal
 * (any record with a TTL under 60 seconds — an infrastructure churn indicator that
 * is common in rug-pulls). The timeout timer is always cleared once the lookup
 * settles, so a finished lookup never keeps the event loop alive.
 *
 * This function never rejects: a failed lookup, a timeout, or a throwing resolver
 * all yield `{ resolved: false, addresses: [], lowTtl: false }`.
 *
 * @param {string} domain - Host name to resolve
 * @param {object} [options]
 * @param {(domain: string) => Promise<Array<string | { address: string, ttl: number }>>} [options.resolver]
 *   Lookup function; defaults to `resolve4` with `{ ttl: true }`. Plain address
 *   strings are accepted too, in which case no TTL signal is available.
 * @param {number} [options.timeoutMs] - Timeout in milliseconds (default DNS_TIMEOUT_MS)
 * @returns {Promise<{ resolved: boolean, addresses: string[], lowTtl: boolean }>}
 */
async function resolveDomain(
  domain,
  { resolver = (name) => resolve4(name, { ttl: true }), timeoutMs = DNS_TIMEOUT_MS } = {}
) {
  // The resolver takes no abort signal, so the lookup is raced against a timer.
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error('dns_timeout')), timeoutMs);
  });

  try {
    const records = await Promise.race([resolver(domain), timeout]);
    const list = Array.isArray(records) ? records : [];

    const addresses = list
      .map((r) => (typeof r === 'string' ? r : r?.address))
      .filter((a) => typeof a === 'string');
    const lowTtl = list.some((r) => r !== null && typeof r === 'object' && r.ttl < 60);

    return { resolved: true, addresses, lowTtl };
  } catch {
    // Resolution failure and timeout are expected outcomes, reported via `resolved`.
    return { resolved: false, addresses: [], lowTtl: false };
  } finally {
    clearTimeout(timer);
  }
}

// ---------------------------------------------------------------------------
// Per-file scanning
// ---------------------------------------------------------------------------

/**
 * Collect every URL and bare IP reference found in one file's text.
 *
 * The text is processed line by line (1-based line numbers). Each http/https URL
 * has trailing sentence punctuation trimmed and is then filed under either the
 * IP-URL map or the domain-URL map, depending on whether its host has the shape
 * of an IPv4 address. Bare IPs are only collected from lines that also contain a
 * network keyword, must pass octet validation, and are ignored when the same text
 * already appears inside a URL matched on that line.
 *
 * @param {string} content  - File text content
 * @param {string} relPath  - Relative file path for finding output
 * @returns {{ urlOccurrences: Map<string, { relPath: string, line: number }[]>,
 *             ipUrlOccurrences: Map<string, { relPath: string, line: number }[]>,
 *             bareIpOccurrences: Map<string, { relPath: string, line: number }[]> }}
 */
function scanFileContent(content, relPath) {
  const urlOccurrences = new Map();    // URL with a domain host → locations
  const ipUrlOccurrences = new Map();  // URL with an IPv4 host → locations
  const bareIpOccurrences = new Map(); // IP outside any URL → locations

  // Append one location to the list stored under `key`, creating the list on first use.
  const record = (bucket, key, line) => {
    if (!bucket.has(key)) bucket.set(key, []);
    bucket.get(key).push({ relPath, line });
  };

  content.split('\n').forEach((text, idx) => {
    const lineNo = idx + 1;

    // URLs first: the matches are reused below to de-duplicate bare IPs.
    const urlMatches = Array.from(text.matchAll(URL_REGEX));
    for (const match of urlMatches) {
      const rawUrl = match[0].replace(/[.,;:!?]+$/, ''); // drop trailing punctuation
      const domain = extractDomain(rawUrl);
      if (!domain) continue;

      const bucket = isIpAddress(domain) ? ipUrlOccurrences : urlOccurrences;
      record(bucket, rawUrl, lineNo);
    }

    // Bare IPs are only interesting on lines that mention networking at all.
    if (!NETWORK_KEYWORDS.test(text)) return;

    for (const [ip] of text.matchAll(BARE_IP_REGEX)) {
      if (!isValidIpv4(ip)) continue;

      // Already accounted for as part of a URL on this line.
      const insideUrl = urlMatches.some((u) => u[0].includes(ip));
      if (insideUrl) continue;

      record(bareIpOccurrences, ip, lineNo);
    }
  });

  return { urlOccurrences, ipUrlOccurrences, bareIpOccurrences };
}

// ---------------------------------------------------------------------------
// Merge occurrence maps across files
// ---------------------------------------------------------------------------

/**
 * Fold every entry of `source` into `target`, in place.
 *
 * Keys new to `target` receive a copy of the source list (so the two maps never
 * share an array); keys already present have the source locations appended.
 *
 * @param {Map<string, Array>} target - Map that accumulates the merged lists
 * @param {Map<string, Array>} source - Map whose entries are merged in; not modified
 */
function mergeOccurrences(target, source) {
  for (const [key, locs] of source) {
    if (target.has(key)) {
      const existing = target.get(key);
      existing.push(...locs);
      continue;
    }
    const copy = [...locs];
    target.set(key, copy);
  }
}

// ---------------------------------------------------------------------------
// Evidence formatter
// ---------------------------------------------------------------------------

/**
 * Render an occurrence list as a short, human-readable evidence string.
 *
 * Locations are written as `relPath:line`, duplicates are dropped while keeping
 * first-seen order, and only the first three are spelled out; any remainder is
 * summarized as `+N more`. Parts are joined with ", ".
 *
 * @param {Array<{ relPath: string, line: number }>} occurrences
 * @returns {string} Compact location summary (empty string for no occurrences)
 */
function formatLocations(occurrences) {
  const maxShown = 3;

  // A Set keeps insertion order, so this de-duplicates without reordering.
  const labels = [
    ...new Set(Array.from(occurrences, ({ relPath, line }) => `${relPath}:${line}`)),
  ];

  if (labels.length <= maxShown) {
    return labels.join(', ');
  }

  const hidden = labels.length - maxShown;
  return [...labels.slice(0, maxShown), `+${hidden} more`].join(', ');
}

// ---------------------------------------------------------------------------
// Public scanner entry point
// ---------------------------------------------------------------------------

/**
 * Find the SUSPICIOUS_DOMAINS entry that covers a host name.
 *
 * The host itself and each of its parent domains are checked, always on label
 * boundaries: `abc123.ngrok-free.app` is covered by `ngrok-free.app`, while
 * `bigt.co` is NOT covered by `t.co`. Tunnel, webhook and request-bin services hand
 * out per-user subdomains, so an exact-host comparison alone would miss them.
 *
 * @param {string} domain - Lower-case host name
 * @returns {string|null} The matching listed domain, or null when none matches
 */
function findSuspiciousDomain(domain) {
  const labels = domain.split('.');
  for (let start = 0; start < labels.length; start++) {
    const candidate = labels.slice(start).join('.');
    if (SUSPICIOUS_DOMAINS.has(candidate)) return candidate;
  }
  return null;
}

/**
 * Scan a target path for outbound URLs and network references.
 *
 * Phases:
 *   1. Read every discovered file and collect URL / IP occurrences.
 *   2. Group domain-based URLs by host.
 *   3. Resolve up to DNS_MAX_LOOKUPS non-trusted hosts, in parallel.
 *   4. Emit HIGH findings for hosts covered by SUSPICIOUS_DOMAINS (including their
 *      subdomains) and INFO findings for other non-trusted hosts.
 *   5. Emit findings for IP-based URLs: MEDIUM for private/loopback, HIGH for public.
 *      Addresses on the trusted allowlist are not reported.
 *   6. Emit MEDIUM findings for bare public IPs seen in a network context.
 *
 * Any exception raised inside the phases is caught and reported through an 'error'
 * result envelope that still carries the findings gathered so far.
 *
 * @param {string} targetPath - Absolute path to scan (file or directory root).
 *   Not read by this function; the file list comes from `discovery`.
 * @param {{ files: Array<{ absPath: string, relPath: string, ext: string, size: number }> }} discovery
 *   Pre-computed file discovery result from the orchestrator.
 * @returns {Promise<object>} Scanner result envelope
 */
export async function scan(targetPath, discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;

  // Aggregate occurrence maps across all files
  const allUrlOccurrences = new Map();    // rawUrl → [{relPath, line}]
  const allIpUrlOccurrences = new Map();  // rawUrl → [{relPath, line}]
  const allBareIpOccurrences = new Map(); // ip → [{relPath, line}]

  try {
    // --- Phase 1: File scanning ---
    for (const fileInfo of discovery.files) {
      const content = await readTextFile(fileInfo.absPath);
      if (content === null) continue;
      filesScanned++;

      const { urlOccurrences, ipUrlOccurrences, bareIpOccurrences } =
        scanFileContent(content, fileInfo.relPath);

      mergeOccurrences(allUrlOccurrences, urlOccurrences);
      mergeOccurrences(allIpUrlOccurrences, ipUrlOccurrences);
      mergeOccurrences(allBareIpOccurrences, bareIpOccurrences);
    }

    // --- Phase 2: Domain deduplication and classification ---
    // Collect unique domains from standard URLs, keyed by domain → [rawUrls]
    const domainToUrls = new Map();
    for (const rawUrl of allUrlOccurrences.keys()) {
      const domain = extractDomain(rawUrl);
      if (!domain) continue;
      if (!domainToUrls.has(domain)) domainToUrls.set(domain, []);
      domainToUrls.get(domain).push(rawUrl);
    }

    // --- Phase 3: DNS resolution for suspicious + unknown domains (optional) ---
    // Lookups are independent, so they run in parallel; the cap bounds both the
    // number of queries and the worst-case wall-clock time (one timeout, not fifty).
    // Hosts containing ":" are IPv6 literals, not resolvable names.
    const lookupTargets = [...domainToUrls.keys()]
      .filter((d) => !TRUSTED_DOMAINS.has(d) && !isIpAddress(d) && !d.includes(':'))
      .slice(0, DNS_MAX_LOOKUPS);
    const lookupResults = await Promise.all(lookupTargets.map((d) => resolveDomain(d)));
    // domain → { resolved, addresses, lowTtl }
    const dnsResults = new Map(lookupTargets.map((d, i) => [d, lookupResults[i]]));

    // --- Phase 4: Generate findings for domain-based URLs ---
    for (const [domain, rawUrls] of domainToUrls) {
      // Skip trusted domains entirely
      if (TRUSTED_DOMAINS.has(domain)) continue;

      // Gather all occurrence locations for this domain
      const allLocs = rawUrls.flatMap((u) => allUrlOccurrences.get(u) || []);
      const locationStr = formatLocations(allLocs);

      // Choose a representative URL for evidence (shortest/cleanest).
      // Sort a copy so the list held in domainToUrls keeps its discovery order.
      const repUrl = [...rawUrls].sort((a, b) => a.length - b.length)[0];
      const repUrlRedacted = redact(repUrl, 60, 0);

      const dnsInfo = dnsResults.get(domain);
      let dnsNote = '';
      if (dnsInfo) {
        if (!dnsInfo.resolved) {
          dnsNote = ` DNS: NXDOMAIN or unreachable.`;
        } else if (dnsInfo.lowTtl) {
          dnsNote = ` DNS resolved (LOW TTL <60s — suspicious infrastructure churn).`;
        } else {
          dnsNote = ` DNS resolved to: ${dnsInfo.addresses.slice(0, 3).join(', ')}.`;
        }
      }

      const suspiciousMatch = findSuspiciousDomain(domain);

      if (suspiciousMatch !== null) {
        // HIGH: known exfiltration/tunneling/shortener domain, or a subdomain of one
        const listedAs =
          suspiciousMatch === domain
            ? ''
            : ` (subdomain of listed domain "${suspiciousMatch}")`;

        allFindings.push(
          finding({
            scanner: 'NET',
            severity: SEVERITY.HIGH,
            title: `Suspicious network endpoint: ${domain}`,
            description:
              `Domain "${domain}"${listedAs} is known to be used for data exfiltration, webhook interception, ` +
              `tunneling (bypasses corporate egress filtering), URL shortening (masks final destination), ` +
              `or ephemeral file sharing. Its presence in plugin/skill code is a strong indicator of ` +
              `malicious intent or accidental exfiltration risk.${dnsNote}`,
            file: allLocs[0]?.relPath || null,
            line: allLocs[0]?.line || null,
            evidence: `${repUrlRedacted} | found at: ${locationStr}`,
            owasp: 'LLM02',
            recommendation:
              'This domain is commonly used for data exfiltration or tunneling. ' +
              'Verify this URL is necessary and intended. If this is test code, move it to ' +
              'a properly isolated test fixture. If it is production code, remove it.',
          })
        );
      } else {
        // INFO: unknown domain — catalog for review, no automatic blocking
        const lowTtlNote =
          dnsInfo?.resolved && dnsInfo?.lowTtl
            ? ' Low DNS TTL detected — possible domain reassignment risk (rug-pull vector).'
            : '';

        allFindings.push(
          finding({
            scanner: 'NET',
            severity: SEVERITY.INFO,
            title: `Unknown external domain: ${domain}`,
            description:
              `Domain "${domain}" is referenced in the codebase but is not on the trusted allowlist. ` +
              `This may be a legitimate third-party dependency, or it may be an unexpected outbound call. ` +
              `Review all network references to verify they are necessary and intentional.${dnsNote}${lowTtlNote}`,
            file: allLocs[0]?.relPath || null,
            line: allLocs[0]?.line || null,
            evidence: `${repUrlRedacted} | found at: ${locationStr}`,
            owasp: 'LLM03',
            recommendation:
              'Verify this external domain is a known, trusted dependency. ' +
              'Document its purpose if it is legitimate.',
          })
        );
      }
    }

    // --- Phase 5: IP-based URL findings ---
    for (const [rawUrl, locs] of allIpUrlOccurrences) {
      const domain = extractDomain(rawUrl);
      if (!domain) continue;
      if (!isValidIpv4(domain)) continue;

      // Local addresses on the trusted allowlist (e.g. 127.0.0.1) are not reported,
      // matching how trusted host names are treated in Phase 4.
      if (TRUSTED_DOMAINS.has(domain)) continue;

      // Private/loopback addresses are still reported, but as MEDIUM rather than HIGH
      const isPrivate = isPrivateOrLoopback(domain);
      const severity = isPrivate ? SEVERITY.MEDIUM : SEVERITY.HIGH;
      const locationStr = formatLocations(locs);
      const urlRedacted = redact(rawUrl, 60, 0);

      allFindings.push(
        finding({
          scanner: 'NET',
          severity,
          title: `IP-based URL: ${domain}`,
          description:
            isPrivate
              ? `URL "${urlRedacted}" uses a private/loopback IP address instead of a domain name. ` +
                `While likely targeting a local service, hardcoded private IPs reduce portability ` +
                `and can indicate development-time infrastructure left in production code.`
              : `URL "${urlRedacted}" uses a public IP address instead of a domain name. ` +
                `IP-based URLs bypass DNS-based security controls, certificate transparency, ` +
                `and many proxy/firewall filtering mechanisms. This is a common technique used ` +
                `by malware to connect to C2 infrastructure that avoids domain reputation checks.`,
          file: locs[0]?.relPath || null,
          line: locs[0]?.line || null,
          evidence: `${urlRedacted} | found at: ${locationStr}`,
          owasp: isPrivate ? 'LLM03' : 'LLM02',
          recommendation: isPrivate
            ? 'Replace hardcoded private IP with a configurable hostname or environment variable.'
            : 'IP-based URLs bypass DNS and certificate validation. Use a domain name instead.',
        })
      );
    }

    // --- Phase 6: Bare IP findings ---
    for (const [ip, locs] of allBareIpOccurrences) {
      if (!isValidIpv4(ip)) continue;
      if (isPrivateOrLoopback(ip)) continue; // Low signal for bare private IPs — skip

      const locationStr = formatLocations(locs);

      allFindings.push(
        finding({
          scanner: 'NET',
          severity: SEVERITY.MEDIUM,
          title: `Bare public IP address in network context: ${ip}`,
          description:
            `A public IP address "${ip}" appears near network-related code (fetch, http, connect, etc.) ` +
            `without being part of a URL. This may indicate a hardcoded server address that bypasses ` +
            `DNS resolution and certificate validation controls.`,
          file: locs[0]?.relPath || null,
          line: locs[0]?.line || null,
          evidence: `IP: ${ip} | found at: ${locationStr}`,
          owasp: 'LLM02',
          recommendation:
            'IP-based URLs bypass DNS and certificate validation. Use a domain name instead.',
        })
      );
    }

    const durationMs = Date.now() - startMs;
    return scannerResult('network-mapper', 'ok', allFindings, filesScanned, durationMs);
  } catch (err) {
    // Report the failure through the envelope, keeping whatever was found so far.
    const durationMs = Date.now() - startMs;
    return scannerResult(
      'network-mapper',
      'error',
      allFindings,
      filesScanned,
      durationMs,
      String(err?.message || err)
    );
  }
}