// workflow-scanner.mjs — E11 GitHub/Forgejo Actions injection scanner
// Detects `${{ <dangerous-field> }}` interpolations inside `run:` step
// blocks under privileged triggers. Sink-restricted (only `run:` is a
// shell sink — `if:`/`with:`/`env:` are evaluated by the runner's
// expression engine, not the shell, so they are NOT injection sinks).
//
// Discovery: explicitly probes `<target>/.github/workflows/` and
// `<target>/.forgejo/workflows/`. discoverFiles() (file-discovery.mjs)
// does not support glob include patterns, so we walk the two
// directories directly via node:fs/promises.
//
// Knowledge: knowledge/workflow-injection-patterns.md (23-field
// blacklist + severity matrix + Forgejo divergences).
//
// Out of scope (deferred):
//   - Composite-action input tracing
//   - Reusable-workflow call analysis
//   - GITHUB_ENV poisoning detection
//   - Zombie-workflow scanning across non-default branches
//
// Zero external dependencies.

import { readdir, readFile, stat } from 'node:fs/promises';
import { join, relative, basename } from 'node:path';
import { existsSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { dirname } from 'node:path';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { parseWorkflow } from './lib/workflow-yaml-state.mjs';

const __dirname = dirname(fileURLToPath(import.meta.url));

const MAX_FILES = 100;
const MAX_FILE_SIZE = 256 * 1024;
const SCANNER_NAME = 'workflow';
const SCANNER_PREFIX = 'WFL';

// ---------------------------------------------------------------------------
// 23-field canonical blacklist (GHSL Security Lab 17 + 6 GlueStack-class
// additions per research/01-github-forgejo-actions-injection.md). Stored
// as patterns matching the inner expression after `${{ ` and before ` }}`.
// All patterns match BOTH `github.*` and `forgejo.*` prefixes.
// ---------------------------------------------------------------------------

const PREFIX = '(?:github|forgejo)';

const DANGEROUS_FIELDS = [
  // GHSL 17
  `${PREFIX}\\.event\\.issue\\.title`,
  `${PREFIX}\\.event\\.issue\\.body`,
  `${PREFIX}\\.event\\.pull_request\\.title`,
  `${PREFIX}\\.event\\.pull_request\\.body`,
  `${PREFIX}\\.event\\.pull_request\\.head\\.ref`,
  `${PREFIX}\\.event\\.pull_request\\.head\\.label`,
  `${PREFIX}\\.event\\.pull_request\\.head\\.repo\\.default_branch`,
  `${PREFIX}\\.event\\.comment\\.body`,
  `${PREFIX}\\.event\\.review\\.body`,
  `${PREFIX}\\.event\\.commits\\.\\*\\.message`,
  `${PREFIX}\\.event\\.commits\\.\\*\\.author\\.email`,
  `${PREFIX}\\.event\\.commits\\.\\*\\.author\\.name`,
  `${PREFIX}\\.event\\.head_commit\\.message`,
  `${PREFIX}\\.event\\.head_commit\\.author\\.email`,
  `${PREFIX}\\.event\\.head_commit\\.author\\.name`,
  `${PREFIX}\\.event\\.pages\\.\\*\\.page_name`,
  `${PREFIX}\\.head_ref`,
  // GlueStack-class additions
  `${PREFIX}\\.event\\.discussion\\.title`,
  `${PREFIX}\\.event\\.discussion\\.body`,
  `${PREFIX}\\.event\\.discussion\\.user\\.login`,
  `${PREFIX}\\.event\\.inputs\\.[\\w-]+`,
  `${PREFIX}\\.event\\.client_payload\\.[\\w-]+`,
  `inputs\\.[\\w-]+`,
];

const DANGEROUS_RE = new RegExp(
  '(?:' +
    DANGEROUS_FIELDS.map(p => p.replace(/\\\.\\\*/g, '\\.[^.]+')).join('|') +
  ')',
);

// Numeric/hex/fixed-string fields — INFO-level, never injection sinks
const SAFE_FIELDS_RE = new RegExp(
  '^(?:' +
    `${PREFIX}\\.event\\.pull_request\\.number|` +
    `${PREFIX}\\.event\\.pull_request\\.head\\.sha|` +
    `${PREFIX}\\.run_id|` +
    `${PREFIX}\\.run_number|` +
    `${PREFIX}\\.sha|` +
    `${PREFIX}\\.event\\.action|` +
    `${PREFIX}\\.event\\.repository\\.full_name` +
  ')$',
);

// Triggers that grant attacker-controlled context with elevated
// privileges (read/write tokens).
const PRIVILEGED_TRIGGERS = new Set([
  'pull_request_target',
  'issue_comment',
  'discussion',
  'discussion_comment',
  'workflow_run',
]);

// Triggers where attacker can supply input but token is read-only or
// scoped (still an injection sink, just lower severity).
const SEMI_PRIVILEGED_TRIGGERS = new Set([
  'pull_request',
  'workflow_dispatch',
  'repository_dispatch',
]);

// Sink contexts that ARE shell:
const SINK_PARENTS = new Set(['run']);
// Contexts where ${{ ... }} is evaluated by the runner expression
// engine, NOT the shell. These are sink mismatches, not injection.
const NON_SINK_PARENTS = new Set(['if', 'with', 'env', 'name', 'runs-on', 'timeout-minutes', 'continue-on-error']);

// B4: auth-bypass — github.actor or forgejo.actor compared against a
// bot identity in if: contexts (Synacktiv 2023 Dependabot spoofing).
const AUTH_BYPASS_RE = /\b(?:github|forgejo)\.actor\s*(?:==|!=)\s*['"][\w-]+\[bot\]['"]/;

// ---------------------------------------------------------------------------
// Discovery
// ---------------------------------------------------------------------------

/**
 * Walk `<targetPath>/.github/workflows/` and `<targetPath>/.forgejo/workflows/`
 * one level deep. Return absolute paths of `.yml` and `.yaml` files,
 * combined and capped at MAX_FILES total.
 *
 * @param {string} targetPath
 * @returns {Promise<string[]>}
 */
export async function discoverWorkflows(targetPath) {
  const out = [];
  const dirs = [
    join(targetPath, '.github', 'workflows'),
    join(targetPath, '.forgejo', 'workflows'),
  ];
  for (const dir of dirs) {
    if (!existsSync(dir)) continue;
    let entries;
    try {
      entries = await readdir(dir, { withFileTypes: true });
    } catch {
      continue;
    }
    for (const entry of entries) {
      if (!entry.isFile()) continue;
      if (!/\.ya?ml$/i.test(entry.name)) continue;
      out.push(join(dir, entry.name));
      if (out.length >= MAX_FILES) return out;
    }
  }
  return out;
}

// ---------------------------------------------------------------------------
// Severity matrix
// ---------------------------------------------------------------------------

/**
 * Map (triggerSet, fieldClass) → severity.
 *
 * @param {Set<string>} triggers
 * @param {'dangerous'|'safe'|'other'} fieldClass
 * @returns {string|null}  SEVERITY constant, or null = suppress
 */
function severityFor(triggers, fieldClass) {
  if (fieldClass === 'safe') return SEVERITY.INFO;
  if (fieldClass !== 'dangerous') return null;
  for (const t of triggers) {
    if (PRIVILEGED_TRIGGERS.has(t)) return SEVERITY.HIGH;
  }
  for (const t of triggers) {
    if (SEMI_PRIVILEGED_TRIGGERS.has(t)) return SEVERITY.MEDIUM;
  }
  // No relevant trigger → still flag at MEDIUM (e.g. push events
  // can still be reachable from forks via PRs).
  return SEVERITY.MEDIUM;
}

function classifyField(expr) {
  if (SAFE_FIELDS_RE.test(expr)) return 'safe';
  if (DANGEROUS_RE.test(expr)) return 'dangerous';
  return 'other';
}

// ---------------------------------------------------------------------------
// Platform detection (filename-based; keeps schema unchanged)
// ---------------------------------------------------------------------------

function detectPlatform(absPath) {
  if (absPath.includes('/.forgejo/workflows/')) return 'forgejo';
  if (absPath.includes('/.github/workflows/')) return 'github';
  return 'unknown';
}

// ---------------------------------------------------------------------------
// Recommendation text
// ---------------------------------------------------------------------------

function buildRecommendation(platform, parent) {
  const base = parent === 'run'
    ? 'Bind the expression to an env var first, then consume it via $VAR in the run script: `env: { TITLE: ${{ ... }} }; run: echo "$TITLE"`. Re-interpolating ${{ env.TITLE }} inside run: cancels the mitigation.'
    : 'This expression is not a shell injection sink, but the underlying field is attacker-controlled. Review its downstream use.';
  if (platform === 'forgejo') {
    return base + ' Forgejo note: job-level `permissions:` is ignored on Forgejo (admin-guide); rely on token scoping at server level instead.';
  }
  return base;
}

// ---------------------------------------------------------------------------
// Scan one workflow file
// ---------------------------------------------------------------------------

async function scanFile(absPath, targetPath, stderrLog) {
  const findings = [];
  const stat_ = await stat(absPath).catch(() => null);
  if (!stat_ || stat_.size > MAX_FILE_SIZE) return findings;
  const text = await readFile(absPath, 'utf8').catch(() => null);
  if (text === null) return findings;

  const relPath = relative(targetPath, absPath) || basename(absPath);
  const platform = detectPlatform(absPath);

  let parsed;
  try {
    parsed = parseWorkflow(text);
  } catch (err) {
    stderrLog(`[workflow-scanner] parse error in ${relPath}: ${err.message}\n`);
    return findings;
  }

  const triggers = parsed.triggers;

  // Forgejo divergence advisory: `workflow_run` is not documented for
  // Forgejo. Emit to stderr (not as a finding) so the user knows the
  // severity-matrix logic applied as if it were privileged.
  if (platform === 'forgejo' && triggers.has('workflow_run')) {
    stderrLog(
      `[workflow-scanner] ${relPath}: 'workflow_run' trigger is not documented for Forgejo Actions; ` +
      `severity logic still treats it as privileged. See knowledge/workflow-injection-patterns.md §Forgejo.\n`
    );
  }

  const platformLabel = platform === 'forgejo' ? 'Forgejo' : 'GitHub';
  const triggerList = [...triggers].join(', ') || 'unknown';

  // B4: collect env: bindings (key -> source-expression). Used for
  // re-interpolation detection. A binding is an event whose parent is
  // a key under an `env:` block — i.e. parentChain includes 'env'
  // and the parent is not 'env' itself.
  const envBindings = new Map();
  for (const ev of parsed.events) {
    if (!ev.parentChain.includes('env')) continue;
    if (ev.parent === 'env') continue;
    if (SINK_PARENTS.has(ev.parent)) continue;
    if (NON_SINK_PARENTS.has(ev.parent)) continue;
    envBindings.set(ev.parent, ev.expr);
  }

  for (const ev of parsed.events) {
    // B4: auth-bypass first — fires only on if: events
    if (ev.parent === 'if' && AUTH_BYPASS_RE.test(ev.expr)) {
      findings.push(finding({
        scanner: SCANNER_PREFIX,
        severity: SEVERITY.MEDIUM,
        title: `Actor auth-bypass: ${platformLabel} workflow trusts bot identity`,
        description:
          `Actor auth-bypass: if-condition trusts bot identity that can be ` +
          `spoofed via pull_request_target. ${platformLabel} workflow at ` +
          `${relPath}: ${ev.expr}.`,
        file: relPath,
        line: ev.line,
        evidence: `\${{ ${ev.expr} }}`,
        owasp: 'LLM06',
        recommendation:
          'Use `github.event.pull_request.user.login` (immutable per PR) ' +
          'instead of `github.actor` for authorization decisions. The actor ' +
          'name can be spoofed via Synacktiv-2023 Dependabot path. If the ' +
          'check must remain, gate it on an `id-token` OIDC claim.',
      }));
      continue;
    }

    if (NON_SINK_PARENTS.has(ev.parent)) continue;
    if (!SINK_PARENTS.has(ev.parent)) continue;

    // B4: re-interpolation pattern — `${{ env.<KEY> }}` inside run:
    // where <KEY> was bound from a blacklisted field via top-level
    // or job-level env:. Cancels the env-indirection mitigation.
    const reinterpMatch = ev.expr.match(/^env\.([\w-]+)$/);
    if (reinterpMatch) {
      const key = reinterpMatch[1];
      const source = envBindings.get(key);
      if (source && DANGEROUS_RE.test(source)) {
        findings.push(finding({
          scanner: SCANNER_PREFIX,
          severity: SEVERITY.MEDIUM,
          title: `Re-interpolation: env.${key} re-injects ${source.split(/\s+/)[0]} at ${platformLabel} run:`,
          description:
            `Re-interpolation: env.${key} was set from \${{ ${source} }}; reading via ` +
            `\${{ env.${key} }} in run: re-injects the unsafe value (Appsmith ` +
            `GHSL-2024-277 stealth pattern). Workflow: ${relPath}.`,
          file: relPath,
          line: ev.line,
          evidence: `\${{ env.${key} }}`,
          owasp: 'LLM02',
          recommendation:
            `Consume the env var via \$${key} (shell variable) inside run:, ` +
            `not via \${{ env.${key} }}. Template substitution happens before ` +
            `shell parsing — re-interpolating cancels the env-indirection ` +
            'mitigation and re-introduces the original injection.',
        }));
        continue;
      }
    }

    const fieldClass = classifyField(ev.expr);
    if (fieldClass === 'other') continue;

    const severity = severityFor(triggers, fieldClass);
    if (!severity) continue;

    findings.push(finding({
      scanner: SCANNER_PREFIX,
      severity,
      title: severity === SEVERITY.INFO
        ? `Safe expression in ${platformLabel} workflow run:`
        : `Workflow injection: ${platformLabel} ${ev.expr} in run: under ${triggerList}`,
      description:
        `${platformLabel} workflow at ${relPath} interpolates \${{ ${ev.expr} }} ` +
        `inside a run: step. Triggers: ${triggerList}. ` +
        `Field class: ${fieldClass}. Block scalar: ${ev.blockScalar}.`,
      file: relPath,
      line: ev.line,
      evidence: `\${{ ${ev.expr} }}`,
      owasp: 'LLM02',
      recommendation: buildRecommendation(platform, ev.parent),
    }));
  }

  return findings;
}

// ---------------------------------------------------------------------------
// Public entry — orchestrator-compatible
// ---------------------------------------------------------------------------

/**
 * Scan a target path for workflow injection.
 *
 * @param {string} targetPath
 * @param {object} [_discovery]   Ignored — workflow-scanner does its own
 *                                directory probe.
 * @returns {Promise<object>} scannerResult envelope
 */
export async function scan(targetPath, _discovery) {
  const startMs = Date.now();
  const allFindings = [];
  let filesScanned = 0;
  const stderrLog = (msg) => process.stderr.write(msg);

  try {
    const files = await discoverWorkflows(targetPath);
    for (const f of files) {
      filesScanned++;
      const fileFindings = await scanFile(f, targetPath, stderrLog);
      allFindings.push(...fileFindings);
    }
    return scannerResult(SCANNER_NAME, 'ok', allFindings, filesScanned, Date.now() - startMs);
  } catch (err) {
    return scannerResult(
      SCANNER_NAME,
      'error',
      allFindings,
      filesScanned,
      Date.now() - startMs,
      err.message,
    );
  }
}

// ---------------------------------------------------------------------------
// CLI entry
// ---------------------------------------------------------------------------

const isDirectRun = process.argv[1] === fileURLToPath(import.meta.url);
if (isDirectRun) {
  const target = process.argv[2];
  if (!target) {
    console.error('Usage: node workflow-scanner.mjs <target-path>');
    process.exit(1);
  }
  scan(target).then(result => {
    process.stdout.write(JSON.stringify(result, null, 2) + '\n');
  });
}