ktg-plugin-marketplace/plugins/llm-security/scanners/lib/workflow-yaml-state.mjs

// workflow-yaml-state.mjs — Line-based YAML state machine for E11
// (workflow-scanner). Zero dependencies. Tracks indentation, parent
// context, and `run:` block-scalar entry/exit so the scanner can
// distinguish injection sinks (`run:`) from sink-mismatch contexts
// (`if:`, `env:`, `with:`).
//
// Why hand-roll instead of importing a YAML library:
//   - Zero-dependency invariant (CLAUDE.md)
//   - Workflows live in `.github/workflows/` and `.forgejo/workflows/`,
//     have a constrained shape (top-level `on:`, `jobs:`, with each
//     job a mapping of {steps, env, …}). A line-based state machine
//     captures everything we need without a full YAML parser.
//
// Out of scope:
//   - Anchors / aliases (treated as no-op; rarely used in workflows)
//   - Multi-line flow scalars spanning lines via `... \n ...`
//   - Full `${{ <expr> }}` AST (we extract substring text only)

// One `${{ <expr> }}` occurrence. Group 1 is the text between the
// braces; the lazy quantifier makes several expressions on one line
// match separately. Carries the `g` flag, so `lastIndex` is stateful
// and must be reset before each scan.
const EXPR_RE = /\$\{\{\s*([\s\S]+?)\s*\}\}/g;
// `<key>: <value>` anchored at the start of the string (applied to
// trimmed lines). Group 1 = key, group 2 = everything after the colon.
const KV_RE = /^([A-Za-z_][\w-]*)\s*:\s*(.*)$/;
// Same shape as KV_RE for a list item: `- <key>: <value>`.
const LIST_KV_RE = /^-\s+([A-Za-z_][\w-]*)\s*:\s*(.*)$/;
// A lowercase/underscore name followed by `:` or end of string; group 1
// is the name. Used for mapping keys under `on:`.
const TRIGGER_RE = /^([a-z_]+)(?::|$)/;
// Values that open a YAML block scalar: literal `|` or folded `>`, with
// an optional chomping indicator. Explicit indentation indicators
// (e.g. `|2`) are not listed here.
const BLOCK_SCALAR_VALUES = new Set(['|', '>', '|-', '>-', '|+', '>+']);

/**
 * Remove a trailing YAML comment from a single line.
 *
 * A `#` opens a comment only when it is the first character of the line
 * or is preceded by whitespace. It is left alone when it sits inside a
 * `${{ ... }}` expression that is closed on the same line, so text such
 * as `${{ contains(x, ' #1') }}` survives intact. `#` inside quoted
 * strings is NOT protected: workflows rarely embed it, and an over-eager
 * strip is acceptable since the stripped text is never written back.
 *
 * Works on lines that still carry a trailing `\r` (CRLF input): the
 * carriage return is dropped together with the comment.
 *
 * @param {string} line - One line of workflow text (no `\n`).
 * @returns {string} The line up to, but not including, the comment and
 *   the single whitespace character that introduced it; the unchanged
 *   line when it holds no comment.
 */
function stripComments(line) {
  let searchFrom = 0;
  for (;;) {
    const hash = line.indexOf('#', searchFrom);
    if (hash === -1) return line;
    searchFrom = hash + 1;

    // `a#b` is plain text; only `^#` or `<space>#` can open a comment.
    const opensComment = hash === 0 || /\s/.test(line[hash - 1]);
    if (!opensComment) continue;

    // Inside `${{ ... }}` when the nearest opener to the left has its
    // first closing `}}` to the right of this `#`. An opener that is
    // never closed on this line does not protect anything.
    const exprOpen = line.lastIndexOf('${{', hash);
    if (exprOpen !== -1 && line.indexOf('}}', exprOpen + 3) > hash) continue;

    // Drop the comment plus the one whitespace character before it.
    return line.slice(0, hash === 0 ? 0 : hash - 1);
  }
}

/**
 * Width of the leading indent of `line`, counted in characters. A tab is
 * not valid YAML indentation, so it is simply counted as one column
 * like a space.
 */
function getIndent(line) {
  // Index of the first character that is neither space nor tab.
  const firstContent = line.search(/[^ \t]/);
  // No such character: the whole line is indentation.
  return firstContent === -1 ? line.length : firstContent;
}

/**
 * Collect every `${{ <expr> }}` occurrence on one line.
 *
 * @param {string} rawLine - Line text to scan.
 * @param {number} lineNum - Line number to stamp on each result.
 * @returns {{line: number, column: number, expr: string}[]} One entry per
 *   occurrence, in order; `column` is the 1-based offset of the `$` and
 *   `expr` is the trimmed text between the braces.
 */
function findExpressions(rawLine, lineNum) {
  // EXPR_RE is a shared global-flag regex; start every scan from offset 0.
  EXPR_RE.lastIndex = 0;
  return Array.from(rawLine.matchAll(EXPR_RE), (hit) => ({
    line: lineNum,
    column: hit.index + 1,
    expr: hit[1].trim(),
  }));
}

/**
 * Extract the set of triggers declared by the top-level `on:` key.
 * Handles the four common forms:
 *
 *   - string:        `on: push`
 *   - inline list:   `on: [push, pull_request]`
 *   - block list:    `on:` followed by `- push` items
 *   - block mapping: `on:` followed by `push:` / `pull_request:` keys
 *
 * The key may be written bare or quoted (`"on":` / `'on':`); quoting is
 * common in workflows that avoid YAML's boolean reading of `on`. Lines
 * may still carry a trailing `\r` from CRLF input. Only the first
 * top-level `on` key is considered.
 *
 * @param {string[]} lines - Workflow text split into lines.
 * @returns {Set<string>} Trigger names; empty when no `on:` key is found.
 */
export function extractTriggers(lines) {
  const triggers = new Set();
  const unquote = (s) => s.trim().replace(/^["']|["']$/g, '');
  // Drop a CRLF remnant first so comment stripping and the end-anchored
  // patterns below behave the same as on LF input.
  const clean = (line) => stripComments(line.replace(/\r$/, ''));

  for (let i = 0; i < lines.length; i++) {
    const stripped = clean(lines[i]);
    const trimmed = stripped.trim();
    if (!trimmed) continue;
    // Top-level keys are at indent 0
    if (getIndent(stripped) !== 0) continue;
    const m = trimmed.match(/^(?:on|"on"|'on')\s*:\s*(.*)$/);
    if (!m) continue;
    const tail = m[1].trim();

    // Inline forms: `on: push` or `on: [push, pull_request]`
    if (tail) {
      const parts = tail.startsWith('[')
        ? tail.replace(/^\[|\]$/g, '').split(',')
        : [tail];
      for (const part of parts) {
        const name = unquote(part);
        if (name) triggers.add(name);
      }
      return triggers;
    }

    // Block list or block mapping. Only collect entries at the FIRST
    // nested indent — anything deeper is a sub-property of the trigger
    // (e.g. `branches:`, `types:`), not a new trigger.
    let triggerIndent = null;
    for (let j = i + 1; j < lines.length; j++) {
      const sj = clean(lines[j]);
      const tj = sj.trim();
      if (!tj) continue;
      const indent = getIndent(sj);
      if (indent === 0) break; // back to a top-level key
      if (triggerIndent === null) triggerIndent = indent;
      if (indent > triggerIndent) continue; // sub-property of a trigger
      // List item: `- push`
      if (tj.startsWith('- ')) {
        const name = unquote(tj.slice(2));
        if (name) triggers.add(name);
        continue;
      }
      // Mapping key: `push:` or `pull_request_target:`
      const tm = tj.match(TRIGGER_RE);
      if (tm) triggers.add(tm[1]);
    }
    return triggers;
  }
  return triggers;
}

/**
 * Walk the workflow text line by line and emit one event per
 * `${{ <expr> }}` occurrence, tagged with the parent context (`run`,
 * `if`, `with`, `env`, …) and a flag indicating whether the expression
 * appeared inside a block-scalar body (e.g. under `run: |`).
 *
 * Notes on behavior:
 *   - Both LF and CRLF line endings are accepted.
 *   - Inside a block-scalar body `#` is literal text, not a YAML
 *     comment. Body lines that consist only of a `# ...` shell comment
 *     are therefore still scanned: GitHub Actions substitutes
 *     `${{ }}` before the shell sees the script, so an expression in a
 *     shell comment is still an injection sink.
 *   - Outside block scalars, lines that are blank or comment-only are
 *     skipped and do not affect the indentation stack.
 *
 * @param {string} text - Full workflow file contents.
 * @returns {{
 *   triggers: Set<string>,
 *   events: {
 *     line: number,
 *     column: number,
 *     expr: string,
 *     parent: string,
 *     parentChain: string[],
 *     blockScalar: boolean,
 *   }[],
 * }}
 */
export function parseWorkflow(text) {
  const lines = text.split(/\r?\n/);
  const triggers = extractTriggers(lines);
  const events = [];
  /** @type {{indent: number, key: string, isBlockScalar: boolean}[]} */
  const stack = [];

  // Emit an event for every expression on line `idx`. Each event gets
  // its own copy of the chain so consumers can mutate it safely.
  const record = (idx, parent, chain, blockScalar) => {
    for (const e of findExpressions(lines[idx], idx + 1)) {
      events.push({ ...e, parent, parentChain: [...chain], blockScalar });
    }
  };

  for (let i = 0; i < lines.length; i++) {
    const raw = lines[i];
    const stripped = stripComments(raw);
    const trimmed = stripped.trim();

    if (!trimmed) {
      // Blank, or nothing but `# ...`. A `#` line indented deeper than
      // an open block-scalar opener belongs to the scalar body and must
      // be scanned; anywhere else it is a YAML comment and is ignored
      // without touching the stack.
      const open = stack[stack.length - 1];
      if (raw.trim() && open && open.isBlockScalar && getIndent(raw) > open.indent) {
        record(i, open.key, stack.map((s) => s.key), true);
      }
      continue;
    }

    const indent = getIndent(stripped);

    // Pop frames whose indent >= current indent. Block-scalar frames
    // are popped when we leave the scalar body (indent shallower).
    while (stack.length && stack[stack.length - 1].indent >= indent) {
      stack.pop();
    }

    const top = stack.length ? stack[stack.length - 1] : null;
    const chain = stack.map((s) => s.key);

    // Inside a block-scalar body? Body lines have indent strictly
    // greater than the opener; the opener frame is on top of the stack.
    if (top && top.isBlockScalar) {
      record(i, top.key, chain, true);
      continue;
    }

    // `<key>: <value>` or list item `- <key>: <value>`
    const kv = trimmed.match(KV_RE);
    const keyed = kv || trimmed.match(LIST_KV_RE);
    if (keyed) {
      const [, key, value] = keyed;
      const isBlock = BLOCK_SCALAR_VALUES.has(value);
      if (!isBlock && value) record(i, key, [...chain, key], false);
      // List items create a deeper synthetic indent so subsequent
      // sibling keys at the same column still resolve to this item.
      stack.push({ indent: kv ? indent : indent + 2, key, isBlockScalar: isBlock });
      continue;
    }

    // Plain list item `- something` — no key. Still scan for ${{ ... }}
    // (rare but possible) and tag with the enclosing parent.
    if (trimmed.startsWith('- ')) {
      record(i, top ? top.key : '', chain, false);
    }
  }

  return { triggers, events };
}