// workflow-yaml-state.mjs — Line-based YAML state machine for E11
// (workflow-scanner). Zero dependencies. Tracks indentation, parent
// context, and `run:` block-scalar entry/exit so the scanner can
// distinguish injection sinks (`run:`) from sink-mismatch contexts
// (`if:`, `env:`, `with:`).
//
// Why hand-roll instead of importing a YAML library:
//   - Zero-dependency invariant (CLAUDE.md)
//   - Workflows live in `.github/workflows/` and `.forgejo/workflows/`,
//     have a constrained shape (top-level `on:`, `jobs:`, with each
//     job a mapping of {steps, env, …}). A line-based state machine
//     captures everything we need without a full YAML parser.
//
// Out of scope:
//   - Anchors / aliases (treated as no-op; rarely used in workflows)
//   - Multi-line flow scalars spanning lines via `... \n ...`
//   - Full `${{ }}` AST (we extract substring text only)

const EXPR_RE = /\$\{\{\s*([\s\S]+?)\s*\}\}/g;
const KV_RE = /^([A-Za-z_][\w-]*)\s*:\s*(.*)$/;
// Group 1 is the `- ` prefix (dash plus padding) so the real column of
// the key can be computed; group 2 is the key, group 3 the value.
const LIST_KV_RE = /^(-\s+)([A-Za-z_][\w-]*)\s*:\s*(.*)$/;
const TRIGGER_RE = /^([a-z_]+)(?::|$)/;
// Top-level `on:` key. Also accepts the quoted spellings `"on":` and
// `'on':`, which some authors use to stop YAML 1.1 parsers from reading
// the bare word `on` as a boolean.
const ON_KEY_RE = /^(?:on|"on"|'on')\s*:\s*(.*)$/;
// Block-scalar header: `|` or `>` followed by an optional chomping
// indicator (`+` / `-`) and an optional indentation indicator (1-9),
// in either order — e.g. `|`, `>-`, `|2`, `|2-`, `>+1`.
const BLOCK_SCALAR_RE = /^[|>](?:[+-][1-9]?|[1-9][+-]?)?$/;

/** Trim a scalar and drop one leading and one trailing quote character. */
function unquote(text) {
  return text.trim().replace(/^["']|["']$/g, '');
}

/**
 * Strip a trailing YAML comment: everything from the first `#` that is
 * at the start of the line or preceded by whitespace. A `#` inside a
 * complete `${{ … }}` expression is preserved. Quoted strings are NOT
 * tracked; an over-eager strip is acceptable since we never write the
 * stripped text back.
 *
 * @param {string} line
 * @returns {string} the line up to (not including) the comment marker
 */
function stripComments(line) {
  for (let i = 0; i < line.length; i++) {
    if (line.startsWith('${{', i)) {
      const close = line.indexOf('}}', i + 3);
      if (close !== -1) {
        // Jump to the last `}`; the loop increment steps past it.
        i = close + 1;
        continue;
      }
    }
    if (line[i] === '#' && (i === 0 || /\s/.test(line[i - 1]))) {
      return line.slice(0, i);
    }
  }
  return line;
}

/** Count leading spaces. YAML disallows tabs in indent, so we treat them as 1. */
function getIndent(line) {
  let i = 0;
  while (i < line.length && (line[i] === ' ' || line[i] === '\t')) i++;
  return i;
}

/**
 * Extract `${{ }}` substrings with line/column metadata.
 *
 * @param {string} rawLine - the unmodified source line
 * @param {number} lineNum - 1-based line number to stamp on each hit
 * @returns {{line: number, column: number, expr: string}[]} one entry per
 *   expression; `column` is the 1-based position of the `$`
 */
function findExpressions(rawLine, lineNum) {
  const out = [];
  // EXPR_RE is a shared /g regex, so its cursor must be reset per call.
  EXPR_RE.lastIndex = 0;
  let m;
  while ((m = EXPR_RE.exec(rawLine)) !== null) {
    out.push({
      line: lineNum,
      column: m.index + 1,
      expr: m[1].trim(),
    });
  }
  return out;
}

/**
 * Extract the set of triggers declared by top-level `on:`. Handles all
 * four common forms (string, inline-list, block-list, block-mapping).
 * Only the first top-level `on:` key is considered. Lines may carry a
 * trailing `\r` (CRLF input); it is ignored.
 *
 * @param {string[]} lines
 * @returns {Set<string>}
 */
export function extractTriggers(lines) {
  const triggers = new Set();

  for (let i = 0; i < lines.length; i++) {
    // trimEnd() drops a trailing `\r`, which `.` would refuse to match
    // and which would make ON_KEY_RE fail on inline forms.
    const stripped = stripComments(lines[i]).trimEnd();
    if (!stripped.trim()) continue;

    // Top-level keys are at indent 0
    if (getIndent(stripped) !== 0) continue;

    const m = stripped.match(ON_KEY_RE);
    if (!m) continue;

    const tail = m[1].trim();

    // Form 1: `on: push` or `on: [push, pull_request]`
    if (tail) {
      if (tail.startsWith('[')) {
        const inner = tail.replace(/^\[|\]$/g, '');
        for (const part of inner.split(',')) {
          const name = unquote(part);
          if (name) triggers.add(name);
        }
      } else {
        const name = unquote(tail);
        if (name) triggers.add(name);
      }
      return triggers;
    }

    // Form 2/3: block list or block mapping. Only collect entries at
    // the FIRST nested indent — anything deeper is a sub-property of
    // the trigger (e.g. `branches:`, `types:`), not a new trigger.
    let triggerIndent = null;
    for (let j = i + 1; j < lines.length; j++) {
      const sj = stripComments(lines[j]);
      const tj = sj.trim();
      if (!tj) continue;

      const indent = getIndent(sj);
      if (indent === 0) break; // back to top-level key
      if (triggerIndent === null) triggerIndent = indent;
      if (indent > triggerIndent) continue; // sub-property of a trigger

      // List item: `- push`
      if (tj.startsWith('- ')) {
        const name = unquote(tj.slice(2));
        if (name) triggers.add(name);
        continue;
      }

      // Mapping key: `push:` or `pull_request_target:`
      const tm = tj.match(TRIGGER_RE);
      if (tm) triggers.add(tm[1]);
    }
    return triggers;
  }

  return triggers;
}

/**
 * Walk the workflow text line-by-line and emit `${{ }}` events
 * tagged with the parent context (`run`, `if`, `with`, `env`, …) and
 * a flag indicating whether the expression appeared inside a
 * block-scalar body (e.g. the body of `run: |`).
 *
 * Accepts both LF and CRLF line endings.
 *
 * @param {string} text
 * @returns {{
 *   triggers: Set<string>,
 *   events: {
 *     line: number,
 *     column: number,
 *     expr: string,
 *     parent: string,
 *     parentChain: string[],
 *     blockScalar: boolean,
 *   }[],
 * }}
 */
export function parseWorkflow(text) {
  const lines = text.split(/\r?\n/);
  const triggers = extractTriggers(lines);
  const events = [];

  /** @type {{indent: number, key: string, isBlockScalar: boolean}[]} */
  const stack = [];

  const chainOf = () => stack.map((frame) => frame.key);
  const record = (exprs, parent, parentChain, blockScalar) => {
    for (const e of exprs) {
      // Copy the chain so events never share one array instance.
      events.push({ ...e, parent, parentChain: [...parentChain], blockScalar });
    }
  };

  for (let i = 0; i < lines.length; i++) {
    const raw = lines[i];
    const lineNum = i + 1;

    // Inside a block-scalar body? Body lines have indent strictly
    // greater than the opener frame, which is always on top of the
    // stack. This is decided on the RAW line: `#` inside a scalar body
    // is content (e.g. a shell comment), not a YAML comment, and
    // `${{ }}` is substituted there before the shell ever runs.
    const open = stack.length ? stack[stack.length - 1] : null;
    if (open && open.isBlockScalar && raw.trim() && getIndent(raw) > open.indent) {
      record(findExpressions(raw, lineNum), open.key, chainOf(), true);
      continue;
    }

    const stripped = stripComments(raw);
    const trimmed = stripped.trim();
    if (!trimmed) continue;

    const indent = getIndent(stripped);

    // Pop frames whose indent >= current indent. Block-scalar frames
    // are popped here when we leave the scalar body (indent shallower).
    while (stack.length && stack[stack.length - 1].indent >= indent) {
      stack.pop();
    }
    const top = stack.length ? stack[stack.length - 1] : null;

    // Try `<key>: <value>` first
    const kv = trimmed.match(KV_RE);
    if (kv) {
      const key = kv[1];
      const value = kv[2];
      const isBlock = BLOCK_SCALAR_RE.test(value);
      if (!isBlock && value) {
        record(findExpressions(raw, lineNum), key, [...chainOf(), key], false);
      }
      stack.push({ indent, key, isBlockScalar: isBlock });
      continue;
    }

    // List item: `- <key>: <value>` or just `- <key>:`
    const lkv = trimmed.match(LIST_KV_RE);
    if (lkv) {
      const dashPrefix = lkv[1];
      const key = lkv[2];
      const value = lkv[3];
      const isBlock = BLOCK_SCALAR_RE.test(value);
      if (!isBlock && value) {
        record(findExpressions(raw, lineNum), key, [...chainOf(), key], false);
      }
      // The frame sits at the column of the key itself (dash + padding),
      // so sibling keys of the same item at that column replace it
      // instead of nesting under it.
      stack.push({ indent: indent + dashPrefix.length, key, isBlockScalar: isBlock });
      continue;
    }

    // Plain list item `- something` — no key. Still scan for ${{ ... }}
    // (rare but possible) and tag with the enclosing parent.
    if (trimmed.startsWith('- ')) {
      record(findExpressions(raw, lineNum), top ? top.key : '', chainOf(), false);
      continue;
    }
  }

  return { triggers, events };
}