feat(workflow-scanner): E11 part 1 — core file-walk + 23-field blacklist + sink-restriction

Adds a deterministic GitHub Actions / Forgejo Actions injection
scanner. Detects ${{ <dangerous-field> }} interpolations inside
`run:` step blocks under privileged or semi-privileged triggers.
Sink-restricted: `if:` / `with:` / `env:` (block-level) are
evaluated by the runner expression engine, not the shell, so they
are NOT injection sinks and are suppressed at parser level.

Why: workflow expression injection is the most prevalent SAST class
on GitHub (CodeQL preview: 800K+ findings across 158K repos). The
graduated severity matrix (HIGH for pull_request_target / discussion
/ workflow_run; MEDIUM for pull_request / workflow_dispatch) is the
community-converged calibration target — uniform HIGH causes alert
fatigue.

Components:
- scanners/lib/workflow-yaml-state.mjs — line-based YAML state
  machine. Tracks indentation, parent-context stack, and
  `run: |` / `run: >` block-scalar entry/exit. Zero deps.
- scanners/workflow-scanner.mjs — discoverWorkflows() probes
  .github/workflows/ and .forgejo/workflows/ directly (file-discovery
  has no glob include). 23-field blacklist (GHSL 17 + 6 GlueStack-
  class additions). Platform encoded via file path; no schema
  extension to finding(). Forgejo-specific: workflow_run advisory
  emitted to stderr; recommendation text mentions Forgejo's
  server-level token scoping (job-level permissions: is ignored).
- knowledge/workflow-injection-patterns.md — 23-field blacklist,
  trigger taxonomy, severity matrix, Forgejo divergences, NVD CVE
  corpus.

Tests (47 new):
- tests/lib/workflow-yaml-state.test.mjs (15): trigger forms
  (string / inline-list / block-list / block-mapping), single-line
  run, block-scalar | and > tracking, env/with sink-mismatch,
  multi-line, comment stripping, line-number accuracy.
- tests/scanners/workflow-scanner.test.mjs (14): TP head_ref
  pull_request_target, TP discussion.title gluestack pattern,
  TP comment.body pull_request, TP issue.body block-scalar,
  FP if-context, FP env-block, INFO numeric, Forgejo TP, Forgejo
  workflow_run advisory, envelope shape, WFL prefix.
- 9 fixtures in tests/fixtures/workflows/{.github,.forgejo}/workflows/.

Out of scope (B4 / Batch D):
- Re-interpolation detection (env.VAR after env: from blacklisted source)
- github.actor authorization-bypass category
- WFL prefix in severity.mjs OWASP maps + scan-orchestrator
  registration (B4)
- Composite-action input tracing, GITHUB_ENV poisoning (Batch D)

Test count: 1685 → 1732 (+47). Pre-compact-scan flake unchanged
(passes in isolation).
This commit is contained in:
Kjell Tore Guttormsen 2026-04-30 15:48:48 +02:00
commit c31d4b1718
14 changed files with 1167 additions and 0 deletions

View file

@ -0,0 +1,228 @@
// workflow-yaml-state.mjs — Line-based YAML state machine for E11
// (workflow-scanner). Zero dependencies. Tracks indentation, parent
// context, and `run:` block-scalar entry/exit so the scanner can
// distinguish injection sinks (`run:`) from sink-mismatch contexts
// (`if:`, `env:`, `with:`).
//
// Why hand-roll instead of importing a YAML library:
// - Zero-dependency invariant (CLAUDE.md)
// - Workflows live in `.github/workflows/` and `.forgejo/workflows/`,
// have a constrained shape (top-level `on:`, `jobs:`, with each
// job a mapping of {steps, env, …}). A line-based state machine
// captures everything we need without a full YAML parser.
//
// Out of scope:
// - Anchors / aliases (treated as no-op; rarely used in workflows)
// - Multi-line flow scalars spanning lines via `... \n ...`
// - Full `${{ <expr> }}` AST (we extract substring text only)
// Matches one `${{ <expr> }}` interpolation; capture group 1 is the inner
// expression text. Carries the `g` flag, so `lastIndex` is stateful and must
// be reset before each scan of a new string.
const EXPR_RE = /\$\{\{\s*([\s\S]+?)\s*\}\}/g;
// `<key>: <value>` anchored at both ends (group 1 = key, group 2 = value).
const KV_RE = /^([A-Za-z_][\w-]*)\s*:\s*(.*)$/;
// Same shape as KV_RE, but for a list item: `- <key>: <value>`.
const LIST_KV_RE = /^-\s+([A-Za-z_][\w-]*)\s*:\s*(.*)$/;
// A lower-case/underscore name followed by `:` or end of string (group 1 = name).
const TRIGGER_RE = /^([a-z_]+)(?::|$)/;
// Values that open a YAML block scalar: literal `|` or folded `>`, optionally
// with a chomping indicator (`-` / `+`).
const BLOCK_SCALAR_VALUES = new Set(['|', '>', '|-', '>-', '|+', '>+']);
/**
 * Strip a trailing YAML comment from a single line.
 *
 * A `#` starts a comment only when it is at the start of the line or is
 * preceded by whitespace, AND it is outside a balanced quoted string and
 * outside a `${{ ... }}` expression. The whitespace character directly in
 * front of the `#` is removed together with the comment.
 *
 * If the line ends with an unterminated quote or expression (for example an
 * apostrophe in plain text such as `don't`), quoting cannot be trusted, so
 * the function falls back to cutting at the first whitespace-preceded `#`.
 * An over-eager strip is acceptable since the stripped text is never
 * written back.
 *
 * @param {string} line  One line of workflow text (no newline).
 * @returns {string} The line with any trailing comment removed.
 */
function stripComments(line) {
  let quote = ''; // the quote character currently open, or '' when none
  let inExpr = false; // true while between `${{` and `}}`
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inExpr) {
      if (ch === '}' && line[i + 1] === '}') {
        inExpr = false;
        i++;
      }
      continue;
    }
    if (quote) {
      if (quote === '"' && ch === '\\') {
        i++; // backslash escapes the next character inside double quotes
      } else if (ch === quote) {
        quote = '';
      }
      continue;
    }
    if (ch === '$' && line.startsWith('${{', i)) {
      inExpr = true;
      i += 2;
      continue;
    }
    if (ch === '"' || ch === "'") {
      quote = ch;
      continue;
    }
    if (ch === '#' && (i === 0 || /\s/.test(line[i - 1]))) {
      // Drop the comment and the single whitespace character before it.
      return line.slice(0, Math.max(i - 1, 0));
    }
  }
  // Unbalanced quote/expression: quoting was not reliable, use the simple rule.
  if (quote || inExpr) return line.replace(/(^|\s)#.*$/, '');
  return line;
}
/**
 * Width of the leading whitespace run of a line. Spaces and tabs each count
 * as one column (YAML disallows tabs in indentation, so no tab expansion).
 *
 * @param {string} line
 * @returns {number}
 */
function getIndent(line) {
  // The pattern can match the empty string, so exec() never returns null.
  return /^[ \t]*/.exec(line)[0].length;
}
/**
 * Collect every `${{ <expr> }}` occurrence on one line.
 *
 * @param {string} rawLine  Line text to search.
 * @param {number} lineNum  Line number recorded on each result.
 * @returns {{line: number, column: number, expr: string}[]} One entry per
 *   match, in order; `column` is 1-based and `expr` is the trimmed inner text.
 */
function findExpressions(rawLine, lineNum) {
  // EXPR_RE is a shared global-flag regex: always start from offset 0.
  EXPR_RE.lastIndex = 0;
  return Array.from(rawLine.matchAll(EXPR_RE), (hit) => ({
    line: lineNum,
    column: hit.index + 1,
    expr: hit[1].trim(),
  }));
}
/**
 * Extract the set of triggers declared by the top-level `on:` key.
 *
 * Supported forms:
 *   - string:        `on: push`
 *   - inline list:   `on: [push, pull_request]`
 *   - block list:    `on:` followed by `- push` items (indented or not)
 *   - block mapping: `on:` followed by `push:` / `pull_request_target:` keys
 *
 * The key may be written bare or quoted (`"on":` / `'on':`). Only the DIRECT
 * children of `on:` are treated as triggers; deeper lines (`branches:`,
 * `types:`, `- main`, workflow_dispatch input names, …) are trigger
 * configuration and are ignored. Inline flow mappings (`on: {push: {}}`)
 * are not parsed.
 *
 * @param {string[]} lines  Workflow text split on `\n` (a trailing `\r` per
 *   line is tolerated).
 * @returns {Set<string>} Trigger names; empty when no `on:` key is found.
 */
export function extractTriggers(lines) {
  const triggers = new Set();
  const unquote = (s) => s.trim().replace(/^["']|["']$/g, '');

  for (let i = 0; i < lines.length; i++) {
    const stripped = stripComments(lines[i]);
    const trimmed = stripped.trim();
    if (!trimmed) continue;
    // Top-level keys are at indent 0.
    if (getIndent(stripped) !== 0) continue;
    // Match on the trimmed text so a trailing `\r` (CRLF files) cannot
    // defeat the `$` anchor.
    const m = trimmed.match(/^(?:on|"on"|'on')\s*:\s*(.*)$/);
    if (!m) continue;

    const tail = m[1].trim();
    // Inline forms: `on: push` or `on: [push, pull_request]`.
    if (tail) {
      if (tail.startsWith('[')) {
        for (const part of tail.replace(/^\[|\]$/g, '').split(',')) {
          const name = unquote(part);
          if (name) triggers.add(name);
        }
      } else {
        const name = unquote(tail);
        if (name) triggers.add(name);
      }
      return triggers;
    }

    // Block forms. `childIndent` is fixed by the first child line; anything
    // indented differently belongs to a trigger's own configuration.
    let childIndent = -1;
    for (let j = i + 1; j < lines.length; j++) {
      const sj = stripComments(lines[j]);
      const tj = sj.trim();
      if (!tj) continue;
      const indent = getIndent(sj);
      const isListItem = tj.startsWith('- ');
      // An indent-0 line ends the `on:` block unless it is an item of a
      // zero-indented block list (`on:` / `- push`).
      if (indent === 0 && !(isListItem && childIndent <= 0)) break;
      if (childIndent === -1) childIndent = indent;
      if (indent !== childIndent) continue;
      if (isListItem) {
        const name = unquote(tj.slice(2));
        if (name) triggers.add(name);
        continue;
      }
      // Mapping key: `push:` or `pull_request_target:`.
      const tm = tj.match(TRIGGER_RE);
      if (tm) triggers.add(tm[1]);
    }
    return triggers;
  }
  return triggers;
}
/**
 * Walk the workflow text line-by-line and emit `${{ <expr> }}` events
 * tagged with the parent context (`run`, `if`, `with`, `env`, …) and
 * a flag indicating whether the expression appeared inside a block-scalar
 * body (e.g. under `run: |`).
 *
 * Inside a block-scalar body `#` is script text, not a YAML comment: the
 * runner substitutes `${{ }}` before the shell sees the line, so body lines
 * are scanned raw, including lines that consist only of a shell comment.
 *
 * @param {string} text  Full workflow file contents.
 * @returns {{
 *   triggers: Set<string>,
 *   events: {
 *     line: number,
 *     column: number,
 *     expr: string,
 *     parent: string,
 *     parentChain: string[],
 *     blockScalar: boolean,
 *   }[],
 * }}
 */
export function parseWorkflow(text) {
  const lines = text.split('\n');
  const triggers = extractTriggers(lines);
  const events = [];
  /** @type {{indent: number, key: string, isBlockScalar: boolean}[]} */
  const stack = [];

  // Record one event per expression; each event gets its own chain array.
  const emit = (exprs, parent, chain, blockScalar) => {
    for (const e of exprs) {
      events.push({ ...e, parent, parentChain: [...chain], blockScalar });
    }
  };
  const keysOnStack = () => stack.map((s) => s.key);

  for (let i = 0; i < lines.length; i++) {
    const raw = lines[i];
    // Blank lines carry no expressions and never close a block scalar.
    if (!raw.trim()) continue;

    // Block-scalar body: decided on the RAW line, before comment stripping.
    // A block-scalar frame is always the top of the stack (nothing is pushed
    // while inside a body), and body lines are indented deeper than it.
    const open = stack.length ? stack[stack.length - 1] : null;
    if (open && open.isBlockScalar && getIndent(raw) > open.indent) {
      emit(findExpressions(raw, i + 1), open.key, keysOnStack(), true);
      continue;
    }

    const stripped = stripComments(raw);
    const trimmed = stripped.trim();
    if (!trimmed) continue;
    const indent = getIndent(stripped);

    // Pop frames whose indent >= current indent (siblings and deeper levels,
    // including a block-scalar opener whose body has just ended).
    while (stack.length && stack[stack.length - 1].indent >= indent) {
      stack.pop();
    }
    const top = stack.length ? stack[stack.length - 1] : null;

    // `<key>: <value>` or list item `- <key>: <value>`. KV_RE cannot match a
    // line starting with `-`, so the two patterns are mutually exclusive.
    const kv = trimmed.match(KV_RE);
    const hit = kv || trimmed.match(LIST_KV_RE);
    if (hit) {
      const key = hit[1];
      const value = hit[2];
      const isBlock = BLOCK_SCALAR_VALUES.has(value);
      if (!isBlock && value) {
        emit(findExpressions(raw, i + 1), key, [...keysOnStack(), key], false);
      }
      // List items get a deeper synthetic indent so subsequent sibling keys
      // (aligned two columns right of the `-`) are not treated as children.
      stack.push({ indent: kv ? indent : indent + 2, key, isBlockScalar: isBlock });
      continue;
    }

    // Plain list item `- something` — no key. Still scan for ${{ ... }}
    // (rare but possible) and tag with the enclosing parent.
    if (trimmed.startsWith('- ')) {
      emit(findExpressions(raw, i + 1), top ? top.key : '', keysOnStack(), false);
    }
  }
  return { triggers, events };
}

View file

@ -0,0 +1,330 @@
// workflow-scanner.mjs — E11 GitHub/Forgejo Actions injection scanner
// Detects `${{ <dangerous-field> }}` interpolations inside `run:` step
// blocks under privileged triggers. Sink-restricted (only `run:` is a
// shell sink — `if:`/`with:`/`env:` are evaluated by the runner's
// expression engine, not the shell, so they are NOT injection sinks).
//
// Discovery: explicitly probes `<target>/.github/workflows/` and
// `<target>/.forgejo/workflows/`. discoverFiles() (file-discovery.mjs)
// does not support glob include patterns, so we walk the two
// directories directly via node:fs/promises.
//
// Knowledge: knowledge/workflow-injection-patterns.md (23-field
// blacklist + severity matrix + Forgejo divergences).
//
// Out of scope (deferred):
// - Composite-action input tracing
// - Reusable-workflow call analysis
// - GITHUB_ENV poisoning detection
// - Zombie-workflow scanning across non-default branches
//
// Zero external dependencies.
import { readdir, readFile, stat } from 'node:fs/promises';
import { join, relative, basename } from 'node:path';
import { existsSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { dirname } from 'node:path';
import { finding, scannerResult } from './lib/output.mjs';
import { SEVERITY } from './lib/severity.mjs';
import { parseWorkflow } from './lib/workflow-yaml-state.mjs';
// Directory containing this module (ESM stand-in for CommonJS `__dirname`).
// NOTE(review): not referenced anywhere in the visible part of this file —
// confirm it is still needed.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Upper bound on the number of workflow files returned by discoverWorkflows().
const MAX_FILES = 100;
// Files larger than this many bytes (256 KiB) are skipped by scanFile().
const MAX_FILE_SIZE = 256 * 1024;
// Scanner name passed to scannerResult().
const SCANNER_NAME = 'workflow';
// Value passed as the `scanner` field of every finding.
const SCANNER_PREFIX = 'WFL';
// ---------------------------------------------------------------------------
// 23-field canonical blacklist (GHSL Security Lab 17 + 6 GlueStack-class
// additions per research/01-github-forgejo-actions-injection.md). Stored
// as patterns matching the inner expression after `${{ ` and before ` }}`.
// All patterns match BOTH `github.*` and `forgejo.*` prefixes.
// ---------------------------------------------------------------------------
// Context prefix: the same fields are reachable as `github.*` and `forgejo.*`.
const PREFIX = '(?:github|forgejo)';
// Regex-source fragments, one per blacklisted field. A `\.\*` segment is a
// placeholder for "any element of an array" and is expanded when
// DANGEROUS_RE is built below.
const DANGEROUS_FIELDS = [
  // GHSL 17
  `${PREFIX}\\.event\\.issue\\.title`,
  `${PREFIX}\\.event\\.issue\\.body`,
  `${PREFIX}\\.event\\.pull_request\\.title`,
  `${PREFIX}\\.event\\.pull_request\\.body`,
  `${PREFIX}\\.event\\.pull_request\\.head\\.ref`,
  `${PREFIX}\\.event\\.pull_request\\.head\\.label`,
  `${PREFIX}\\.event\\.pull_request\\.head\\.repo\\.default_branch`,
  `${PREFIX}\\.event\\.comment\\.body`,
  `${PREFIX}\\.event\\.review\\.body`,
  `${PREFIX}\\.event\\.commits\\.\\*\\.message`,
  `${PREFIX}\\.event\\.commits\\.\\*\\.author\\.email`,
  `${PREFIX}\\.event\\.commits\\.\\*\\.author\\.name`,
  `${PREFIX}\\.event\\.head_commit\\.message`,
  `${PREFIX}\\.event\\.head_commit\\.author\\.email`,
  `${PREFIX}\\.event\\.head_commit\\.author\\.name`,
  `${PREFIX}\\.event\\.pages\\.\\*\\.page_name`,
  `${PREFIX}\\.head_ref`,
  // GlueStack-class additions
  `${PREFIX}\\.event\\.discussion\\.title`,
  `${PREFIX}\\.event\\.discussion\\.body`,
  `${PREFIX}\\.event\\.discussion\\.user\\.login`,
  `${PREFIX}\\.event\\.inputs\\.[\\w-]+`,
  `${PREFIX}\\.event\\.client_payload\\.[\\w-]+`,
  `inputs\\.[\\w-]+`,
];
// Unanchored on purpose: a dangerous field nested inside a larger expression
// (e.g. `format('{0}', github.head_ref)`) must still match.
// The array placeholder accepts both spellings seen in workflows: a dotted
// segment (`commits.*.message`) and an index (`commits[0].message`).
const DANGEROUS_RE = new RegExp(
  '(?:' +
    DANGEROUS_FIELDS
      .map((p) => p.replace(/\\\.\\\*/g, '(?:\\.[^.]+|\\[[^\\]]+\\])'))
      .join('|') +
    ')',
);
// Fields whose values are numeric, hex, or otherwise fixed-shape. They are
// reported at INFO level and never treated as injection sinks. The pattern is
// anchored at both ends, so only an expression consisting of exactly one of
// these fields qualifies.
const SAFE_FIELDS_RE = new RegExp(
  `^(?:${[
    `${PREFIX}\\.event\\.pull_request\\.number`,
    `${PREFIX}\\.event\\.pull_request\\.head\\.sha`,
    `${PREFIX}\\.run_id`,
    `${PREFIX}\\.run_number`,
    `${PREFIX}\\.sha`,
    `${PREFIX}\\.event\\.action`,
    `${PREFIX}\\.event\\.repository\\.full_name`,
  ].join('|')})$`,
);
// Triggers that grant attacker-controlled context with elevated
// privileges (read/write tokens). Any one of these on a workflow makes
// severityFor() return HIGH for a dangerous field.
// NOTE(review): `issues` is not listed although `issue_comment` is — confirm
// against the severity matrix in knowledge/workflow-injection-patterns.md.
const PRIVILEGED_TRIGGERS = new Set([
'pull_request_target',
'issue_comment',
'discussion',
'discussion_comment',
'workflow_run',
]);
// Triggers where attacker can supply input but token is read-only or
// scoped (still an injection sink, just lower severity).
const SEMI_PRIVILEGED_TRIGGERS = new Set([
'pull_request',
'workflow_dispatch',
'repository_dispatch',
]);
// Sink contexts that ARE shell: only events whose immediate parent key is in
// this set are reported by scanFile().
const SINK_PARENTS = new Set(['run']);
// Contexts where ${{ ... }} is evaluated by the runner expression
// engine, NOT the shell. These are sink mismatches, not injection.
const NON_SINK_PARENTS = new Set(['if', 'with', 'env', 'name', 'runs-on', 'timeout-minutes', 'continue-on-error']);
// ---------------------------------------------------------------------------
// Discovery
// ---------------------------------------------------------------------------
/**
 * Walk `<targetPath>/.github/workflows/` and `<targetPath>/.forgejo/workflows/`
 * one level deep and return the paths of regular `.yml` / `.yaml` files
 * (extension matched case-insensitively), combined and capped at MAX_FILES.
 *
 * Within each directory the files are sorted by name: readdir() makes no
 * ordering promise, and without the sort both the order of findings and the
 * choice of files that survive the cap would vary between runs/platforms.
 * The `.github` directory is always listed before `.forgejo`.
 *
 * @param {string} targetPath  Root of the repository to probe.
 * @returns {Promise<string[]>} `targetPath` joined with each workflow path;
 *   empty when neither directory exists or is readable.
 */
export async function discoverWorkflows(targetPath) {
  const out = [];
  const dirs = [
    join(targetPath, '.github', 'workflows'),
    join(targetPath, '.forgejo', 'workflows'),
  ];
  for (const dir of dirs) {
    if (!existsSync(dir)) continue;
    let entries;
    try {
      entries = await readdir(dir, { withFileTypes: true });
    } catch {
      // Best-effort probe: an unreadable directory is treated as absent.
      continue;
    }
    const names = entries
      .filter((entry) => entry.isFile() && /\.ya?ml$/i.test(entry.name))
      .map((entry) => entry.name)
      .sort();
    for (const name of names) {
      out.push(join(dir, name));
      if (out.length >= MAX_FILES) return out;
    }
  }
  return out;
}
// ---------------------------------------------------------------------------
// Severity matrix
// ---------------------------------------------------------------------------
/**
 * Map (triggerSet, fieldClass) → severity.
 *
 * - `safe` fields are always INFO.
 * - Anything that is not `dangerous` is suppressed (null).
 * - `dangerous` fields are HIGH when any declared trigger is privileged,
 *   otherwise MEDIUM — including workflows with no relevant trigger at all,
 *   since e.g. push events can still be reachable from forks via PRs.
 *
 * @param {Set<string>} triggers
 * @param {'dangerous'|'safe'|'other'} fieldClass
 * @returns {string|null} SEVERITY constant, or null = suppress
 */
function severityFor(triggers, fieldClass) {
  switch (fieldClass) {
    case 'safe':
      return SEVERITY.INFO;
    case 'dangerous':
      break;
    default:
      return null;
  }
  const declared = [...triggers];
  if (declared.some((name) => PRIVILEGED_TRIGGERS.has(name))) {
    return SEVERITY.HIGH;
  }
  if (declared.some((name) => SEMI_PRIVILEGED_TRIGGERS.has(name))) {
    return SEVERITY.MEDIUM;
  }
  // No relevant trigger → still flag at MEDIUM.
  return SEVERITY.MEDIUM;
}
/**
 * Classify the inner text of a `${{ }}` expression.
 *
 * The safe list is consulted first, so an expression that is exactly a safe
 * field is never reported as dangerous.
 *
 * @param {string} expr
 * @returns {'safe'|'dangerous'|'other'}
 */
function classifyField(expr) {
  const rules = [
    [SAFE_FIELDS_RE, 'safe'],
    [DANGEROUS_RE, 'dangerous'],
  ];
  const matched = rules.find(([pattern]) => pattern.test(expr));
  return matched ? matched[1] : 'other';
}
// ---------------------------------------------------------------------------
// Platform detection (filename-based; keeps schema unchanged)
// ---------------------------------------------------------------------------
/**
 * Infer the CI platform from the directory a workflow file lives in.
 *
 * The path is normalised before matching so that detection also works for
 * Windows-style separators (`C:\repo\.forgejo\workflows\ci.yml`) and for
 * relative paths that start directly with the marker directory
 * (`.forgejo/workflows/ci.yml`, as produced when the scan target is `.`).
 *
 * @param {string} absPath  Path of the workflow file.
 * @returns {'forgejo'|'github'|'unknown'}
 */
function detectPlatform(absPath) {
  // Leading `/` lets the `/.<dir>/workflows/` markers match at position 0.
  const normalized = `/${absPath.replace(/\\/g, '/')}`;
  if (normalized.includes('/.forgejo/workflows/')) return 'forgejo';
  if (normalized.includes('/.github/workflows/')) return 'github';
  return 'unknown';
}
// ---------------------------------------------------------------------------
// Recommendation text
// ---------------------------------------------------------------------------
/**
 * Assemble the remediation text attached to a finding.
 *
 * @param {string} platform  Platform name; `'forgejo'` appends a Forgejo note.
 * @param {string} parent    Key the expression was found under; `'run'`
 *   selects the shell-sink advice, anything else the generic advice.
 * @returns {string}
 */
function buildRecommendation(platform, parent) {
  const segments = [];
  if (parent === 'run') {
    segments.push(
      'Bind the expression to an env var first, then consume it via $VAR in the run script: `env: { TITLE: ${{ ... }} }; run: echo "$TITLE"`. Re-interpolating ${{ env.TITLE }} inside run: cancels the mitigation.',
    );
  } else {
    segments.push(
      'This expression is not a shell injection sink, but the underlying field is attacker-controlled. Review its downstream use.',
    );
  }
  if (platform === 'forgejo') {
    segments.push(
      ' Forgejo note: job-level `permissions:` is ignored on Forgejo (admin-guide); rely on token scoping at server level instead.',
    );
  }
  return segments.join('');
}
// ---------------------------------------------------------------------------
// Scan one workflow file
// ---------------------------------------------------------------------------
/**
 * Scan a single workflow file and return its findings.
 *
 * Files that cannot be stat'ed or read, or that exceed MAX_FILE_SIZE, yield
 * an empty list. A parse failure is reported through `stderrLog` and also
 * yields an empty list. Only expressions whose immediate parent key is a
 * shell sink are reported.
 *
 * @param {string} absPath      Path of the workflow file.
 * @param {string} targetPath   Scan root; findings carry the path relative to it.
 * @param {(msg: string) => unknown} stderrLog  Sink for advisory messages.
 * @returns {Promise<object[]>} Findings built via finding().
 */
async function scanFile(absPath, targetPath, stderrLog) {
  const fileInfo = await stat(absPath).catch(() => null);
  if (!fileInfo || fileInfo.size > MAX_FILE_SIZE) return [];

  const source = await readFile(absPath, 'utf8').catch(() => null);
  if (source === null) return [];

  const relPath = relative(targetPath, absPath) || basename(absPath);
  const platform = detectPlatform(absPath);

  let parsed;
  try {
    parsed = parseWorkflow(source);
  } catch (err) {
    stderrLog(`[workflow-scanner] parse error in ${relPath}: ${err.message}\n`);
    return [];
  }
  const { triggers } = parsed;

  // Forgejo divergence advisory: `workflow_run` is not documented for
  // Forgejo. Emitted on stderr rather than as a finding, so the user knows
  // the severity matrix still treated the trigger as privileged.
  if (platform === 'forgejo' && triggers.has('workflow_run')) {
    stderrLog(
      `[workflow-scanner] ${relPath}: 'workflow_run' trigger is not documented for Forgejo Actions; ` +
        `severity logic still treats it as privileged. See knowledge/workflow-injection-patterns.md §Forgejo.\n`,
    );
  }

  // Per-file values shared by every finding.
  const platformLabel = platform === 'forgejo' ? 'Forgejo' : 'GitHub';
  const triggerList = [...triggers].join(', ') || 'unknown';

  const shellSinkEvents = parsed.events.filter(
    (ev) => !NON_SINK_PARENTS.has(ev.parent) && SINK_PARENTS.has(ev.parent),
  );

  const findings = [];
  for (const ev of shellSinkEvents) {
    const fieldClass = classifyField(ev.expr);
    if (fieldClass === 'other') continue;
    const severity = severityFor(triggers, fieldClass);
    if (!severity) continue;

    const title = severity === SEVERITY.INFO
      ? `Safe expression in ${platformLabel} workflow run:`
      : `Workflow injection: ${platformLabel} ${ev.expr} in run: under ${triggerList}`;
    const description =
      `${platformLabel} workflow at ${relPath} interpolates \${{ ${ev.expr} }} ` +
      `inside a run: step. Triggers: ${triggerList}. ` +
      `Field class: ${fieldClass}. Block scalar: ${ev.blockScalar}.`;

    findings.push(finding({
      scanner: SCANNER_PREFIX,
      severity,
      title,
      description,
      file: relPath,
      line: ev.line,
      evidence: `\${{ ${ev.expr} }}`,
      owasp: 'LLM02',
      recommendation: buildRecommendation(platform, ev.parent),
    }));
  }
  return findings;
}
// ---------------------------------------------------------------------------
// Public entry — orchestrator-compatible
// ---------------------------------------------------------------------------
/**
 * Scan a target path for workflow injection.
 *
 * Workflow files are discovered and scanned one after another. If anything
 * throws, the findings gathered so far are still returned, in an envelope
 * with status `'error'` and the error message attached.
 *
 * @param {string} targetPath
 * @param {object} [_discovery] Ignored — workflow-scanner does its own
 *   directory probe.
 * @returns {Promise<object>} scannerResult envelope
 */
export async function scan(targetPath, _discovery) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;
  const collected = [];
  let scannedCount = 0;
  const stderrLog = (msg) => process.stderr.write(msg);

  try {
    for (const workflowPath of await discoverWorkflows(targetPath)) {
      scannedCount += 1;
      collected.push(...(await scanFile(workflowPath, targetPath, stderrLog)));
    }
    return scannerResult(SCANNER_NAME, 'ok', collected, scannedCount, elapsedMs());
  } catch (err) {
    return scannerResult(SCANNER_NAME, 'error', collected, scannedCount, elapsedMs(), err.message);
  }
}
// ---------------------------------------------------------------------------
// CLI entry
// ---------------------------------------------------------------------------
// True when this module is the script handed to `node` (as opposed to being
// imported): compares argv[1] with this module's own file path.
const isDirectRun = process.argv[1] === fileURLToPath(import.meta.url);
if (isDirectRun) {
  const [, , target] = process.argv;
  if (!target) {
    console.error('Usage: node workflow-scanner.mjs <target-path>');
    process.exit(1);
  }
  // Print the result envelope as pretty-printed JSON on stdout.
  scan(target).then((result) => {
    process.stdout.write(JSON.stringify(result, null, 2) + '\n');
  });
}