// bash-normalize.mjs — Normalize bash parameter expansion evasion techniques.
//
// Attackers can evade command-name matching by inserting shell metacharacters
// that are transparent to bash but break regex patterns.
//
// This module strips these constructs from command names so that downstream
// pattern matching sees the canonical form.
//
// Exported as a shared module — used by pre-bash-destructive.mjs and
// pre-install-supply-chain.mjs.
//
// Pipeline (defense-in-depth layer above Claude Code 2.1.98+ harness fixes):
//   T1 — empty single quotes:     w''get / w$''get           -> wget
//   T2 — empty double quotes:     r""m / r$""m               -> rm
//   T3 — parameter expansion:     ${x} / ${FOO}              -> x / ''
//   T4 — backslash-between-words: c\u\r\l                    -> curl
//   T5 — IFS word-splitting:      rm${IFS}-rf${IFS}/         -> rm -rf /
//   T6 — ANSI-C numeric quoting:  $'\x72\x6d' / $'\162\155'  -> rm
//
// Execution order:
//   1. Strip empty single-quote pairs (T1) so c''u''rl -> curl before masking.
//   2. Decode ANSI-C numeric escapes inside $'...' (T6) before masking.
//   3. Mask remaining non-empty single-quoted regions. T3's ${...} sweep and
//      other transforms cannot rewrite their content, preserving literals
//      (false-positive probe: `echo '${IFS}'` stays untouched).
//   4. Run T5 (IFS) and T2/T3/T4 on the masked string.
//   5. Unmask.

// Sentinel that brackets a placeholder index while single-quoted text is masked.
const MASK = '\x00';

// Largest code point String.fromCodePoint accepts.
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode ANSI-C numeric escapes inside `$'...'` contexts.
 *
 * Shell treats $'\x72\x6d' as the bytes r and m. The numeric escape forms
 * bash accepts inside the wrapper are decoded:
 *   - \xH / \xHH              (one or two hex digits)
 *   - \NNN                    (one to three octal digits, kept to eight bits)
 *   - \uHHHH / \UHHHHHHHH     (one to four / one to eight hex digits)
 * Other escapes (\n, \t, ...) are left as written. The $'...' construct itself
 * is replaced with its decoded content (matching shell evaluation).
 *
 * @param {string} cmd - Command string, already stripped of empty '' pairs
 * @returns {string} Command with every $'...' replaced by its decoded content
 */
function decodeAnsiCHex(cmd) {
  return cmd.replace(/\$'([^']*)'/g, (_, content) =>
    content.replace(
      /\\(?:x([0-9a-fA-F]{1,2})|u([0-9a-fA-F]{1,4})|U([0-9a-fA-F]{1,8})|([0-7]{1,3}))/g,
      (escape, hex, shortUnicode, longUnicode, octal) => {
        const code =
          octal !== undefined
            ? Number.parseInt(octal, 8) & 0xff
            : Number.parseInt(hex || shortUnicode || longUnicode, 16);
        // NUL is dropped so decoded text can never forge a mask placeholder.
        if (code === 0) return '';
        // Out-of-range \U values would make fromCodePoint throw; keep them raw.
        if (code > MAX_CODE_POINT) return escape;
        return String.fromCodePoint(code);
      },
    ),
  );
}

/**
 * Mask non-empty single-quoted regions with placeholders. Empty '' is NOT
 * masked — T1 already stripped them in the previous pass.
 *
 * @param {string} cmd - Command string to mask
 * @returns {{ masked: string, placeholders: string[] }} The masked string and
 *   the original quoted regions, indexed by placeholder number
 */
function maskSingleQuoted(cmd) {
  const placeholders = [];
  const masked = cmd.replace(/'[^']+'/g, (match) => {
    placeholders.push(match);
    return `${MASK}${placeholders.length - 1}${MASK}`;
  });
  return { masked, placeholders };
}

/**
 * Restore the single-quoted regions hidden by maskSingleQuoted.
 *
 * A sentinel-wrapped index with no matching placeholder (it came from the
 * input, not from masking) is left exactly as found rather than being
 * replaced with the text "undefined".
 *
 * @param {string} str - Masked string after all transforms have run
 * @param {string[]} placeholders - Quoted regions captured during masking
 * @returns {string} String with placeholders swapped back for their originals
 */
function unmaskSingleQuoted(str, placeholders) {
  return str.replace(
    new RegExp(`${MASK}(\\d+)${MASK}`, 'g'),
    (match, idx) => {
      const index = Number.parseInt(idx, 10);
      return index < placeholders.length ? placeholders[index] : match;
    },
  );
}

/**
 * Normalize bash parameter expansion and quoting evasion in a command string.
 *
 * Strips / rewrites (T1-T6):
 * - T1 Empty single quotes: '' and $'' (e.g., w''get -> wget, r$''m -> rm)
 * - T2 Empty double quotes: "" and $"" (e.g., r""m -> rm, r$""m -> rm)
 * - T3 Single-char parameter expansion: ${x} -> x (c${u}rl -> curl)
 * - T3 Multi-char parameter expansion: ${FOO} -> '' (unknown value)
 * - T4 Backslash escapes between word chars, iteratively (c\u\r\l -> curl)
 * - T5 IFS word-splitting: ${IFS} / ${IFS:0:1} / $IFS -> ' '
 * - T6 ANSI-C hex / octal / unicode escapes inside $'...' -> decoded chars
 * - Backtick subshell with empty/whitespace content
 *
 * Does NOT rewrite:
 * - Quotes around arguments (only targets empty quotes that split command names)
 * - $VAR without braces (non-IFS; not an evasion pattern)
 * - Backslashes before non-word chars
 * - Content inside non-empty single-quoted regions
 *   (false-positive probe: `echo '${IFS}'` stays untouched)
 * - The `$` of `$$''` / `\$''` (PID expansion or escaped dollar, not a quote prefix)
 *
 * @param {string} cmd - Raw command string
 * @returns {string} Normalized command string; '' for falsy input. A truthy
 *   non-string input is returned as-is.
 */
export function normalizeBashExpansion(cmd) {
  if (!cmd || typeof cmd !== 'string') return cmd || '';

  // T1 — strip empty single-quote pairs first so adjacent-empty-quote evasion
  // (c''u''rl -> curl) resolves before single-quote masking runs. The `$` of an
  // empty ANSI-C string ($'') goes with it, unless that `$` is itself preceded
  // by `$` or a backslash (then it is not a quote prefix).
  let result = cmd.replace(/(?<![$\\])\$''|''/g, '');

  // T6 — decode ANSI-C escapes inside $'...' before masking treats it as a literal.
  result = decodeAnsiCHex(result);

  // Mask remaining non-empty single-quoted regions.
  const { masked, placeholders } = maskSingleQuoted(result);

  result = masked
    // T5 — IFS word-splitting. Runs before T2/T3/T4 so the canonical spaces
    // it emits feed into subsequent transforms.
    .replace(/\$\{IFS:0:1\}/g, ' ')
    .replace(/\$\{IFS\}/g, ' ')
    .replace(/\$IFS\b/g, ' ')
    // T2 Strip empty double quotes, including the $"" form: r""m -> rm
    .replace(/(?<![$\\])\$""|""/g, '')
    // T3 Single-char ${x} -> x (evasion: c${u}rl -> curl, assumes x=x)
    .replace(/\$\{(\w)\}/g, '$1')
    // T3 Multi-char ${ANYTHING} -> '' (unknown value, strip entirely)
    .replace(/\$\{[^}]*\}/g, '')
    // Strip backtick subshell with empty/whitespace content
    .replace(/`\s*`/g, '');

  // T4 — iteratively strip backslash between word chars. One pass cannot
  // reuse a char it just consumed, so c\u\r\l needs 2 passes.
  let prev;
  do {
    prev = result;
    result = result.replace(/(\w)\\(\w)/g, '$1$2');
  } while (result !== prev);

  return unmaskSingleQuoted(result, placeholders);
}