From f881cf9251b5554bd36c290f7e2fbc9b1dfa9063 Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Fri, 17 Apr 2026 14:29:02 +0200 Subject: [PATCH] fix(scanners): preserve single-quoted regions through bash-normalize pipeline Masks non-empty '...' content before T5/T2-T4 run so literal strings such as `echo '${IFS}'` are not rewritten. Empty '' pairs are stripped first so c''u''rl -> curl evasion keeps resolving. ANSI-C $'...' is decoded before masking. Caught by the false-positive probe added in Step 3 of ultraplan-v6.2.0. --- .../scanners/lib/bash-normalize.mjs | 115 ++++++++++-------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/plugins/llm-security/scanners/lib/bash-normalize.mjs b/plugins/llm-security/scanners/lib/bash-normalize.mjs index b1ce1fc..6036c76 100644 --- a/plugins/llm-security/scanners/lib/bash-normalize.mjs +++ b/plugins/llm-security/scanners/lib/bash-normalize.mjs @@ -17,68 +17,71 @@ // T5 — IFS word-splitting: rm${IFS}-rf${IFS}/ -> rm -rf / // T6 — ANSI-C hex quoting: $'\x72\x6d' -rf / -> rm -rf / // -// T5 and T6 run before T1-T4 so their outputs feed the rest of the pipeline -// in canonical form. Both preserve single-quoted literals (false-positive -// probe: `echo '${IFS}'` stays untouched). +// Execution order: +// 1. Strip empty single-quote pairs (T1) so c''u''rl -> curl before masking. +// 2. Decode ANSI-C hex inside $'...' (T6) before masking. +// 3. Mask remaining non-empty single-quoted regions. T3's ${...} sweep and +// other transforms cannot rewrite their content, preserving literals +// (false-positive probe: `echo '${IFS}'` stays untouched). +// 4. Run T5 (IFS) and T2/T3/T4 on the masked string. +// 5. Unmask. + +const MASK = '\x00'; /** - * T5 — strip IFS-based word splitting outside single-quoted regions. + * Decode ANSI-C hex quoting inside `$'...'` contexts. * - * Patterns matched: ${IFS}, ${IFS:0:1}, $IFS. Each replaced with a single - * space. Content inside '...' is preserved via placeholder masking so the - * literal string `'${IFS}'` never expands. + * Shell treats $'\x72\x6d' as the bytes r and m. We decode only \xHH escape + * sequences inside the $'...' wrapper. The $'...' construct itself is + * replaced with its decoded bytes (matching shell evaluation). */ -function normalizeIFS(cmd) { +function decodeAnsiCHex(cmd) { + return cmd.replace(/\$'([^']*)'/g, (_, content) => + content.replace(/\\x([0-9a-fA-F]{2})/g, (_m, hex) => + String.fromCharCode(parseInt(hex, 16)), + ), + ); +} + +/** + * Mask non-empty single-quoted regions with placeholders. Empty '' is NOT + * masked — T1 already stripped them in the previous pass. + */ +function maskSingleQuoted(cmd) { const placeholders = []; - const MARK = '\x00'; - const masked = cmd.replace(/'[^']*'/g, (match) => { + const masked = cmd.replace(/'[^']+'/g, (match) => { placeholders.push(match); - return `${MARK}${placeholders.length - 1}${MARK}`; + return `${MASK}${placeholders.length - 1}${MASK}`; }); - const normalized = masked - .replace(/\$\{IFS:0:1\}/g, ' ') - .replace(/\$\{IFS\}/g, ' ') - .replace(/\$IFS\b/g, ' '); - return normalized.replace( - new RegExp(`${MARK}(\\d+)${MARK}`, 'g'), + return { masked, placeholders }; +} + +function unmaskSingleQuoted(str, placeholders) { + return str.replace( + new RegExp(`${MASK}(\\d+)${MASK}`, 'g'), (_, idx) => placeholders[parseInt(idx, 10)], ); } -/** - * T6 — decode ANSI-C hex quoting inside `$'...'` contexts only. - * - * Shell treats $'\x72\x6d' as the bytes r and m. Attackers use this to - * hide command names from regex gates. We decode only the \xHH escape - * sequences inside the $'...' wrapper. Regular single-quoted strings - * '...' are not touched. - */ -function normalizeAnsiCHex(cmd) { - return cmd.replace(/\$'([^']*)'/g, (match, content) => { - return content.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) => - String.fromCharCode(parseInt(hex, 16)), - ); - }); -} - /** * Normalize bash parameter expansion and quoting evasion in a command string. * - * Strips (T1-T6): - * - T1 Empty single quotes: '' (e.g., w''get -> wget) - * - T2 Empty double quotes: "" (e.g., r""m -> rm) - * - T3 Single-char parameter expansion: ${x} -> x (evasion: attacker sets x=x) - * - T3 Multi-char parameter expansion: ${ANYTHING} -> '' (unknown value) + * Strips / rewrites (T1-T6): + * - T1 Empty single quotes: '' (e.g., w''get -> wget) + * - T2 Empty double quotes: "" (e.g., r""m -> rm) + * - T3 Single-char parameter expansion: ${x} -> x (c${u}rl -> curl) + * - T3 Multi-char parameter expansion: ${FOO} -> '' (unknown value) * - T4 Backslash escapes between word chars, iteratively (c\u\r\l -> curl) - * - T5 IFS word-splitting: ${IFS}, ${IFS:0:1}, $IFS -> ' ' + * - T5 IFS word-splitting: ${IFS} / ${IFS:0:1} / $IFS -> ' ' * - T6 ANSI-C hex quoting inside $'...' -> decoded bytes * - Backtick subshell with empty/whitespace content * - * Does NOT strip: + * Does NOT rewrite: * - Quotes around arguments (only targets empty quotes that split command names) - * - $VAR without braces outside IFS (not an evasion pattern) + * - $VAR without braces (non-IFS; not an evasion pattern) * - Backslashes before non-word chars (\n, \t, etc.) - * - Content inside single-quoted regions (T5 preserves them; `echo '${IFS}'` untouched) + * - Content inside non-empty single-quoted regions + * (false-positive probe: `echo '${IFS}'` stays untouched) * * @param {string} cmd - Raw command string * @returns {string} Normalized command string @@ -86,15 +89,25 @@ function normalizeAnsiCHex(cmd) { export function normalizeBashExpansion(cmd) { if (!cmd || typeof cmd !== 'string') return cmd || ''; - // T5 + T6 run first so their outputs feed the rest of the pipeline in - // canonical form. Order inside T5/T6 is internal; externally we label - // the full pipeline T1-T6. - let result = normalizeIFS(cmd); - result = normalizeAnsiCHex(result); + // T1 — strip empty single-quote pairs first so adjacent-empty-quote evasion + // (c''u''rl -> curl) resolves before single-quote masking runs. + let result = cmd.replace(/''/g, ''); + + // T6 — decode ANSI-C hex inside $'...' before masking treats it as a literal. + result = decodeAnsiCHex(result); + + // Mask remaining non-empty single-quoted regions. + const { masked, placeholders } = maskSingleQuoted(result); + result = masked; + + // T5 — IFS word-splitting. Runs before T2/T3/T4 so the canonical spaces + // it emits feed into subsequent transforms. + result = result + .replace(/\$\{IFS:0:1\}/g, ' ') + .replace(/\$\{IFS\}/g, ' ') + .replace(/\$IFS\b/g, ' '); result = result - // T1 Strip empty single quotes: w''get -> wget - .replace(/''/g, '') // T2 Strip empty double quotes: r""m -> rm .replace(/""/g, '') // T3 Single-char ${x} -> x (evasion: c${u}rl -> curl, assumes x=x) @@ -104,12 +117,12 @@ export function normalizeBashExpansion(cmd) { // Strip backtick subshell with empty/whitespace content .replace(/`\s*`/g, ''); - // T4 Iteratively strip backslash between word chars (c\u\r\l needs 2 passes) + // T4 — iteratively strip backslash between word chars (c\u\r\l needs 2 passes) let prev; do { prev = result; result = result.replace(/(\w)\\(\w)/g, '$1$2'); } while (result !== prev); - return result; + return unmaskSingleQuoted(result, placeholders); }