fix(scanners): preserve single-quoted regions through bash-normalize pipeline

Masks non-empty '...' content before T5/T2-T4 run so literal strings such
as `echo '${IFS}'` are not rewritten. Empty '' pairs are stripped first
so c''u''rl -> curl evasion keeps resolving. ANSI-C $'...' is decoded
before masking.

Caught by the false-positive probe added in Step 3 of ultraplan-v6.2.0.
This commit is contained in:
Kjell Tore Guttormsen 2026-04-17 14:29:02 +02:00
commit f881cf9251

View file

@ -17,68 +17,71 @@
// T5 — IFS word-splitting: rm${IFS}-rf${IFS}/ -> rm -rf / // T5 — IFS word-splitting: rm${IFS}-rf${IFS}/ -> rm -rf /
// T6 — ANSI-C hex quoting: $'\x72\x6d' -rf / -> rm -rf / // T6 — ANSI-C hex quoting: $'\x72\x6d' -rf / -> rm -rf /
// //
// T5 and T6 run before T1-T4 so their outputs feed the rest of the pipeline // Execution order:
// in canonical form. Both preserve single-quoted literals (false-positive // 1. Strip empty single-quote pairs (T1) so c''u''rl -> curl before masking.
// probe: `echo '${IFS}'` stays untouched). // 2. Decode ANSI-C hex inside $'...' (T6) before masking.
// 3. Mask remaining non-empty single-quoted regions. T3's ${...} sweep and
// other transforms cannot rewrite their content, preserving literals
// (false-positive probe: `echo '${IFS}'` stays untouched).
// 4. Run T5 (IFS) and T2/T3/T4 on the masked string.
// 5. Unmask.
const MASK = '\x00';
/** /**
* T5 strip IFS-based word splitting outside single-quoted regions. * Decode ANSI-C hex quoting inside `$'...'` contexts.
* *
* Patterns matched: ${IFS}, ${IFS:0:1}, $IFS. Each replaced with a single * Shell treats $'\x72\x6d' as the bytes r and m. We decode only \xHH escape
* space. Content inside '...' is preserved via placeholder masking so the * sequences inside the $'...' wrapper. The $'...' construct itself is
* literal string `'${IFS}'` never expands. * replaced with its decoded bytes (matching shell evaluation).
*/ */
function normalizeIFS(cmd) { function decodeAnsiCHex(cmd) {
return cmd.replace(/\$'([^']*)'/g, (_, content) =>
content.replace(/\\x([0-9a-fA-F]{2})/g, (_m, hex) =>
String.fromCharCode(parseInt(hex, 16)),
),
);
}
/**
* Mask non-empty single-quoted regions with placeholders. Empty '' is NOT
* masked — T1 already stripped those pairs in an earlier pass.
*/
function maskSingleQuoted(cmd) {
const placeholders = []; const placeholders = [];
const MARK = '\x00'; const masked = cmd.replace(/'[^']+'/g, (match) => {
const masked = cmd.replace(/'[^']*'/g, (match) => {
placeholders.push(match); placeholders.push(match);
return `${MARK}${placeholders.length - 1}${MARK}`; return `${MASK}${placeholders.length - 1}${MASK}`;
}); });
const normalized = masked return { masked, placeholders };
.replace(/\$\{IFS:0:1\}/g, ' ') }
.replace(/\$\{IFS\}/g, ' ')
.replace(/\$IFS\b/g, ' '); function unmaskSingleQuoted(str, placeholders) {
return normalized.replace( return str.replace(
new RegExp(`${MARK}(\\d+)${MARK}`, 'g'), new RegExp(`${MASK}(\\d+)${MASK}`, 'g'),
(_, idx) => placeholders[parseInt(idx, 10)], (_, idx) => placeholders[parseInt(idx, 10)],
); );
} }
/**
* T6 decode ANSI-C hex quoting inside `$'...'` contexts only.
*
* Shell treats $'\x72\x6d' as the bytes r and m. Attackers use this to
* hide command names from regex gates. We decode only the \xHH escape
* sequences inside the $'...' wrapper. Regular single-quoted strings
* '...' are not touched.
*/
function normalizeAnsiCHex(cmd) {
return cmd.replace(/\$'([^']*)'/g, (match, content) => {
return content.replace(/\\x([0-9a-fA-F]{2})/g, (_, hex) =>
String.fromCharCode(parseInt(hex, 16)),
);
});
}
/** /**
* Normalize bash parameter expansion and quoting evasion in a command string. * Normalize bash parameter expansion and quoting evasion in a command string.
* *
* Strips (T1-T6): * Strips / rewrites (T1-T6):
* - T1 Empty single quotes: '' (e.g., w''get -> wget) * - T1 Empty single quotes: '' (e.g., w''get -> wget)
* - T2 Empty double quotes: "" (e.g., r""m -> rm) * - T2 Empty double quotes: "" (e.g., r""m -> rm)
* - T3 Single-char parameter expansion: ${x} -> x (evasion: attacker sets x=x) * - T3 Single-char parameter expansion: ${x} -> x (c${u}rl -> curl)
* - T3 Multi-char parameter expansion: ${ANYTHING} -> '' (unknown value) * - T3 Multi-char parameter expansion: ${FOO} -> '' (unknown value)
* - T4 Backslash escapes between word chars, iteratively (c\u\r\l -> curl) * - T4 Backslash escapes between word chars, iteratively (c\u\r\l -> curl)
* - T5 IFS word-splitting: ${IFS}, ${IFS:0:1}, $IFS -> ' ' * - T5 IFS word-splitting: ${IFS} / ${IFS:0:1} / $IFS -> ' '
* - T6 ANSI-C hex quoting inside $'...' -> decoded bytes * - T6 ANSI-C hex quoting inside $'...' -> decoded bytes
* - Backtick subshell with empty/whitespace content * - Backtick subshell with empty/whitespace content
* *
* Does NOT strip: * Does NOT rewrite:
* - Quotes around arguments (only targets empty quotes that split command names) * - Quotes around arguments (only targets empty quotes that split command names)
* - $VAR without braces outside IFS (not an evasion pattern) * - $VAR without braces (non-IFS; not an evasion pattern)
* - Backslashes before non-word chars (\n, \t, etc.) * - Backslashes before non-word chars (\n, \t, etc.)
* - Content inside single-quoted regions (T5 preserves them; `echo '${IFS}'` untouched) * - Content inside non-empty single-quoted regions
* (false-positive probe: `echo '${IFS}'` stays untouched)
* *
* @param {string} cmd - Raw command string * @param {string} cmd - Raw command string
* @returns {string} Normalized command string * @returns {string} Normalized command string
@ -86,15 +89,25 @@ function normalizeAnsiCHex(cmd) {
export function normalizeBashExpansion(cmd) { export function normalizeBashExpansion(cmd) {
if (!cmd || typeof cmd !== 'string') return cmd || ''; if (!cmd || typeof cmd !== 'string') return cmd || '';
// T5 + T6 run first so their outputs feed the rest of the pipeline in // T1 — strip empty single-quote pairs first so adjacent-empty-quote evasion
// canonical form. Order inside T5/T6 is internal; externally we label // (c''u''rl -> curl) resolves before single-quote masking runs.
// the full pipeline T1-T6. let result = cmd.replace(/''/g, '');
let result = normalizeIFS(cmd);
result = normalizeAnsiCHex(result); // T6 — decode ANSI-C hex inside $'...' before masking treats it as a literal.
result = decodeAnsiCHex(result);
// Mask remaining non-empty single-quoted regions.
const { masked, placeholders } = maskSingleQuoted(result);
result = masked;
// T5 — IFS word-splitting. Runs before T2/T3/T4 so the canonical spaces
// it emits feed into subsequent transforms.
result = result
.replace(/\$\{IFS:0:1\}/g, ' ')
.replace(/\$\{IFS\}/g, ' ')
.replace(/\$IFS\b/g, ' ');
result = result result = result
// T1 Strip empty single quotes: w''get -> wget
.replace(/''/g, '')
// T2 Strip empty double quotes: r""m -> rm // T2 Strip empty double quotes: r""m -> rm
.replace(/""/g, '') .replace(/""/g, '')
// T3 Single-char ${x} -> x (evasion: c${u}rl -> curl, assumes x=x) // T3 Single-char ${x} -> x (evasion: c${u}rl -> curl, assumes x=x)
@ -104,12 +117,12 @@ export function normalizeBashExpansion(cmd) {
// Strip backtick subshell with empty/whitespace content // Strip backtick subshell with empty/whitespace content
.replace(/`\s*`/g, ''); .replace(/`\s*`/g, '');
// T4 Iteratively strip backslash between word chars (c\u\r\l needs 2 passes) // T4 — iteratively strip backslash between word chars (c\u\r\l needs 2 passes)
let prev; let prev;
do { do {
prev = result; prev = result;
result = result.replace(/(\w)\\(\w)/g, '$1$2'); result = result.replace(/(\w)\\(\w)/g, '$1$2');
} while (result !== prev); } while (result !== prev);
return result; return unmaskSingleQuoted(result, placeholders);
} }