ktg-plugin-marketplace/plugins/llm-security/scanners/lib/bash-normalize.mjs

// bash-normalize.mjs — Normalize bash parameter expansion evasion techniques.
//
// Attackers can evade command-name matching by inserting shell metacharacters
// that are transparent to bash but break regex patterns.
//
// This module strips these constructs from command names so that downstream
// pattern matching sees the canonical form.
//
// Exported as a shared module — used by pre-bash-destructive.mjs and
// pre-install-supply-chain.mjs.
//
// Pipeline (defense-in-depth layer above Claude Code 2.1.98+ harness fixes):
//   T1 — empty single quotes:       w''get           -> wget
//   T2 — empty double quotes:       r""m             -> rm
//   T3 — parameter expansion:       ${x} / ${FOO}    -> x / ''
//   T4 — backslash-between-words:   c\u\r\l          -> curl
//   T5 — IFS word-splitting:        rm${IFS}-rf${IFS}/ -> rm -rf /
//   T6 — ANSI-C hex quoting:        $'\x72\x6d' -rf / -> rm -rf /
//   T9 — eval-via-variable:         X=rm; eval "$X" -> X=rm; eval "rm"
//        (one-level forward-flow; T7 process-substitution + T8 base64-pipe-shell
//        live in adjacent layers, see workflow-scanner / pre-bash-destructive)
//
// Execution order:
//   1. Strip empty single-quote pairs (T1) so c''u''rl -> curl before masking.
//   2. Decode ANSI-C hex inside $'...' (T6) before masking.
//   3. Mask remaining non-empty single-quoted regions. T3's ${...} sweep and
//      other transforms cannot rewrite their content, preserving literals
//      (false-positive probe: `echo '${IFS}'` stays untouched).
//   4. Run T5 (IFS), then T9 (eval-via-variable), then T2/T3/T4 on the masked string.
//   5. Unmask.

// Sentinel character that brackets a placeholder index (MASK + index + MASK)
// while non-empty single-quoted regions are masked out of the command string.
// Written by maskSingleQuoted and resolved again by unmaskSingleQuoted.
const MASK = '\x00';

/**
 * Decode numeric ANSI-C escapes inside `$'...'` contexts (T6).
 *
 * Bash evaluates $'\x72\x6d', $'\162\155' and $'\u0072\u006d' all to `rm`, so
 * every numeric escape form is decoded, not just two-digit hex:
 *   - \xH / \xHH          one or two hex digits
 *   - \n / \nn / \nnn     one to three octal digits (value taken as 8 bits)
 *   - \uH..\uHHHH         one to four hex digits
 *   - \UH..\UHHHHHHHH     one to eight hex digits (left as-is when the value
 *                         is not a valid code point)
 *
 * An escaped backslash (`\\`) is consumed as a unit and left unchanged so the
 * characters after it are not mistaken for an escape ($'\\x72' is the literal
 * text \x72 in bash, not `r`). Non-numeric escapes (\n, \t, ...) are left
 * untouched.
 *
 * The $'...' wrapper itself is replaced with the decoded content. Bash strings
 * cannot hold NUL, so the decoded word is cut at the first NUL; this also
 * keeps decoded input from forging the NUL-delimited placeholders used by the
 * single-quote masking step.
 *
 * @param {string} cmd - Command string to scan for $'...' words
 * @returns {string} Command string with each $'...' replaced by its decoded content
 */
function decodeAnsiCHex(cmd) {
  const ESCAPE_RE =
    /\\(?:\\|x([0-9a-fA-F]{1,2})|([0-7]{1,3})|u([0-9a-fA-F]{1,4})|U([0-9a-fA-F]{1,8}))/g;

  return cmd.replace(/\$'([^']*)'/g, (_, content) => {
    const decoded = content.replace(ESCAPE_RE, (escape, hex, octal, u4, u8) => {
      let code;
      if (hex !== undefined) {
        code = parseInt(hex, 16);
      } else if (octal !== undefined) {
        // Octal escapes denote a single byte; keep the low 8 bits.
        code = parseInt(octal, 8) & 0xff;
      } else if (u4 !== undefined) {
        code = parseInt(u4, 16);
      } else if (u8 !== undefined) {
        code = parseInt(u8, 16);
      } else {
        // Escaped backslash: keep verbatim so the following text stays literal.
        return escape;
      }
      // String.fromCodePoint throws above U+10FFFF; leave such escapes alone.
      return code <= 0x10ffff ? String.fromCodePoint(code) : escape;
    });

    // The shell word ends at the first NUL byte.
    const nulAt = decoded.indexOf('\x00');
    return nulAt === -1 ? decoded : decoded.slice(0, nulAt);
  });
}

/**
 * T9 — Substitute single-level variable assignments into ${VAR} and $VAR
 * references. Defeats split-and-eval evasion (X=rm; eval "$X" -rf /).
 *
 * Assignments are collected in one scan. An assignment is recognised at the
 * start of the string or right after a command separator: `;`, `&`, `|` or a
 * newline (multi-line commands are otherwise a trivial bypass:
 * "X=rm\neval $X"). When a name is assigned more than once, the first
 * assignment wins. Each collected value is then substituted for every
 * `${NAME}` and `$NAME` (word-boundary terminated) occurrence in the string.
 *
 * Limitations (documented for adversarial review):
 *   - Quoted values (X="rm -rf") are not parsed — value capture stops at
 *     whitespace or a separator. Unquoted single-token values are the common
 *     evasion idiom.
 *   - Substitution is global within the string, not scoped to eval and not
 *     limited to references that appear after the assignment.
 *   - Substituted text is not rescanned for the same variable, so
 *     self-references (PATH=$PATH:/x) terminate.
 *
 * @param {string} cmd - Command string (single-quoted regions may be masked)
 * @returns {string} Command string with known variable references replaced
 */
function decodeEvalViaVariable(cmd) {
  const assignments = new Map();
  // Newline is a command separator just like ; & | and must open a new
  // assignment position.
  const ASSIGN_RE = /(?:^|[;&|\n])\s*([A-Za-z_]\w*)=([^\s;&|]+)/g;
  let m;
  while ((m = ASSIGN_RE.exec(cmd)) !== null) {
    const [, name, value] = m;
    if (!assignments.has(name)) assignments.set(name, value);
  }
  if (assignments.size === 0) return cmd;

  let result = cmd;
  for (const [name, value] of assignments) {
    // `name` only contains [A-Za-z0-9_], so it is safe to embed in a RegExp.
    // Function replacers keep `$` sequences inside `value` literal.
    const curlyRe = new RegExp(`\\$\\{${name}\\}`, 'g');
    result = result.replace(curlyRe, () => value);
    const bareRe = new RegExp(`\\$${name}\\b`, 'g');
    result = result.replace(bareRe, () => value);
  }
  return result;
}

/**
 * Swap every non-empty single-quoted region ('...') for a MASK-delimited
 * index token so later transforms cannot rewrite the quoted text. An empty ''
 * never matches here (the pattern requires at least one character between the
 * quotes).
 *
 * @param {string} cmd - Command string to mask
 * @returns {{masked: string, placeholders: string[]}} The masked string plus
 *   the original quoted regions (quotes included), indexed by token number
 */
function maskSingleQuoted(cmd) {
  const placeholders = [];

  // push() returns the new length, so the slot just filled is length - 1.
  const stash = (quoted) => {
    const slot = placeholders.push(quoted) - 1;
    return MASK + String(slot) + MASK;
  };

  const masked = cmd.replace(/'[^']+'/g, stash);
  return { masked, placeholders };
}

/**
 * Restore the single-quoted regions that maskSingleQuoted replaced with
 * MASK-delimited index tokens.
 *
 * A token whose index has no stored placeholder is left in the string
 * unchanged. Such tokens can only come from NUL bytes already present in the
 * input; resolving them blindly would make String.replace insert the literal
 * text "undefined" into the normalized command.
 *
 * @param {string} str - Masked command string
 * @param {string[]} placeholders - Quoted regions indexed by token number
 * @returns {string} Command string with every known token restored
 */
function unmaskSingleQuoted(str, placeholders) {
  const tokenRe = new RegExp(`${MASK}(\\d+)${MASK}`, 'g');
  return str.replace(tokenRe, (token, idx) => {
    const slot = parseInt(idx, 10);
    return slot < placeholders.length ? placeholders[slot] : token;
  });
}

/**
 * Rewrite a command string into a canonical form by undoing bash quoting and
 * parameter-expansion tricks that split or hide command names.
 *
 * Steps, in the order they are applied:
 *   1. T1 — drop empty single-quote pairs ''            (w''get -> wget)
 *   2. T6 — decode $'...' ANSI-C words via decodeAnsiCHex
 *   3. Mask the remaining non-empty single-quoted regions so the steps below
 *      cannot touch their content
 *   4. T5 — ${IFS:0:1}, ${IFS} and $IFS become a single space
 *   5. T9 — decodeEvalViaVariable substitutes NAME=value assignments found in
 *      the string into ${NAME} / $NAME references
 *   6. T2 — drop empty double-quote pairs ""            (r""m -> rm)
 *   7. T3 — ${x} with a one-character name becomes x    (c${u}rl -> curl);
 *      any other ${...} is removed (value unknown)
 *   8. Backtick pairs holding nothing but whitespace are removed
 *   9. T4 — a backslash sitting between two word characters is removed,
 *      repeated until the string stops changing         (c\u\r\l -> curl)
 *  10. Masked single-quoted regions are restored
 *
 * Left untouched:
 *   - Non-empty quotes around arguments
 *   - $VAR without braces, unless it is $IFS or names a variable assigned in
 *     the same command string (step 5)
 *   - Backslashes that are not between two word characters
 *   - Content inside non-empty single-quoted regions
 *     (false-positive probe: `echo '${IFS}'` stays untouched)
 *
 * @param {string} cmd - Raw command string
 * @returns {string} Normalized command string. Falsy input yields ''; a
 *   truthy non-string value is handed back unchanged.
 */
export function normalizeBashExpansion(cmd) {
  // Nothing to normalize for empty or non-string input.
  if (typeof cmd !== 'string' || cmd === '') return cmd || '';

  // T1 must precede masking so c''u''rl collapses to curl first.
  const withoutEmptySingles = cmd.replace(/''/g, '');

  // T6 must precede masking, otherwise $'...' would be frozen as a literal.
  const ansiDecoded = decodeAnsiCHex(withoutEmptySingles);

  const { masked, placeholders } = maskSingleQuoted(ansiDecoded);
  let text = masked;

  // T5 — every IFS spelling becomes one canonical space.
  const IFS_FORMS = [/\$\{IFS:0:1\}/g, /\$\{IFS\}/g, /\$IFS\b/g];
  for (const ifsForm of IFS_FORMS) {
    text = text.replace(ifsForm, ' ');
  }

  // T9 — has to run ahead of T3, which would erase unknown ${VAR} outright.
  text = decodeEvalViaVariable(text);

  // T2, T3 (single-char then catch-all) and empty backtick subshells.
  const STRIP_RULES = [
    [/""/g, ''],
    [/\$\{(\w)\}/g, '$1'],
    [/\$\{[^}]*\}/g, ''],
    [/`\s*`/g, ''],
  ];
  for (const [pattern, replacement] of STRIP_RULES) {
    text = text.replace(pattern, replacement);
  }

  // T4 — one pass cannot clear overlapping matches (c\u\r\l), so repeat
  // until a pass makes no change.
  for (;;) {
    const stripped = text.replace(/(\w)\\(\w)/g, '$1$2');
    if (stripped === text) break;
    text = stripped;
  }

  return unmaskSingleQuoted(text, placeholders);
}