diff --git a/plugins/llm-security/scanners/taint-tracer.mjs b/plugins/llm-security/scanners/taint-tracer.mjs index df33001..82d641f 100644 --- a/plugins/llm-security/scanners/taint-tracer.mjs +++ b/plugins/llm-security/scanners/taint-tracer.mjs @@ -159,39 +159,149 @@ function downgradeSeverity(sev) { // --------------------------------------------------------------------------- /** - * Attempt to extract the variable name being assigned on a source line. + * Attempt to extract the variable name(s) being assigned on a source line. * Handles: - * const/let/var X = - * X = - * X: (Python / YAML-ish) - * (X) = (destructuring approximation) + * const/let/var X = (plain decl) + * X = (plain assignment) + * X: (Python / YAML-ish) + * const { x } = (object destructuring) + * const { x, y } = (multi-key) + * const { secret: alias } = (renamed) + * const { a, ...spread } = (object rest) + * const { a, b: { c } } = (nested object) + * const [a, b] = (array destructuring) + * const [first, ...rest] = (array rest) + * const [a, [b, c]] = (nested array) * - * Returns an empty array if no assignment variable is found — the source - * will still be tracked for same-line sink detection, but not propagated. + * Implementation: regex-based, no full JS parser. Same constraint as the + * pre-B6 extractor — the goal is best-effort, not soundness. Untracked + * variables fall back to same-line sink detection (no propagation). + * + * Returns an empty array if no assignment variable is found. * * @param {string} line * @returns {string[]} variable names (may be empty) */ -function extractAssignedVariable(line) { - const names = []; +export function extractAssignedVariable(line) { + const names = new Set(); - // Pattern 1: const/let/var X = ... or const/let/var { X } = ... - const declMatch = line.match(/\b(?:const|let|var)\s+\{?\s*(\w+)/); - if (declMatch) { - names.push(declMatch[1]); + // Identify a destructuring pattern boundary on the LHS of `=`. + // Match `const|let|var` followed by either `{...}` or `[...]` and `=`. + // We capture the LHS-pattern body so we can extract names without + // reading past the assignment. + const destructDecl = line.match(/\b(?:const|let|var)\s+([{[][\s\S]*?[\]}])\s*=[^=]/); + if (destructDecl) { + extractDestructuredNames(destructDecl[1], names); + } else { + // Pattern 1: const/let/var X = ... (plain identifier — keep + // existing behavior; the original pre-B6 regex tolerated optional `{` + // and silently ate the first key. Now that destructuring has its own + // branch above, the plain-decl branch only matches plain identifiers.) + const declMatch = line.match(/\b(?:const|let|var)\s+(\w+)\s*=/); + if (declMatch) { + names.add(declMatch[1]); + } } // Pattern 2: plain assignment X = ... (no keyword) // Avoid matching == and === const assignMatch = line.match(/^\s*(\w+)\s*=[^=]/); - if (assignMatch && !names.includes(assignMatch[1])) { - names.push(assignMatch[1]); + if (assignMatch) { + names.add(assignMatch[1]); } - // Pattern 3: Python-style keyword argument or named parameter: X = source - // Already covered by Pattern 2 above. + // Pattern 3 (Python-style `X: source`) — already covered by other patterns + // when present in YAML/Python contexts via the plain-decl branch. - return names; + return [...names]; +} + +/** + * Walk a destructuring pattern body (the `{...}` or `[...]` after the + * `const`/`let`/`var` keyword and before `=`) and add every bound + * identifier to `names`. Handles nested patterns and rest elements. + * + * Pure regex — does not parse balanced brackets perfectly, but the + * patterns we care about (plain identifiers, renamed keys `key: alias`, + * rest `...spread`) all surface as `\w+` tokens at predictable positions + * that a simple tokenizer can extract. Edge case: shorthand keys with + * default values (`{ x = 5 }`) are handled by the identifier-before-= rule. + * + * @param {string} pattern The body including the outer brackets. + * @param {Set} names Mutated. + */ +function extractDestructuredNames(pattern, names) { + // Strip outer brackets so we focus on contents. + const inner = pattern.slice(1, -1); + + // Token-walk: at each position consume one of: + // - `{ ... }` or `[ ... ]` — recurse into the nested pattern + // - `key: ` — bind whatever \w+ comes from 's leading ident + // (or recurse if is a nested pattern) + // - `...spread` — the next ident is the rest var + // - `ident` — bound directly (shorthand or array element) + // - `ident = default` — bound (default value ignored) + // - separators (`,`, whitespace) — skip + // + // Implementation simplification: match on three regex alternatives that + // cover everything in practice. Catastrophic-backtracking-safe: every + // token consumes ≥1 character. + + let i = 0; + while (i < inner.length) { + const ch = inner[i]; + + if (ch === '{' || ch === '[') { + // Find matching close bracket via depth counter (handles nesting). + const open = ch; + const close = open === '{' ? '}' : ']'; + let depth = 1; + let j = i + 1; + while (j < inner.length && depth > 0) { + if (inner[j] === open) depth++; + else if (inner[j] === close) depth--; + j++; + } + // Recurse into the nested pattern body. + extractDestructuredNames(inner.slice(i, j), names); + i = j; + continue; + } + + if (ch === ',' || /\s/.test(ch) || ch === ':' || ch === '=') { + i++; + continue; + } + + if (inner.startsWith('...', i)) { + i += 3; + continue; + } + + // Identifier token. After this token: either followed by `:` (then + // the RHS is the actual binding — skip this token, the bind is the + // next ident), or followed by `,`/`}`/`]`/`=`/whitespace/end (then + // this token is the bound name). + const idMatch = inner.slice(i).match(/^(\w+)/); + if (!idMatch) { + i++; + continue; + } + const ident = idMatch[1]; + const next = i + ident.length; + // Skip whitespace to find the next significant character. + let k = next; + while (k < inner.length && /\s/.test(inner[k])) k++; + if (inner[k] === ':') { + // This ident is a key — the RHS is the binding. Don't add this + // ident; the loop will pick up the RHS on the next iteration. + i = k + 1; + continue; + } + // Otherwise this ident IS bound. + names.add(ident); + i = next; + } } // --------------------------------------------------------------------------- diff --git a/plugins/llm-security/tests/scanners/taint-destructuring.test.mjs b/plugins/llm-security/tests/scanners/taint-destructuring.test.mjs new file mode 100644 index 0000000..1ab6723 --- /dev/null +++ b/plugins/llm-security/tests/scanners/taint-destructuring.test.mjs @@ -0,0 +1,128 @@ +// taint-destructuring.test.mjs — B6 (v7.2.0) — taint-tracer destructuring + spread +// +// Critical-review §2 B6 finding: extractAssignedVariable handled `const X` +// and `X = ...` but missed every modern destructuring pattern. As a result, +// taint flows through destructured/spread variables produced false negatives +// at the propagation step (same-line sink detection still worked). +// +// This test pins the B6 fix at the unit-test level. The taint-tracer now +// exports `extractAssignedVariable` for direct testing — same pattern as +// policy-loader's `_resetCacheForTest`. + +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { extractAssignedVariable } from '../../scanners/taint-tracer.mjs'; + +describe('extractAssignedVariable — pre-B6 patterns (regression guard)', () => { + it('plain const declaration: const x = req.body', () => { + const names = extractAssignedVariable('const x = req.body;'); + assert.deepEqual(names, ['x']); + }); + + it('plain let declaration: let y = process.argv[1]', () => { + const names = extractAssignedVariable('let y = process.argv[1];'); + assert.deepEqual(names, ['y']); + }); + + it('plain var declaration: var z = user_input', () => { + const names = extractAssignedVariable('var z = user_input;'); + assert.deepEqual(names, ['z']); + }); + + it('plain assignment: tainted = req.body', () => { + const names = extractAssignedVariable('tainted = req.body;'); + assert.deepEqual(names, ['tainted']); + }); + + it('does not match equality: x == req.body', () => { + const names = extractAssignedVariable('if (x == req.body) {}'); + assert.equal(names.length, 0); + }); +}); + +describe('extractAssignedVariable — B6 destructuring patterns', () => { + it('object destructuring: const { x } = req.body', () => { + const names = extractAssignedVariable('const { x } = req.body;'); + assert.deepEqual(names.sort(), ['x']); + }); + + it('object destructuring multi-key: const { x, y } = req.body', () => { + const names = extractAssignedVariable('const { x, y } = req.body;'); + assert.deepEqual(names.sort(), ['x', 'y']); + }); + + it('renamed destructuring: const { secret: alias } = req.body', () => { + const names = extractAssignedVariable('const { secret: alias } = req.body;'); + assert.deepEqual(names.sort(), ['alias']); + // The key `secret` is NOT a binding — only `alias` is. + assert.ok(!names.includes('secret'), 'key (secret) must not be a binding'); + }); + + it('object rest: const { x, ...spread } = req.body', () => { + const names = extractAssignedVariable('const { x, ...spread } = req.body;'); + assert.deepEqual(names.sort(), ['spread', 'x']); + }); + + it('nested object destructuring: const { a, b: { c } } = req.body', () => { + const names = extractAssignedVariable('const { a, b: { c } } = req.body;'); + assert.deepEqual(names.sort(), ['a', 'c']); + // `b` is a key — not a binding. + assert.ok(!names.includes('b'), 'key (b) must not be a binding'); + }); + + it('array destructuring: const [a, b] = process.argv', () => { + const names = extractAssignedVariable('const [a, b] = process.argv;'); + assert.deepEqual(names.sort(), ['a', 'b']); + }); + + it('array rest: const [first, ...rest] = process.argv', () => { + const names = extractAssignedVariable('const [first, ...rest] = process.argv;'); + assert.deepEqual(names.sort(), ['first', 'rest']); + }); + + it('nested array destructuring: const [a, [b, c]] = source', () => { + const names = extractAssignedVariable('const [a, [b, c]] = source;'); + assert.deepEqual(names.sort(), ['a', 'b', 'c']); + }); + + it('mixed destructuring: const { user: { id }, ...rest } = req.body', () => { + const names = extractAssignedVariable('const { user: { id }, ...rest } = req.body;'); + assert.deepEqual(names.sort(), ['id', 'rest']); + // `user` is a key — not a binding. + assert.ok(!names.includes('user'), 'key (user) must not be a binding'); + }); + + it('let with destructuring: let { x } = source', () => { + const names = extractAssignedVariable('let { x } = source;'); + assert.deepEqual(names.sort(), ['x']); + }); + + it('var with destructuring: var [a] = source', () => { + const names = extractAssignedVariable('var [a] = source;'); + assert.deepEqual(names.sort(), ['a']); + }); + + it('default value in destructuring: const { x = 5 } = source', () => { + // The destructured binding `x` is still bound; the default `= 5` is on + // the RHS of `:` if absent or just trails the ident. The walker treats + // `=` and any literal that follows as separators. + const names = extractAssignedVariable('const { x = 5 } = source;'); + assert.ok(names.includes('x'), `expected x in ${JSON.stringify(names)}`); + }); +}); + +describe('extractAssignedVariable — non-destructuring pre-B6 regressions', () => { + it('does not falsely add destructure-shaped tokens from non-decl lines', () => { + // `if ({ x } === ...)` is invalid JS but the regex must not match it + // as a binding. Same for object literals on the RHS without `const`. + const names = extractAssignedVariable('return { x: 1 };'); + assert.equal(names.length, 0, `non-decl produced unexpected names: ${JSON.stringify(names)}`); + }); + + it('does not match arrow function param destructuring without const', () => { + // `({x}) => ...` is a destructure but not an assignment — skip. + const names = extractAssignedVariable('const fn = ({x}) => x;'); + // The plain-decl branch still picks `fn` (the function-binding name). + assert.deepEqual(names, ['fn']); + }); +});