From 04f1593df3c760f93851b9c80bbda65af268997d Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Wed, 29 Apr 2026 15:13:13 +0200 Subject: [PATCH] =?UTF-8?q?refactor(entropy):=20B5=20=E2=80=94=20two-stage?= =?UTF-8?q?=20context-classified=20suppression=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v7.0.0 entropy-scanner ran rules 11-13 (GLSL/CSS-in-JS/inline-markup line-proximity suppressions) for every line regardless of file type. A polyglot `.ts` file with an embedded fragment-shader template literal could therefore mask a real high-entropy credential when the credential literal happened to share a line with a GLSL keyword. Critical-review B5 documented the false-negative class. Refactor: * New `classifyFileContext(absPath, lines)` returns `'shader-dominant' | 'markup-dominant' | 'code-dominant' | 'mixed'`, keyed off file extension with a content-density fallback for code-extension files (≥50% of sampled non-blank lines matching GLSL/inline-markup → downgrade to `mixed`). * `isFalsePositive(str, line, absPath, context)` gates rules 11-13 on `context !== 'code-dominant'`. Rules 1-10 and 14-19 still run unconditionally, so URL/path/test-fixture/ffmpeg/UA/SQL/error-template suppression behaves identically. * `scanFileContent` computes `fileContext` once per file and threads it through every per-string suppression check. Conservative defaults to keep the regression surface minimal: * Files with `<5` sampled non-blank lines fall back to `mixed` (preserves the existing rule-11/12/13 behaviour for the single-line .js fixtures used by entropy-context.test.mjs). * Unknown extensions fall back to `mixed`. * Code-extension files densely populated with shader/markup content fall back to `mixed`. Net effect: a `.ts` file with an embedded GLSL block but mostly TS code on the surrounding lines now surfaces credentials that the v7.0.0 line-proximity heuristic suppressed. 
Pure shader/markup files are unaffected (extension skip / mixed default). New fixture: tests/fixtures/entropy/polyglot-ts-with-glsl.ts (with runtime placeholder so it does not commit a high-entropy literal). +3 tests in tests/scanners/entropy-context.test.mjs (26 → 29). Existing entropy.test.mjs and entropy-context.test.mjs all remain green. Full suite 1658 → 1661. Refs: Batch B Wave 5 / Step 12 / v7.2.0 critical-review-2026-04-20.md §B5 --- .../llm-security/scanners/entropy-scanner.mjs | 95 +++++++++++++++++-- .../fixtures/entropy/polyglot-ts-with-glsl.ts | 32 +++++++ .../tests/scanners/entropy-context.test.mjs | 78 +++++++++++++++ 3 files changed, 197 insertions(+), 8 deletions(-) create mode 100644 plugins/llm-security/tests/fixtures/entropy/polyglot-ts-with-glsl.ts diff --git a/plugins/llm-security/scanners/entropy-scanner.mjs b/plugins/llm-security/scanners/entropy-scanner.mjs index bbd7ce7..c023bc4 100644 --- a/plugins/llm-security/scanners/entropy-scanner.mjs +++ b/plugins/llm-security/scanners/entropy-scanner.mjs @@ -176,6 +176,69 @@ const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxErr */ const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//; +// --------------------------------------------------------------------------- +// File-context classification (B5, v7.2.0) +// --------------------------------------------------------------------------- + +/** File extensions treated as pure shader/markup/code by classifyFileContext. */ +const SHADER_EXTENSIONS = new Set(['.glsl', '.frag', '.vert', '.shader', '.wgsl']); +const MARKUP_EXTENSIONS = new Set(['.html', '.htm', '.svg', '.xml', '.md', '.markdown', '.mdx']); +const CODE_EXTENSIONS = new Set([ + '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs', + '.py', '.go', '.rs', '.rb', '.java', '.cs', + '.kt', '.scala', '.swift', '.cpp', '.c', '.h', '.hpp', '.php', +]); + +/** + * Classify a file as shader-dominant, markup-dominant, code-dominant, or mixed. 
+ * + * Used by isFalsePositive() to gate rules 11-13 (GLSL/CSS-in-JS/inline-markup + * line-proximity suppressions). Those rules fire only when context !== 'code-dominant' + * — preventing the v7.0.0 polyglot false-negative (a real credential on a line + * with an inline GLSL keyword would be incorrectly suppressed). + * + * Conservative defaults to minimize regression risk: + * - Unknown extensions → 'mixed' (all rules apply) + * - Code-extension files with very few non-blank lines (<5 sampled) → 'mixed' + * - Code-extension files where ≥50% of sampled lines match GLSL/inline-markup → 'mixed' + * - Code-extension files otherwise → 'code-dominant' + * + * @param {string} absPath + * @param {string[]} lines + * @returns {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} + */ +function classifyFileContext(absPath, lines) { + const lower = absPath.toLowerCase(); + // Pull the actual extension; supports compound names by taking last dot. + const slashIdx = Math.max(lower.lastIndexOf('/'), lower.lastIndexOf('\\')); + const baseName = slashIdx >= 0 ? lower.slice(slashIdx + 1) : lower; + const dotIdx = baseName.lastIndexOf('.'); + const ext = dotIdx >= 0 ? baseName.slice(dotIdx) : ''; + + if (SHADER_EXTENSIONS.has(ext)) return 'shader-dominant'; + if (MARKUP_EXTENSIONS.has(ext)) return 'markup-dominant'; + + if (CODE_EXTENSIONS.has(ext)) { + let sampled = 0; + let suppressionHits = 0; + for (let i = 0; i < lines.length && sampled < 50; i++) { + const trimmed = lines[i].trim(); + if (trimmed.length === 0) continue; + sampled++; + if (GLSL_KEYWORDS.test(trimmed) || INLINE_MARKUP.test(trimmed)) { + suppressionHits++; + } + } + // Too few non-blank lines to classify confidently → conservative default. + if (sampled < 5) return 'mixed'; + // Mostly shader/markup despite the code extension → conservative default. 
+ if (suppressionHits / sampled >= 0.5) return 'mixed'; + return 'code-dominant'; + } + + return 'mixed'; +} + // --------------------------------------------------------------------------- // False-positive suppression helpers // --------------------------------------------------------------------------- @@ -183,12 +246,20 @@ const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//; /** * Decide whether a candidate string should be suppressed (likely a false positive). * + * v7.2.0 (B5): rules 11-13 (GLSL/CSS-in-JS/inline-markup line-proximity) are + * gated on `context !== 'code-dominant'`. In code-dominant files, an inline + * shader keyword next to a credential-shaped string is no longer a reason + * to suppress — that was the v7.0.0 polyglot false-negative (e.g. a `.ts` + * file with embedded GLSL block hiding a real secret that shares a line with a GLSL keyword). + * * @param {string} str - The extracted string literal value * @param {string} line - The full source line it came from * @param {string} absPath - Absolute file path + * @param {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} [context='mixed'] + * File-level classification from classifyFileContext. * @returns {boolean} - true if this string should be skipped */ -function isFalsePositive(str, line, absPath) { +function isFalsePositive(str, line, absPath, context = 'mixed') { // 1. URLs — entropy is misleading for long query strings / JWTs in URLs if (str.startsWith('http://') || str.startsWith('https://')) return true; @@ -234,14 +305,19 @@ function isFalsePositive(str, line, absPath) { if (str.startsWith(prefix)) return true; } - // 11. GLSL/WGSL shader keywords on the line — inline shader source - if (GLSL_KEYWORDS.test(line)) return true; + // Rules 11-13 (v7.2.0 B5): line-proximity suppressions for shader/CSS/markup. + // Gated on context !== 'code-dominant' so that a credential adjacent to an + // inline GLSL keyword in a `.ts` file is no longer suppressed. + if (context !== 'code-dominant') { + // 11. 
GLSL/WGSL shader keywords on the line — inline shader source + if (GLSL_KEYWORDS.test(line)) return true; - // 12. CSS-in-JS (styled-components, emotion, vanilla-extract) - if (CSS_IN_JS_PATTERN.test(line)) return true; + // 12. CSS-in-JS (styled-components, emotion, vanilla-extract) + if (CSS_IN_JS_PATTERN.test(line)) return true; - // 13. Inline HTML/SVG markup — React/Vue components, email templates - if (INLINE_MARKUP.test(line)) return true; + // 13. Inline HTML/SVG markup — React/Vue components, email templates + if (INLINE_MARKUP.test(line)) return true; + } // 14. ffmpeg filter-graph syntax — long structured strings, not encoded if (FFMPEG_SYNTAX.test(line)) return true; @@ -337,6 +413,9 @@ function maxSeverity(a, b) { function scanFileContent(content, absPath, relPath) { const findings = []; const lines = content.split('\n'); + // v7.2.0 (B5): classify the file once per scan; rules 11-13 inside + // isFalsePositive are gated on this context. + const fileContext = classifyFileContext(absPath, lines); // De-duplicate: track (line, evidence) pairs to avoid reporting the same // string twice when it appears in both extractStringLiterals and assignment @@ -364,7 +443,7 @@ function scanFileContent(content, absPath, relPath) { if (!str || str.length < 10) continue; // False positive suppression - if (isFalsePositive(str, line, absPath)) continue; + if (isFalsePositive(str, line, absPath, fileContext)) continue; const H = shannonEntropy(str); let severity = classifyEntropy(H, str.length); diff --git a/plugins/llm-security/tests/fixtures/entropy/polyglot-ts-with-glsl.ts b/plugins/llm-security/tests/fixtures/entropy/polyglot-ts-with-glsl.ts new file mode 100644 index 0000000..c42943b --- /dev/null +++ b/plugins/llm-security/tests/fixtures/entropy/polyglot-ts-with-glsl.ts @@ -0,0 +1,32 @@ +// Polyglot TypeScript fixture for the entropy-scanner B5 regression. 
+// +// Pre-B5 behaviour: rule 11 (GLSL_KEYWORDS line-proximity) suppressed any +// high-entropy string that happened to share a line with shader keywords. +// In a `.ts` file with an embedded fragment-shader template literal, a real +// credential on the closing brace line would be silently dismissed. +// +// Post-B5 behaviour: classifyFileContext returns 'code-dominant' for `.ts` +// files (unless the file is overwhelmingly shader/markup), which disables +// rules 11-13. The credential below is therefore detected. +// +// The placeholder __ENTROPY_PAYLOAD_PLACEHOLDER__ is replaced at test time +// with a randomly generated high-entropy string. The static fixture stays +// out of the pre-edit-secrets hook because no real high-entropy literal is +// committed to disk. + +const fragmentShader = ` + precision highp float; + uniform vec3 u_resolution; + uniform float u_time; + varying vec2 v_uv; + void main() { + vec3 color = vec3(v_uv, sin(u_time)); + gl_FragColor = vec4(color, 1.0); + } +`; + +// The next line ends a uniform vec3 declaration AND carries the placeholder +// — exactly the kind of GLSL-adjacent line that rule 11 used to suppress. +const placeholder = "__ENTROPY_PAYLOAD_PLACEHOLDER__"; // uniform vec3 normal; + +export { fragmentShader, placeholder }; diff --git a/plugins/llm-security/tests/scanners/entropy-context.test.mjs b/plugins/llm-security/tests/scanners/entropy-context.test.mjs index bf84d4b..6c292e9 100644 --- a/plugins/llm-security/tests/scanners/entropy-context.test.mjs +++ b/plugins/llm-security/tests/scanners/entropy-context.test.mjs @@ -262,4 +262,82 @@ describe('entropy-scanner context suppression (v7.0.0+)', () => { await rm(fx, { recursive: true, force: true }); }); }); + + describe('D. 
B5 file-context classification (v7.2.0)', () => { + it('B5 regression: code-dominant .ts file with embedded GLSL — credential adjacent to shader is detected', async () => { + // Polyglot TS file: many code lines, a few GLSL lines inside a template + // literal, and a credential-shaped string on a line that happens to + // contain GLSL keyword tokens. Pre-B5 rule 11 line-proximity suppressed + // this. Post-B5 classifyFileContext returns 'code-dominant' (sample is + // mostly TS code, <50% GLSL/markup), rules 11-13 are gated off, and + // the credential is detected. + const fx = await newRoot('ent-b5-polyglot-'); + const fixtureContent = [ + 'import { Renderer } from "./renderer";', + '', + 'const fragmentShader = `', + ' precision highp float;', + ' uniform vec3 u_resolution;', + ' varying vec2 v_uv;', + '`;', + '', + '// Adjacent line carries GLSL tokens AND the credential payload.', + 'const blob = "' + PAYLOAD + '"; // uniform vec3 normal;', + '', + 'export { fragmentShader, blob };', + ].join('\n'); + await writeFixture(fx, 'shader-app.ts', fixtureContent); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.ok( + result.findings.length >= 1, + 'expected B5 to surface credential in code-dominant .ts despite GLSL neighbour; got ' + result.findings.length + ); + await rm(fx, { recursive: true, force: true }); + }); + + it('B5 control: legitimate .glsl file with high-entropy hash in shader source is still suppressed (extension skip)', async () => { + // A pure-shader file is skipped at the file-extension gate, never + // reaching classifyFileContext. This control confirms the extension + // skip still works (B5 only changed line-level rule gating). 
+ const fx = await newRoot('ent-b5-glsl-'); + await writeFixture(fx, 'noise.glsl', + 'uniform vec3 u_seed;\nvec3 rand = vec3(' + PAYLOAD + ');\n'); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.equal(result.findings.length, 0, '.glsl files remain extension-skipped'); + await rm(fx, { recursive: true, force: true }); + }); + + it('B5 control: shader-dominant .ts file with ≥50% GLSL lines downgrades to mixed and suppresses', async () => { + // A code-extension file that is *mostly* shader template content — + // rule 11 should still fire because classifyFileContext downgrades it + // to 'mixed' (≥50% sampled lines match GLSL/INLINE_MARKUP). + const fx = await newRoot('ent-b5-shader-ts-'); + const fixtureContent = [ + 'uniform vec3 u_resolution;', + 'uniform vec3 u_camera_pos;', + 'uniform float u_time;', + 'varying vec2 v_uv;', + 'varying vec3 v_normal;', + 'attribute vec3 position;', + 'attribute vec2 uv;', + 'precision highp float;', + 'gl_Position = vec4(position, 1.0);', + 'gl_FragColor = vec4(1.0);', + 'const blob = "' + PAYLOAD + '"; // uniform vec3 normal;', + ].join('\n'); + await writeFixture(fx, 'shader-heavy.ts', fixtureContent); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.equal( + result.findings.length, 0, + 'expected shader-dense .ts (≥50% GLSL lines) to downgrade to mixed and suppress; got ' + result.findings.length + ); + await rm(fx, { recursive: true, force: true }); + }); + }); });