refactor(entropy): B5 — two-stage context-classified suppression pipeline

The v7.0.0 entropy-scanner ran rules 11-13 (GLSL/CSS-in-JS/inline-markup
line-proximity suppressions) for every line regardless of file type. A
polyglot `.ts` file with an embedded fragment-shader template literal
could therefore mask a real high-entropy credential when the credential
literal happened to share a line with a GLSL keyword. Critical-review
B5 documented the false-negative class.

Refactor:

  * New `classifyFileContext(absPath, lines)` returns
    `'shader-dominant' | 'markup-dominant' | 'code-dominant' | 'mixed'`,
    keyed off file extension with a content-density fallback for
    code-extension files (≥50% of sampled non-blank lines matching
    GLSL/inline-markup → downgrade to `mixed`).

  * `isFalsePositive(str, line, absPath, context)` gates rules 11-13
    on `context !== 'code-dominant'`. Rules 1-10 and 14-19 still run
    unconditionally, so URL/path/test-fixture/ffmpeg/UA/SQL/error-
    template suppression behaves identically.

  * `scanFileContent` computes `fileContext` once per file and threads
    it through every per-string suppression check.

Conservative defaults to keep the regression surface minimal:

  * Code-extension files with fewer than 5 sampled non-blank lines fall
    back to `mixed` (preserves the existing rule-11/12/13 behaviour for
    the single-line .js fixtures used by entropy-context.test.mjs).
  * Unknown extensions fall back to `mixed`.
  * Code-extension files densely populated with shader/markup
    content fall back to `mixed`.

Net effect: a `.ts` file with an embedded GLSL block but mostly TS
code on the surrounding lines now surfaces credentials that the
v7.0.0 line-proximity heuristic suppressed. Pure shader/markup
files are unaffected (extension skip, or a context other than
`code-dominant`, which keeps rules 11-13 active).

New fixture: tests/fixtures/entropy/polyglot-ts-with-glsl.ts (with
runtime placeholder so it does not commit a high-entropy literal).

+3 tests in tests/scanners/entropy-context.test.mjs (26 → 29).
Existing entropy.test.mjs and entropy-context.test.mjs all remain
green. Full suite 1658 → 1661.

Refs: Batch B Wave 5 / Step 12 / v7.2.0
critical-review-2026-04-20.md §B5
This commit is contained in:
Kjell Tore Guttormsen 2026-04-29 15:13:13 +02:00
commit 04f1593df3
3 changed files with 197 additions and 8 deletions

View file

@ -176,6 +176,69 @@ const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxErr
*/
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//;
// ---------------------------------------------------------------------------
// File-context classification (B5, v7.2.0)
// ---------------------------------------------------------------------------
/**
 * Extension sets consulted by classifyFileContext. Entries are lower-case and
 * include the leading dot, matching the extension that function extracts.
 */
const SHADER_EXTENSIONS = new Set(['.glsl', '.frag', '.vert', '.shader', '.wgsl']);
const MARKUP_EXTENSIONS = new Set(['.html', '.htm', '.svg', '.xml', '.md', '.markdown', '.mdx']);
const CODE_EXTENSIONS = new Set([
  '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
  '.py', '.go', '.rs', '.rb', '.java', '.cs',
  '.kt', '.scala', '.swift', '.cpp', '.c', '.h', '.hpp', '.php',
]);

/**
 * Classify a file as shader-dominant, markup-dominant, code-dominant, or mixed.
 *
 * The result is consumed by isFalsePositive(), which applies rules 11-13
 * (GLSL / CSS-in-JS / inline-markup line-proximity suppressions) only when the
 * context is not 'code-dominant'. This closes the v7.0.0 polyglot
 * false-negative, where a real credential sharing a line with an inline GLSL
 * keyword was suppressed.
 *
 * Decision order:
 *   1. Shader extension                          → 'shader-dominant'
 *   2. Markup extension                          → 'markup-dominant'
 *   3. Unknown extension                         → 'mixed' (all rules apply)
 *   4. Code extension, <5 sampled non-blank lines → 'mixed'
 *   5. Code extension, ≥50% of sampled lines match GLSL_KEYWORDS or
 *      INLINE_MARKUP                             → 'mixed'
 *   6. Code extension otherwise                  → 'code-dominant'
 *
 * Only the first 50 non-blank lines are sampled; each is trimmed before being
 * tested.
 *
 * @param {string} absPath - File path; `/` and `\` are both treated as separators.
 * @param {string[]} lines - File content split into lines.
 * @returns {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'}
 */
function classifyFileContext(absPath, lines) {
  // The extension is everything from the last dot of the final path segment,
  // so a compound name such as `a.test.ts` yields `.ts`.
  const normalized = absPath.toLowerCase();
  const lastSeparator = Math.max(normalized.lastIndexOf('/'), normalized.lastIndexOf('\\'));
  const fileName = normalized.slice(lastSeparator + 1);
  const lastDot = fileName.lastIndexOf('.');
  const extension = lastDot === -1 ? '' : fileName.slice(lastDot);

  if (SHADER_EXTENSIONS.has(extension)) return 'shader-dominant';
  if (MARKUP_EXTENSIONS.has(extension)) return 'markup-dominant';
  // Unknown extension → conservative default.
  if (!CODE_EXTENSIONS.has(extension)) return 'mixed';

  const maxSampled = 50;
  const minSampled = 5;
  let nonBlankSeen = 0;
  let shaderOrMarkupLines = 0;
  for (const rawLine of lines) {
    if (nonBlankSeen >= maxSampled) break;
    const text = rawLine.trim();
    if (text.length === 0) continue;
    nonBlankSeen += 1;
    if (GLSL_KEYWORDS.test(text) || INLINE_MARKUP.test(text)) {
      shaderOrMarkupLines += 1;
    }
  }

  // Too few non-blank lines to classify confidently → conservative default.
  if (nonBlankSeen < minSampled) return 'mixed';
  // At least half the sample looks like shader/markup despite the code
  // extension → conservative default; otherwise treat the file as code.
  return shaderOrMarkupLines * 2 >= nonBlankSeen ? 'mixed' : 'code-dominant';
}
// ---------------------------------------------------------------------------
// False-positive suppression helpers
// ---------------------------------------------------------------------------
@ -183,12 +246,20 @@ const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//;
/**
* Decide whether a candidate string should be suppressed (likely a false positive).
*
* v7.2.0 (B5): rules 11-13 (GLSL/CSS-in-JS/inline-markup line-proximity) are
* gated on `context !== 'code-dominant'`. In code-dominant files, an inline
* shader keyword next to a credential-shaped string is no longer a reason
* to suppress — that was the v7.0.0 polyglot false-negative (e.g. a `.ts`
* file with an embedded GLSL block hiding a real secret on the same line).
*
* @param {string} str - The extracted string literal value
* @param {string} line - The full source line it came from
* @param {string} absPath - Absolute file path
* @param {'shader-dominant'|'markup-dominant'|'code-dominant'|'mixed'} [context='mixed']
* File-level classification from classifyFileContext.
* @returns {boolean} - true if this string should be skipped
*/
function isFalsePositive(str, line, absPath) {
function isFalsePositive(str, line, absPath, context = 'mixed') {
// 1. URLs — entropy is misleading for long query strings / JWTs in URLs
if (str.startsWith('http://') || str.startsWith('https://')) return true;
@ -234,14 +305,19 @@ function isFalsePositive(str, line, absPath) {
if (str.startsWith(prefix)) return true;
}
// 11. GLSL/WGSL shader keywords on the line — inline shader source
if (GLSL_KEYWORDS.test(line)) return true;
// Rules 11-13 (v7.2.0 B5): line-proximity suppressions for shader/CSS/markup.
// Gated on context !== 'code-dominant' so that a credential adjacent to an
// inline GLSL keyword in a `.ts` file is no longer suppressed.
if (context !== 'code-dominant') {
// 11. GLSL/WGSL shader keywords on the line — inline shader source
if (GLSL_KEYWORDS.test(line)) return true;
// 12. CSS-in-JS (styled-components, emotion, vanilla-extract)
if (CSS_IN_JS_PATTERN.test(line)) return true;
// 12. CSS-in-JS (styled-components, emotion, vanilla-extract)
if (CSS_IN_JS_PATTERN.test(line)) return true;
// 13. Inline HTML/SVG markup — React/Vue components, email templates
if (INLINE_MARKUP.test(line)) return true;
// 13. Inline HTML/SVG markup — React/Vue components, email templates
if (INLINE_MARKUP.test(line)) return true;
}
// 14. ffmpeg filter-graph syntax — long structured strings, not encoded
if (FFMPEG_SYNTAX.test(line)) return true;
@ -337,6 +413,9 @@ function maxSeverity(a, b) {
function scanFileContent(content, absPath, relPath) {
const findings = [];
const lines = content.split('\n');
// v7.2.0 (B5): classify the file once per scan; rules 11-13 inside
// isFalsePositive are gated on this context.
const fileContext = classifyFileContext(absPath, lines);
// De-duplicate: track (line, evidence) pairs to avoid reporting the same
// string twice when it appears in both extractStringLiterals and assignment
@ -364,7 +443,7 @@ function scanFileContent(content, absPath, relPath) {
if (!str || str.length < 10) continue;
// False positive suppression
if (isFalsePositive(str, line, absPath)) continue;
if (isFalsePositive(str, line, absPath, fileContext)) continue;
const H = shannonEntropy(str);
let severity = classifyEntropy(H, str.length);