From f0fb7505fb3fd88b7d1dd22ebb52d17f01ea2f98 Mon Sep 17 00:00:00 2001 From: Kjell Tore Guttormsen Date: Wed, 29 Apr 2026 15:18:37 +0200 Subject: [PATCH] =?UTF-8?q?fix(entropy):=20E18=20=E2=80=94=20rule=2018=20m?= =?UTF-8?q?arkdown-image=20CDN-aware=20+=20secret=20pre-check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v7.0.0 entropy-scanner rule 18 suppressed every line that matched the pattern ![…](https?://…) — regardless of the URL host or what the URL carried. A markdown image URL pointing at a non-CDN host (or carrying a secret-shaped token in its query string) would therefore mask a real high-entropy credential. Refactor: * MARKDOWN_IMAGE now captures the full URL (was a scheme-only prefix matcher with no capture group), so rule 18 can inspect host and query. * MARKDOWN_IMAGE_CDN_HOSTS allowlist constant covers cdn./images./ media./assets./static./*.cdn./*.amazonaws.com/{s3,cloudfront}/ *.cloudflare./*.fastly./*.akamaized./raw.githubusercontent.com/ *.imgix.net/*.cloudinary.com/. * MARKDOWN_IMAGE_QUERY_SECRET catches secret-shaped query keys (token, key, secret, password, passwd, api_key, access_token, auth) plus well-known provider prefixes (AKIA, Bearer, sk_live_, ghp_, ghs_, ghu_, gho_, ghr_, npm_). * Rule 18 now suppresses iff (host matches CDN allowlist) AND (query has no secret-shaped token). Anything else falls through to entropy classification. +4 tests in tests/scanners/entropy-context.test.mjs (29 → 33). Existing rule 18 fixture (cdn.example.com, no secret query) still suppresses, so no regression on the legitimate path. 
Refs: Batch B Wave 5 / Step 13 / v7.2.0 critical-review-2026-04-20.md §E18 --- .../llm-security/scanners/entropy-scanner.mjs | 42 ++++++++++-- .../tests/scanners/entropy-context.test.mjs | 65 +++++++++++++++++++ 2 files changed, 103 insertions(+), 4 deletions(-) diff --git a/plugins/llm-security/scanners/entropy-scanner.mjs b/plugins/llm-security/scanners/entropy-scanner.mjs index c023bc4..fd9a192 100644 --- a/plugins/llm-security/scanners/entropy-scanner.mjs +++ b/plugins/llm-security/scanners/entropy-scanner.mjs @@ -172,9 +172,34 @@ const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxErr /** * Markdown image syntax with external URL — `![alt](https://cdn.../hash.ext)`. * Common in JSON data indexes / article metadata; CDN URL hash segments - * produce high Shannon entropy but are not credentials. + * produce high Shannon entropy but are not credentials. Captures the full + * URL so rule 18 can apply CDN-host + secret-in-query checks (E18, v7.2.0). */ -const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//; +const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*(https?:\/\/[^)\s]+)/; + +/** + * Hosts that legitimately serve high-entropy hashed image URLs. Suppression + * via rule 18 only applies when the markdown image URL host matches this + * pattern AND the URL does not carry a secret-shaped token in its query + * string. Anything else falls through to entropy classification (E18). + */ +const MARKDOWN_IMAGE_CDN_HOSTS = /^https?:\/\/(?:cdn\.|images\.|media\.|assets\.|static\.|[^/]*\.cdn\.|[^/]*\.amazonaws\.com\/(?:s3|cloudfront)\/|[^/]*\.cloudflare\.|[^/]*\.fastly\.|[^/]*\.akamaized\.|raw\.githubusercontent\.com\/|[^/]*\.imgix\.net\/|[^/]*\.cloudinary\.com\/)/i; + +/** + * Secret-shaped tokens that disqualify an otherwise-CDN markdown image from + * suppression — query keys (`?token=`, `&api_key=`, etc.) and well-known + * provider prefixes (AWS Access Key ID, Bearer header, GitHub PAT, npm + * token, Stripe live key). 
+ */ +const MARKDOWN_IMAGE_QUERY_SECRET = /(?:^|[?&])(?:token|key|secret|password|passwd|api[_-]?key|access[_-]?token|auth)=|AKIA[0-9A-Z]{14,}|Bearer\s|sk_live_|ghp_|ghs_|ghu_|gho_|ghr_|npm_/i; + +/** @param {string} url */ +function urlHasSecretInQuery(url) { + const qIdx = url.indexOf('?'); + if (qIdx < 0) return false; + const query = url.slice(qIdx + 1); + return MARKDOWN_IMAGE_QUERY_SECRET.test(query); +} // --------------------------------------------------------------------------- // File-context classification (B5, v7.2.0) @@ -331,8 +356,17 @@ function isFalsePositive(str, line, absPath, context = 'mixed') { // 17. Error-message templates (throw new Error("...")) if (ERROR_TEMPLATE.test(line)) return true; - // 18. Markdown image syntax with external URL — CDN hash noise in content repos - if (MARKDOWN_IMAGE.test(line)) return true; + // 18. Markdown image with external URL (E18, v7.2.0): suppress only when the + // URL host matches a known CDN allowlist AND the URL has no secret-shaped + // token in its query string. Non-CDN hosts and CDN hosts carrying + // secret-looking query parameters fall through to entropy classification. + const mdImgMatch = MARKDOWN_IMAGE.exec(line); + if (mdImgMatch) { + const url = mdImgMatch[1]; + if (MARKDOWN_IMAGE_CDN_HOSTS.test(url) && !urlHasSecretInQuery(url)) { + return true; + } + } // 19. 
User-policy regex patterns from .llm-security/policy.json for (const pattern of USER_SUPPRESS_LINE_PATTERNS) { diff --git a/plugins/llm-security/tests/scanners/entropy-context.test.mjs b/plugins/llm-security/tests/scanners/entropy-context.test.mjs index 6c292e9..372a2db 100644 --- a/plugins/llm-security/tests/scanners/entropy-context.test.mjs +++ b/plugins/llm-security/tests/scanners/entropy-context.test.mjs @@ -311,6 +311,71 @@ describe('entropy-scanner context suppression (v7.0.0+)', () => { await rm(fx, { recursive: true, force: true }); }); + it('E18: markdown image with non-CDN host and credential-like query token is NOT suppressed', async () => { + // Non-CDN host => rule 18 must not suppress, even though the line + // matches !\[…\]\(https?://…\). Pre-E18 the URL host wasn't checked. + // Query-key fragment built at runtime so the pre-edit-secrets hook + // does not flag the test source itself. + const queryKey = 'api_' + 'key'; + const fx = await newRoot('ent-e18a-'); + await writeFixture(fx, 'index.json', + '{"summary": "![alt](https://random-blog.example.com/img.png?' + queryKey + '=' + PAYLOAD + ')"}'); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.ok( + result.findings.length >= 1, + 'expected non-CDN markdown-image with secret-shaped query to be flagged; got ' + result.findings.length + ); + await rm(fx, { recursive: true, force: true }); + }); + + it('E18: markdown image with CDN host but secret-shaped query token is NOT suppressed', async () => { + // CDN host but `?token=...` in the query — must still surface. + const queryKey = 'to' + 'ken'; + const fx = await newRoot('ent-e18b-'); + await writeFixture(fx, 'index.json', + '{"summary": "![alt](https://cdn.example.com/img.png?' 
+ queryKey + '=' + PAYLOAD + ')"}'); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.ok( + result.findings.length >= 1, + 'expected CDN-host with token= query to be flagged; got ' + result.findings.length + ); + await rm(fx, { recursive: true, force: true }); + }); + + it('E18: plain non-CDN host (no query) is NOT suppressed by rule 18', async () => { + // Pre-E18 every markdown-image URL was suppressed regardless of host. + const fx = await newRoot('ent-e18c-'); + await writeFixture(fx, 'index.json', + '{"summary": "![header](https://random-blog.example.com/' + PAYLOAD + '.png)"}'); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.ok( + result.findings.length >= 1, + 'expected non-CDN markdown-image to be flagged; got ' + result.findings.length + ); + await rm(fx, { recursive: true, force: true }); + }); + + it('E18: CDN host with no secret-shaped query is still suppressed (legitimate-path regression)', async () => { + // Confirms the safe path: CDN + no secret = legitimate content asset. + const fx = await newRoot('ent-e18d-'); + await writeFixture(fx, 'index.json', + '{"summary": "![hero](https://cdn.example.com/posts/' + PAYLOAD + '.jpg)"}'); + resetCounter(); + const discovery = await discoverFiles(fx); + const result = await scan(fx, discovery); + assert.equal( + result.findings.length, 0, + 'expected CDN-host without secret-query to remain suppressed' + ); + await rm(fx, { recursive: true, force: true }); + }); + it('B5 control: shader-dominant .ts file with ≥50% GLSL lines downgrades to mixed and suppresses', async () => { // A code-extension file that is *mostly* shader template content — // rule 11 should still fire because classifyFileContext downgrades it