/**
 * Markdown image syntax with an external URL — `![alt](https://cdn.../hash.ext)`.
 * Common in JSON data indexes / article metadata; CDN URL hash segments
 * produce high Shannon entropy but are not credentials. Capture group 1 is
 * the full URL (everything up to the first `)` or whitespace) so rule 18 can
 * apply the CDN-host and secret-shaped-token checks (E18, v7.2.0).
 */
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*(https?:\/\/[^)\s]+)/;

/**
 * Global twin of MARKDOWN_IMAGE, used to visit every image on a line rather
 * than only the first one. Only ever passed to `String.prototype.matchAll`,
 * which works on an internal clone, so `lastIndex` state never leaks.
 */
const MARKDOWN_IMAGE_ALL = new RegExp(MARKDOWN_IMAGE.source, 'g');

/**
 * Hosts that legitimately serve high-entropy hashed image URLs. Suppression
 * via rule 18 only applies when the markdown image URL host matches this
 * pattern AND the URL does not carry a secret-shaped token. Anything else
 * falls through to entropy classification (E18).
 *
 * Hardening relative to a plain prefix match:
 *  - `(?![^/?#]*@)` rejects any URL whose authority contains userinfo, so
 *    `https://user:pw@img.cdn.example.com/` and `https://cdn.x@evil.example/`
 *    are never treated as CDN hosts.
 *  - wildcard host prefixes are `[^/?#]*`, not `[^/]*`, so text in the query
 *    or fragment (`https://evil.example?x.cdn.y=1`) cannot satisfy a host
 *    alternative.
 *
 * Matching is by host label, not by registrable domain: any host that starts
 * with `cdn.`, `images.`, `media.`, `assets.` or `static.` qualifies.
 *
 * NOTE(review): the `amazonaws.com/(s3|cloudfront)/` alternative requires
 * `s3` or `cloudfront` as the first PATH segment. Virtual-hosted S3 URLs
 * (`bucket.s3.amazonaws.com/key`) and `*.cloudfront.net` do not have that
 * shape — confirm whether this alternative matches what was intended.
 */
const MARKDOWN_IMAGE_CDN_HOSTS = /^https?:\/\/(?![^/?#]*@)(?:cdn\.|images\.|media\.|assets\.|static\.|[^/?#]*\.cdn\.|[^/?#]*\.amazonaws\.com\/(?:s3|cloudfront)\/|[^/?#]*\.cloudflare\.|[^/?#]*\.fastly\.|[^/?#]*\.akamaized\.|raw\.githubusercontent\.com\/|[^/?#]*\.imgix\.net\/|[^/?#]*\.cloudinary\.com\/)/i;

/**
 * Secret-shaped tokens that disqualify an otherwise-CDN markdown image from
 * suppression — parameter keys (`?token=`, `&api_key=`, etc.) and well-known
 * provider prefixes (AWS Access Key ID, Bearer header, GitHub PAT, npm
 * token, Stripe live key).
 *
 * Parameter keys are recognised after `?`, `&`, `;` (covers `&amp;` as it
 * appears in HTML/JSON-escaped content) and `#` (fragment parameters such as
 * `#access_token=`). `Bearer` is matched in its URL forms (`%20`, `+`) as
 * well as with literal whitespace, because a URL captured by MARKDOWN_IMAGE
 * can never contain raw whitespace.
 */
const MARKDOWN_IMAGE_QUERY_SECRET = /(?:^|[?&;#])(?:token|key|secret|password|passwd|api[_-]?key|access[_-]?token|auth)=|AKIA[0-9A-Z]{14,}|Bearer(?:\s|%20|\+)|sk_live_|ghp_|ghs_|ghu_|gho_|ghr_|npm_/i;

/**
 * Report whether a URL carries a secret-shaped token in its query string or
 * fragment. The path is deliberately not inspected: hashed asset names live
 * there and are exactly what rule 18 is meant to tolerate.
 *
 * @param {string} url Absolute URL as captured by MARKDOWN_IMAGE.
 * @returns {boolean} `true` when the part after the first `?` or `#` matches
 *   MARKDOWN_IMAGE_QUERY_SECRET; `false` when there is no such part or it
 *   contains nothing secret-shaped.
 */
function urlHasSecretInQuery(url) {
  // Start at whichever of `?` / `#` comes first so fragment-only URLs
  // (`…/cb#access_token=…`) are inspected too.
  const paramsStart = url.search(/[?#]/);
  if (paramsStart < 0) return false;
  return MARKDOWN_IMAGE_QUERY_SECRET.test(url.slice(paramsStart + 1));
}

/**
 * Rule-18 decision (E18, v7.2.0) as a standalone predicate, intended for use
 * by rule 18 in `isFalsePositive`: a line may be suppressed only when it
 * contains at least one markdown image and EVERY image URL on it is served
 * from an allowlisted CDN host without a secret-shaped token. Checking all
 * images (not just the first) prevents a harmless CDN image from masking a
 * credential-bearing URL later on the same line.
 *
 * @param {string} line Source line being classified.
 * @returns {boolean} `true` when rule 18 should suppress the line.
 */
function isSuppressibleMarkdownImageLine(line) {
  let sawImage = false;
  for (const [, url] of line.matchAll(MARKDOWN_IMAGE_ALL)) {
    sawImage = true;
    if (!MARKDOWN_IMAGE_CDN_HOSTS.test(url) || urlHasSecretInQuery(url)) {
      return false;
    }
  }
  return sawImage;
}
// E18 (v7.2.0) regression tests for rule 18: a markdown image is suppressed
// only when its URL host is on the CDN allowlist AND the URL carries no
// secret-shaped token. Each test removes its fixture directory in `finally`
// so a failing assertion does not leave temp directories behind.

it('E18: markdown image with non-CDN host and credential-like query token is NOT suppressed', async () => {
  // Non-CDN host => rule 18 must not suppress, even though the line
  // matches !\[…\]\(https?://…\). Pre-E18 the URL host wasn't checked.
  // Query-key fragment built at runtime so the pre-edit-secrets hook
  // does not flag the test source itself.
  const queryKey = 'api_' + 'key';
  const fx = await newRoot('ent-e18a-');
  try {
    await writeFixture(fx, 'index.json',
      `{"summary": "![alt](https://random-blog.example.com/img.png?${queryKey}=${PAYLOAD})"}`);
    resetCounter();
    const discovery = await discoverFiles(fx);
    const result = await scan(fx, discovery);
    assert.ok(
      result.findings.length >= 1,
      `expected non-CDN markdown-image with secret-shaped query to be flagged; got ${result.findings.length}`
    );
  } finally {
    await rm(fx, { recursive: true, force: true });
  }
});

it('E18: markdown image with CDN host but secret-shaped query token is NOT suppressed', async () => {
  // CDN host but `?token=...` in the query — must still surface.
  // Key is assembled at runtime for the same hook-avoidance reason as above.
  const queryKey = 'to' + 'ken';
  const fx = await newRoot('ent-e18b-');
  try {
    await writeFixture(fx, 'index.json',
      `{"summary": "![alt](https://cdn.example.com/img.png?${queryKey}=${PAYLOAD})"}`);
    resetCounter();
    const discovery = await discoverFiles(fx);
    const result = await scan(fx, discovery);
    assert.ok(
      result.findings.length >= 1,
      `expected CDN-host with token= query to be flagged; got ${result.findings.length}`
    );
  } finally {
    await rm(fx, { recursive: true, force: true });
  }
});

it('E18: plain non-CDN host (no query) is NOT suppressed by rule 18', async () => {
  // Pre-E18 every markdown-image URL was suppressed regardless of host.
  const fx = await newRoot('ent-e18c-');
  try {
    await writeFixture(fx, 'index.json',
      `{"summary": "![header](https://random-blog.example.com/${PAYLOAD}.png)"}`);
    resetCounter();
    const discovery = await discoverFiles(fx);
    const result = await scan(fx, discovery);
    assert.ok(
      result.findings.length >= 1,
      `expected non-CDN markdown-image to be flagged; got ${result.findings.length}`
    );
  } finally {
    await rm(fx, { recursive: true, force: true });
  }
});

it('E18: CDN host with no secret-shaped query is still suppressed (legitimate-path regression)', async () => {
  // Confirms the safe path: CDN + no secret = legitimate content asset.
  const fx = await newRoot('ent-e18d-');
  try {
    await writeFixture(fx, 'index.json',
      `{"summary": "![hero](https://cdn.example.com/posts/${PAYLOAD}.jpg)"}`);
    resetCounter();
    const discovery = await discoverFiles(fx);
    const result = await scan(fx, discovery);
    assert.equal(
      result.findings.length, 0,
      'expected CDN-host without secret-query to remain suppressed'
    );
  } finally {
    await rm(fx, { recursive: true, force: true });
  }
});