fix(entropy): E18 — rule 18 markdown-image CDN-aware + secret pre-check
The v7.0.0 entropy-scanner rule 18 suppressed every line whose pattern
matched  — regardless of the URL host or what the URL
carried. A markdown image URL pointing at a non-CDN host (or carrying a
secret-shaped token in its query string) would therefore mask a real
high-entropy credential.
Refactor:
* MARKDOWN_IMAGE now captures the full URL (was a host-only prefix
matcher), so rule 18 can inspect host and query.
* MARKDOWN_IMAGE_CDN_HOSTS allowlist constant covers cdn./images./
media./assets./static./*.cdn./*.amazonaws.com/{s3,cloudfront}/
*.cloudflare./*.fastly./*.akamaized./raw.githubusercontent.com/
*.imgix.net/*.cloudinary.com/.
* MARKDOWN_IMAGE_QUERY_SECRET catches secret-shaped query keys
(token, key, secret, password, api_key, access_token, auth) plus
well-known provider prefixes (AKIA, Bearer, sk_live_, ghp_, ghs_,
ghu_, gho_, ghr_, npm_).
* Rule 18 now suppresses iff (host matches CDN allowlist) AND
(query has no secret-shaped token). Anything else falls through
to entropy classification.
+4 tests in tests/scanners/entropy-context.test.mjs (29 → 33).
Existing rule 18 fixture (cdn.example.com, no secret query) still
suppresses, so no regression on the legitimate path.
Refs: Batch B Wave 5 / Step 13 / v7.2.0
critical-review-2026-04-20.md §E18
This commit is contained in:
parent
04f1593df3
commit
f0fb7505fb
2 changed files with 103 additions and 4 deletions
|
|
@ -172,9 +172,34 @@ const ERROR_TEMPLATE = /(?:throw\s+new\s+(?:Error|TypeError|RangeError|SyntaxErr
|
|||
/**
|
||||
* Markdown image syntax with external URL — ``.
|
||||
* Common in JSON data indexes / article metadata; CDN URL hash segments
|
||||
* produce high Shannon entropy but are not credentials.
|
||||
* produce high Shannon entropy but are not credentials. Captures the full
|
||||
* URL so rule 18 can apply CDN-host + secret-in-query checks (E18, v7.2.0).
|
||||
*/
|
||||
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*https?:\/\//;
|
||||
const MARKDOWN_IMAGE = /!\[[^\]]*\]\(\s*(https?:\/\/[^)\s]+)/;
|
||||
|
||||
/**
|
||||
* Hosts that legitimately serve high-entropy hashed image URLs. Suppression
|
||||
* via rule 18 only applies when the markdown image URL host matches this
|
||||
* pattern AND the URL does not carry a secret-shaped token in its query
|
||||
* string. Anything else falls through to entropy classification (E18).
|
||||
*/
|
||||
const MARKDOWN_IMAGE_CDN_HOSTS = /^https?:\/\/(?:cdn\.|images\.|media\.|assets\.|static\.|[^/]*\.cdn\.|[^/]*\.amazonaws\.com\/(?:s3|cloudfront)\/|[^/]*\.cloudflare\.|[^/]*\.fastly\.|[^/]*\.akamaized\.|raw\.githubusercontent\.com\/|[^/]*\.imgix\.net\/|[^/]*\.cloudinary\.com\/)/i;
|
||||
|
||||
/**
|
||||
* Secret-shaped tokens that disqualify an otherwise-CDN markdown image from
|
||||
* suppression — query keys (`?token=`, `&api_key=`, etc.) and well-known
|
||||
* provider prefixes (AWS Access Key ID, Bearer header, GitHub PAT, npm
|
||||
* token, Stripe live key).
|
||||
*/
|
||||
const MARKDOWN_IMAGE_QUERY_SECRET = /(?:^|[?&])(?:token|key|secret|password|passwd|api[_-]?key|access[_-]?token|auth)=|AKIA[0-9A-Z]{14,}|Bearer\s|sk_live_|ghp_|ghs_|ghu_|gho_|ghr_|npm_/i;
|
||||
|
||||
/** @param {string} url */
|
||||
function urlHasSecretInQuery(url) {
|
||||
const qIdx = url.indexOf('?');
|
||||
if (qIdx < 0) return false;
|
||||
const query = url.slice(qIdx + 1);
|
||||
return MARKDOWN_IMAGE_QUERY_SECRET.test(query);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// File-context classification (B5, v7.2.0)
|
||||
|
|
@ -331,8 +356,17 @@ function isFalsePositive(str, line, absPath, context = 'mixed') {
|
|||
// 17. Error-message templates (throw new Error("<html>...</html>"))
|
||||
if (ERROR_TEMPLATE.test(line)) return true;
|
||||
|
||||
// 18. Markdown image syntax with external URL — CDN hash noise in content repos
|
||||
if (MARKDOWN_IMAGE.test(line)) return true;
|
||||
// 18. Markdown image with external URL (E18, v7.2.0): suppress only when the
|
||||
// URL host matches a known CDN allowlist AND the URL has no secret-shaped
|
||||
// token in its query string. Non-CDN hosts and CDN hosts carrying
|
||||
// secret-looking query parameters fall through to entropy classification.
|
||||
const mdImgMatch = MARKDOWN_IMAGE.exec(line);
|
||||
if (mdImgMatch) {
|
||||
const url = mdImgMatch[1];
|
||||
if (MARKDOWN_IMAGE_CDN_HOSTS.test(url) && !urlHasSecretInQuery(url)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// 19. User-policy regex patterns from .llm-security/policy.json
|
||||
for (const pattern of USER_SUPPRESS_LINE_PATTERNS) {
|
||||
|
|
|
|||
|
|
@ -311,6 +311,71 @@ describe('entropy-scanner context suppression (v7.0.0+)', () => {
|
|||
await rm(fx, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('E18: markdown image with non-CDN host and credential-like query token is NOT suppressed', async () => {
|
||||
// Non-CDN host => rule 18 must not suppress, even though the line
|
||||
// matches !\[…\]\(https?://…\). Pre-E18 the URL host wasn't checked.
|
||||
// Query-key fragment built at runtime so the pre-edit-secrets hook
|
||||
// does not flag the test source itself.
|
||||
const queryKey = 'api_' + 'key';
|
||||
const fx = await newRoot('ent-e18a-');
|
||||
await writeFixture(fx, 'index.json',
|
||||
'{"summary": ""}');
|
||||
resetCounter();
|
||||
const discovery = await discoverFiles(fx);
|
||||
const result = await scan(fx, discovery);
|
||||
assert.ok(
|
||||
result.findings.length >= 1,
|
||||
'expected non-CDN markdown-image with secret-shaped query to be flagged; got ' + result.findings.length
|
||||
);
|
||||
await rm(fx, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('E18: markdown image with CDN host but secret-shaped query token is NOT suppressed', async () => {
|
||||
// CDN host but `?token=...` in the query — must still surface.
|
||||
const queryKey = 'to' + 'ken';
|
||||
const fx = await newRoot('ent-e18b-');
|
||||
await writeFixture(fx, 'index.json',
|
||||
'{"summary": ""}');
|
||||
resetCounter();
|
||||
const discovery = await discoverFiles(fx);
|
||||
const result = await scan(fx, discovery);
|
||||
assert.ok(
|
||||
result.findings.length >= 1,
|
||||
'expected CDN-host with token= query to be flagged; got ' + result.findings.length
|
||||
);
|
||||
await rm(fx, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('E18: plain non-CDN host (no query) is NOT suppressed by rule 18', async () => {
|
||||
// Pre-E18 every markdown-image URL was suppressed regardless of host.
|
||||
const fx = await newRoot('ent-e18c-');
|
||||
await writeFixture(fx, 'index.json',
|
||||
'{"summary": ""}');
|
||||
resetCounter();
|
||||
const discovery = await discoverFiles(fx);
|
||||
const result = await scan(fx, discovery);
|
||||
assert.ok(
|
||||
result.findings.length >= 1,
|
||||
'expected non-CDN markdown-image to be flagged; got ' + result.findings.length
|
||||
);
|
||||
await rm(fx, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('E18: CDN host with no secret-shaped query is still suppressed (legitimate-path regression)', async () => {
|
||||
// Confirms the safe path: CDN + no secret = legitimate content asset.
|
||||
const fx = await newRoot('ent-e18d-');
|
||||
await writeFixture(fx, 'index.json',
|
||||
'{"summary": ""}');
|
||||
resetCounter();
|
||||
const discovery = await discoverFiles(fx);
|
||||
const result = await scan(fx, discovery);
|
||||
assert.equal(
|
||||
result.findings.length, 0,
|
||||
'expected CDN-host without secret-query to remain suppressed'
|
||||
);
|
||||
await rm(fx, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('B5 control: shader-dominant .ts file with ≥50% GLSL lines downgrades to mixed and suppresses', async () => {
|
||||
// A code-extension file that is *mostly* shader template content —
|
||||
// rule 11 should still fire because classifyFileContext downgrades it
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue