/**
 * Check if a string contains hidden-Unicode characters that are commonly
 * used for steganography in prompts and tool output.
 *
 * Covered ranges:
 *   - U+E0001-E007F   Unicode Tag block (DeepMind traps, category 1)
 *   - U+F0000-FFFFD   Supplementary Private Use Area-A (E1, v7.2.0)
 *   - U+100000-10FFFD Supplementary Private Use Area-B (E1, v7.2.0)
 *
 * Before v7.2.0 only the Tag block was checked, so payloads encoded in the
 * supplementary Private Use Areas were not flagged by this helper.
 *
 * Presence of any of these characters is suspicious regardless of decoded
 * content — they are invisible in most terminals and survive normalization.
 * The function name `containsUnicodeTags` is preserved for back-compat
 * (existing call sites in injection-patterns.mjs and elsewhere);
 * semantically it is now "containsHiddenUnicode".
 *
 * Tag-block characters decode to ASCII via `decodeUnicodeTags`. PUA
 * characters do NOT — they have no standard ASCII mapping and remain
 * detection-only (per the E1 change notes, their presence alone is what
 * gets reported).
 *
 * @param {string} s - Text to inspect. `null`/`undefined` are treated as
 *   "nothing hidden" and yield `false` instead of throwing.
 * @returns {boolean} `true` as soon as one code point in a covered range
 *   is found, otherwise `false`.
 */
export function containsUnicodeTags(s) {
  // A detector should not crash on an absent field; nothing to inspect
  // means nothing hidden.
  if (s == null) return false;

  // `for...of` walks the string by code point, so characters above the BMP
  // (every covered range is) arrive whole rather than as surrogate halves.
  for (const ch of s) {
    const cp = ch.codePointAt(0);
    if (cp >= 0xE0001 && cp <= 0xE007F) return true;   // Tag block
    if (cp >= 0xF0000 && cp <= 0xFFFFD) return true;   // PUA-A (E1)
    if (cp >= 0x100000 && cp <= 0x10FFFD) return true; // PUA-B (E1)
  }
  return false;
}
// Tests for extended hidden-Unicode detection (E1, v7.2.0):
// `containsUnicodeTags` must flag the Tag block plus the supplementary
// Private Use Areas A and B, and `decodeUnicodeTags` must leave PUA code
// points untouched while still decoding Tag-block characters.

import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import {
  containsUnicodeTags,
  decodeUnicodeTags,
} from '../../scanners/lib/string-utils.mjs';

describe('containsUnicodeTags — Tag block (regression guard)', () => {
  it('returns true for U+E0001 (start of Tag block)', () => {
    assert.equal(containsUnicodeTags(`hello${String.fromCodePoint(0xE0001)}`), true);
  });

  it('returns true for U+E0069 ("i" tag)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xE0069)), true);
  });

  it('returns true for U+E007F (end of Tag block)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xE007F)), true);
  });

  it('returns false for plain ASCII', () => {
    assert.equal(containsUnicodeTags('plain text'), false);
  });

  it('returns false for empty string', () => {
    assert.equal(containsUnicodeTags(''), false);
  });
});

describe('containsUnicodeTags — PUA-A range (E1)', () => {
  it('returns true for U+F0000 (start of PUA-A)', () => {
    assert.equal(containsUnicodeTags(`hello${String.fromCodePoint(0xF0000)}`), true);
  });

  it('returns true for U+F0001 (just inside PUA-A)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xF0001)), true);
  });

  it('returns true for U+FFFFD (end of PUA-A)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xFFFFD)), true);
  });

  it('returns true for PUA-A char buried in ASCII', () => {
    const payload = `normal text${String.fromCodePoint(0xF0042)}more text`;
    assert.equal(containsUnicodeTags(payload), true);
  });
});

describe('containsUnicodeTags — PUA-B range (E1)', () => {
  it('returns true for U+100000 (start of PUA-B)', () => {
    assert.equal(containsUnicodeTags(`hello${String.fromCodePoint(0x100000)}`), true);
  });

  it('returns true for U+100001 (just inside PUA-B)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0x100001)), true);
  });

  it('returns true for U+10FFFD (end of PUA-B, just below noncharacter)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0x10FFFD)), true);
  });
});

describe('containsUnicodeTags — boundary cases (E1)', () => {
  it('returns false for U+EFFFF (just below PUA-A start)', () => {
    // U+E0080 to U+EFFFF — a gap between Tag block and PUA-A
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xEFFFF)), false);
  });

  it('returns false for U+E0080 (just past Tag block end)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xE0080)), false);
  });

  it('returns false for U+E0000 (just below Tag block start)', () => {
    assert.equal(containsUnicodeTags(String.fromCodePoint(0xE0000)), false);
  });

  it('returns false for U+10FFFE (noncharacter — outside PUA-B)', () => {
    // U+10FFFE and U+10FFFF are noncharacters, not PUA. Out of range.
    assert.equal(containsUnicodeTags(String.fromCodePoint(0x10FFFE)), false);
  });

  it('returns false for emoji and other plane chars', () => {
    assert.equal(containsUnicodeTags('🚀'), false); // U+1F680
    assert.equal(containsUnicodeTags('日本'), false); // CJK
    assert.equal(containsUnicodeTags('café'), false); // accented Latin
  });
});

describe('decodeUnicodeTags — PUA passthrough (E1)', () => {
  it('leaves PUA-A characters unchanged (no ASCII mapping)', () => {
    const puaChar = String.fromCodePoint(0xF0001);
    const input = `before${puaChar}after`;
    const result = decodeUnicodeTags(input);
    assert.equal(result, input);
  });

  it('leaves PUA-B characters unchanged', () => {
    const puaBChar = String.fromCodePoint(0x100042);
    const input = `before${puaBChar}after`;
    const result = decodeUnicodeTags(input);
    assert.equal(result, input);
  });

  it('still decodes Tag block (regression guard)', () => {
    // U+E0069 U+E0067 U+E006E → "ign"
    const tags = String.fromCodePoint(0xE0069, 0xE0067, 0xE006E);
    const result = decodeUnicodeTags(tags);
    assert.equal(result, 'ign');
  });

  it('handles mixed Tag + PUA — decodes Tag, passes PUA through', () => {
    // "i" tag + PUA-A char + "g" tag + "n" tag
    const tagI = String.fromCodePoint(0xE0069);
    const puaA = String.fromCodePoint(0xF0042);
    const tagG = String.fromCodePoint(0xE0067);
    const tagN = String.fromCodePoint(0xE006E);
    const input = `${tagI}${puaA}${tagG}${tagN}`;
    const result = decodeUnicodeTags(input);
    const rendered = [...result].map((c) => `U+${c.codePointAt(0).toString(16)}`).join(' ');

    assert.ok(result.includes('i'), `expected 'i' in result, got: ${rendered}`);
    assert.ok(result.includes(puaA), 'PUA char must remain undecoded');
    // Where the decoded run is flushed relative to the PUA char is an
    // implementation detail (expected shape is `i${puaA}gn` — not pinned
    // here). What must hold is that the tags decode to "ign" in order, so
    // compare the output with the PUA char taken out.
    assert.equal(
      result.replaceAll(puaA, ''),
      'ign',
      `tags must decode to "ign" in order, got: ${rendered}`,
    );
  });
});