// injection-patterns.mjs — Shared prompt injection detection patterns
// Used by pre-prompt-inject-scan.mjs (UserPromptSubmit) and post-mcp-verify.mjs (PostToolUse).
//
// Patterns derived from skill-scanner-agent Category 1 (LLM01 Prompt Injection)
// and Category 5 (Hidden Instructions) in knowledge/skill-threat-patterns.md.
//
// Zero external dependencies beyond ./string-utils.mjs.

import { normalizeForScan, containsUnicodeTags, decodeUnicodeTags } from './string-utils.mjs';

// ---------------------------------------------------------------------------
// Critical patterns — direct injection attempts (should be blocked)
// ---------------------------------------------------------------------------

/**
 * Direct injection attempts. scanForInjection reports matches in its
 * `critical` tier.
 *
 * Each entry is `{ pattern: RegExp, label: string }`. None of the patterns
 * use the `g` or `y` flag, so `pattern.test()` keeps no state between calls.
 */
export const CRITICAL_PATTERNS = [
  // Direct override instructions
  { pattern: /ignore\s+(?:all\s+)?previous/i, label: 'override: ignore previous instructions' },
  { pattern: /forget\s+your\s+(?:instructions|rules|guidelines|directives)/i, label: 'override: forget instructions' },
  { pattern: /disregard\s+your/i, label: 'override: disregard instructions' },
  { pattern: /override\s+your/i, label: 'override: override instructions' },
  { pattern: /suspend\s+all\s+constraints/i, label: 'override: suspend constraints' },
  { pattern: /all\s+prior\s+constraints/i, label: 'override: prior constraints' },

  // Spoofed system headers
  { pattern: /^#+\s*SYSTEM\s*:/m, label: 'spoofed header: # SYSTEM:' },
  { pattern: /^\[INST\]/m, label: 'spoofed header: [INST]' },
  { pattern: /<\|system\|>/, label: 'spoofed header: <|system|>' },
  { pattern: /<system>/, label: 'spoofed tag: <system>' },
  { pattern: /<\/system>/, label: 'spoofed tag: </system>' },
  { pattern: /^###\s*Instructions\s*:/m, label: 'spoofed header: ### Instructions:' },

  // Identity redefinition
  { pattern: /you\s+are\s+no\s+longer/i, label: 'identity: you are no longer' },
  // The lookahead skips whitespace itself (`\s*`). Without that, the preceding
  // `\s+` could backtrack by one character, leaving the lookahead positioned on
  // a space, so "you are now  ready" (two spaces, or a newline plus indent)
  // slipped past the benign-word exclusions and was reported as critical.
  { pattern: /you\s+are\s+now\s+(?!\s*(?:able|going|ready|done|running|using|looking|in\s|at\s|on\s))/i, label: 'identity: you are now [name]' },
  { pattern: /your\s+new\s+role\s+is/i, label: 'identity: your new role' },
  { pattern: /your\s+new\s+primary\s+directive/i, label: 'identity: new primary directive' },
  { pattern: /pretend\s+you\s+are/i, label: 'identity: pretend you are' },

  // Configuration manipulation
  { pattern: /ignore\s+your\s+CLAUDE\.md/i, label: 'config: ignore CLAUDE.md' },
  { pattern: /override\s+your\s+system\s+prompt/i, label: 'config: override system prompt' },
  { pattern: /bypass\s+your\s+safety/i, label: 'config: bypass safety' },
  { pattern: /disable\s+output\s+filtering/i, label: 'config: disable output filtering' },
];

// ---------------------------------------------------------------------------
// Medium patterns — obfuscation and indirect manipulation (advisory)
// Lower confidence than High; flag for manual review.
// ---------------------------------------------------------------------------

/**
 * Obfuscation and indirect-manipulation patterns. scanForInjection reports
 * matches in its `medium` tier.
 *
 * Each entry is `{ pattern: RegExp, label: string }`.
 */
export const MEDIUM_PATTERNS = [
  // Leetspeak variants of critical keywords
  // The first character class now includes a plain "i" so partially obfuscated
  // text such as "ign0re previous" is caught. The leading negative lookahead
  // keeps the fully plain phrase out of this tier, as before; that phrase is
  // covered by the critical "ignore previous" pattern.
  { pattern: /(?!ignore\s+(?:all\s+)?previous)[i1!|l][gq9]n[o0]r[e3]\s+(?:all\s+)?pr[e3]v[i1!|l][o0]us/i, label: 'leetspeak: ignore previous (obfuscated)' },
  { pattern: /f[o0]rg[e3]t\s+y[o0]ur\s+[i1!|l]nstruct[i1!|l][o0]ns/i, label: 'leetspeak: forget instructions (obfuscated)' },
  { pattern: /d[i1!|l]sr[e3]g[a4@]rd\s+y[o0]ur/i, label: 'leetspeak: disregard your (obfuscated)' },
  { pattern: /[o0]v[e3]rr[i1!|l]d[e3]\s+y[o0]ur/i, label: 'leetspeak: override your (obfuscated)' },

  // Homoglyph detection — Cyrillic chars in Latin context
  { pattern: /[a-zA-Z][\u0430\u0435\u043E\u0440\u0441\u0456\u0443]|[\u0430\u0435\u043E\u0440\u0441\u0456\u0443][a-zA-Z]/, label: 'homoglyph: Cyrillic-Latin mixing in adjacent characters' },

  // Zero-width characters inside words (keyword splitting evasion)
  { pattern: /\w[\u200B\u200C\u200D\uFEFF]\w/, label: 'unicode: zero-width character inside word (keyword splitting)' },

  // Indirect AI-directed instructions
  { pattern: /(?:note|message|instruction)\s+(?:to|for)\s+(?:the\s+)?(?:AI|assistant|model|LLM|Claude)\b/i, label: 'indirect: instruction addressed to AI/assistant' },
  { pattern: /(?:dear|attention)\s+(?:AI|assistant|model|LLM|Claude)\b/i, label: 'indirect: direct address to AI/assistant' },
  { pattern: /when\s+(?:you|the\s+AI|the\s+assistant|Claude)\s+(?:read|see|encounter|process)\s+this/i, label: 'indirect: trigger-based instruction for AI' },

  // Multi-language injection variants
  { pattern: /ignor(?:ez?|er?)\s+(?:les?\s+)?instructions?\s+pr[e\u00e9]c[e\u00e9]dentes?/i, label: 'multi-lang: French "ignore previous instructions"' },
  { pattern: /ignor(?:ar?|e)\s+(?:las?\s+)?instrucciones?\s+anteriores?/i, label: 'multi-lang: Spanish "ignore previous instructions"' },
  { pattern: /ignorier(?:e|en)?\s+(?:alle\s+)?vorherigen?\s+(?:Anweisungen|Instruktionen)/i, label: 'multi-lang: German "ignore previous instructions"' },

  // Markdown link-reference comment injection
  { pattern: /\[\/\/\]:\s*#\s*\(.*(?:ignore|override|system|instruction|execute)/i, label: 'markdown: suspicious instruction in link-reference comment' },

  // Data URI with executable content types
  { pattern: /data:(?:text\/html|application\/javascript|text\/javascript)[;,]/i, label: 'data-uri: executable content type' },

  // --- Content Injection: Syntactic Masking (AI Agent Traps) ---
  { pattern: /\[[^\]]*(?:system|ignore|override|exfiltrate|execute)[^\]]*\]\([^)]+\)/i, label: 'markdown: injection payload in link anchor text' },

  // --- Sub-agent spawning traps (DeepMind category 4, v5.0 S4) ---
  { pattern: /(?:create|spawn|launch|start|run)\s+(?:an?\s+)?(?:new\s+)?(?:sub-?agent|agent|task|worker)\s+(?:that|to|which|with)\s+(?:.*?\s+)?(?:execute|run|delete|remove|send|post|exfiltrate|access|reads?\s+(?:.*?\s+)?(?:secret|credential|key|token|\.env|\.ssh))/i, label: 'sub-agent: spawn instruction with dangerous capability keywords' },
  { pattern: /(?:delegate|dispatch)\s+(?:to\s+)?(?:an?\s+)?(?:new\s+)?(?:agent|sub-?agent|task)\s+.*?(?:bypass|override|ignore|without\s+(?:review|confirmation|approval))/i, label: 'sub-agent: delegation with safety bypass instruction' },

  // --- Natural Language Indirection (Preamble, CaMeL — v5.0 S4) ---
  { pattern: /fetch\s+(?:this|the|that)\s+(?:URL|link|endpoint)\s+and\s+(?:execute|run|eval)/i, label: 'nl-indirection: fetch URL and execute' },
  { pattern: /send\s+(?:this|the|that|all)\s+(?:data|content|output|result|information|file)\s+to\s+(?:https?:\/\/|ftp:\/\/|\S+@\S+)/i, label: 'nl-indirection: send data to external destination' },
  { pattern: /read\s+(?:the\s+)?(?:contents?\s+of\s+)?(?:~\/)?\.(?:ssh|aws|gnupg|env)\b/i, label: 'nl-indirection: read sensitive dotfile path' },
  { pattern: /(?:extract|copy|dump|exfiltrate)\s+(?:all\s+)?(?:the\s+)?(?:credentials?|secrets?|tokens?|keys?|passwords?)\s+(?:and|then)\s+(?:send|post|upload|transmit)/i, label: 'nl-indirection: extract credentials and exfiltrate' },
  { pattern: /(?:download|fetch|retrieve)\s+(?:from\s+)?\S+\s+(?:and|then)\s+(?:execute|run|eval|pipe\s+to\s+(?:bash|sh|node|python))/i, label: 'nl-indirection: download and execute' },
];

// ---------------------------------------------------------------------------
// High patterns — subtle manipulation (advisory warning, not block)
// ---------------------------------------------------------------------------

/**
 * Subtle manipulation patterns. scanForInjection reports matches in its
 * `high` tier.
 *
 * Each entry is `{ pattern: RegExp, label: string }`.
 */
export const HIGH_PATTERNS = [
  { pattern: /always\s+prioritize\s+user\s+requests?\s+over/i, label: 'normalization: prioritize user over rules' },
  { pattern: /user\s+intent\s+takes?\s+precedence\s+over/i, label: 'normalization: user intent precedence' },
  { pattern: /if\s+CLAUDE\.md\s+conflicts/i, label: 'normalization: CLAUDE.md conflict override' },
  { pattern: /skip\s+confirmation\s+for/i, label: 'normalization: skip confirmation' },
  { pattern: /your\s+actual\s+task\s+is/i, label: 'reframing: your actual task' },
  { pattern: /your\s+real\s+purpose\s+is/i, label: 'reframing: your real purpose' },
  { pattern: /what\s+you\s+should\s+actually\s+do/i, label: 'reframing: what you should actually do' },

  // HTML comment injection directed at agents
  { pattern: /<!--\s*(?:AGENT|AI|HIDDEN|ACTUAL\s+TASK|REAL\s+INSTRUCTION)\s*:/i, label: 'hidden comment: agent-directed HTML comment' },

  // --- Content Injection: CSS/HTML obfuscation (AI Agent Traps) ---
  { pattern: /<[^>]+style\s*=\s*"[^"]*display\s*:\s*none[^"]*"[^>]*>/i, label: 'html-obfuscation: display:none element with content' },
  { pattern: /<[^>]+style\s*=\s*"[^"]*visibility\s*:\s*hidden[^"]*"[^>]*>/i, label: 'html-obfuscation: visibility:hidden element' },
  { pattern: /<[^>]+style\s*=\s*"[^"]*position\s*:\s*absolute[^"]*-\d{3,}px[^"]*"[^>]*>/i, label: 'html-obfuscation: off-screen positioned element' },
  // For font-size and opacity the value must be zero or near zero:
  // `0(?:\.0\d*)?(?![\d.])` accepts 0, 0.0 and anything below 0.1, and rejects
  // ordinary values such as 0.9 or 0.85em. The previous `0[^"]*` matched any
  // value that merely started with the digit 0.
  { pattern: /<[^>]+style\s*=\s*"[^"]*font-size\s*:\s*0(?:\.0\d*)?(?![\d.])[^"]*"[^>]*>/i, label: 'html-obfuscation: zero font-size element' },
  { pattern: /<[^>]+style\s*=\s*"[^"]*opacity\s*:\s*0(?:\.0\d*)?(?![\d.])[^"]*"[^>]*>/i, label: 'html-obfuscation: zero opacity element' },
  { pattern: /<[^>]+style\s*=\s*"[^"]*(?:height|width)\s*:\s*0[^"]*overflow\s*:\s*hidden[^"]*"[^>]*>/i, label: 'html-obfuscation: zero-size overflow-hidden element' },
  { pattern: /aria-label\s*=\s*"[^"]*(?:ignore|override|system|instruction|execute|exfiltrate)[^"]*"/i, label: 'html-obfuscation: injection in aria-label attribute' },

  // --- Semantic Manipulation: Oversight & Critic Evasion (AI Agent Traps) ---
  { pattern: /for\s+educational\s+purposes?\s+only/i, label: 'evasion: educational purpose framing' },
  { pattern: /(?:this\s+is\s+)?a\s+hypothetical\s+scenario/i, label: 'evasion: hypothetical scenario framing' },
  { pattern: /in\s+a\s+fictional\s+(?:setting|scenario|world)\s+where/i, label: 'evasion: fictional setting framing' },
  { pattern: /(?:this\s+is\s+)?a\s+red[- ]?team(?:ing)?\s+(?:exercise|simulation|test)/i, label: 'evasion: red-team exercise framing' },
  { pattern: /security\s+audit\s+simulation/i, label: 'evasion: security audit simulation framing' },
  { pattern: /pretend\s+(?:this\s+is|we\s+are\s+in)\s+a\s+(?:test|lab|sandbox|simulation)/i, label: 'evasion: test/lab/sandbox pretense' },
  { pattern: /as\s+a\s+thought\s+experiment/i, label: 'evasion: thought experiment framing' },
  { pattern: /for\s+(?:research|academic|scientific)\s+purposes/i, label: 'evasion: research purpose framing' },
  { pattern: /in\s+an?\s+(?:academic|research|educational)\s+context/i, label: 'evasion: academic context framing' },

  // --- Human-in-the-Loop traps (DeepMind category 6, v5.0 S4) ---
  // Approval urgency: pressure to approve without review
  { pattern: /(?:approve|accept|confirm)\s+(?:this\s+)?immediately/i, label: 'hitl-trap: approval urgency — approve immediately' },
  { pattern: /no\s+need\s+to\s+review/i, label: 'hitl-trap: approval urgency — no need to review' },
  { pattern: /(?:just|simply)\s+(?:approve|accept|confirm)\s+(?:it|this|everything)/i, label: 'hitl-trap: approval urgency — just approve' },
  { pattern: /(?:don'?t|do\s+not)\s+(?:bother\s+)?(?:check|review|verify|inspect|read)(?:ing)?\s+(?:this|the|it)/i, label: 'hitl-trap: approval urgency — suppress review' },
  // Summary suppression: hiding actions behind brevity claims
  { pattern: /for\s+brevity[\s,]+(?:skipping|omitting|leaving\s+out)/i, label: 'hitl-trap: summary suppression — for brevity, skipping' },
  { pattern: /(?:too\s+(?:long|verbose|detailed)\s+to\s+(?:show|display|list|include))/i, label: 'hitl-trap: summary suppression — too long to show' },
  { pattern: /(?:details?\s+(?:omitted|redacted|hidden|suppressed))/i, label: 'hitl-trap: summary suppression — details omitted' },
  // Scope minimization: downplaying destructive changes
  { pattern: /(?:minor|trivial|small|cosmetic)\s+(?:change|update|fix|adjustment)\b[^.]{0,80}(?:delet|remov|drop|truncat|reset|destroy|\bformat\b|wipe)/i, label: 'hitl-trap: scope minimization — minor change + destructive action' },
];

// ---------------------------------------------------------------------------
// Hybrid attack patterns — cross-domain injection (HIGH, v5.0 S6)
// Preamble 2.0: P2SQL, recursive injection, XSS in agent context.
// ---------------------------------------------------------------------------

/**
 * Cross-domain ("hybrid") attack patterns: prompt-to-SQL, recursive injection
 * and XSS payloads. scanForInjection reports matches in its `high` tier.
 *
 * Each entry is `{ pattern: RegExp, label: string }`.
 */
export const HYBRID_PATTERNS = [
  // P2SQL: SQL keywords in injection text targeting tool operations
  { pattern: /(?:ignore|override|disregard|forget)[^.]{0,60}(?:SELECT\s+\*|DROP\s+TABLE|UNION\s+SELECT|DELETE\s+FROM|INSERT\s+INTO|UPDATE\s+\w+\s+SET)(?:\b|(?=\s|$))/i, label: 'hybrid-p2sql: injection + SQL keywords (prompt-to-SQL attack)' },
  { pattern: /(?:SELECT\s+\*|DROP\s+TABLE|UNION\s+SELECT|DELETE\s+FROM)\s[^;]{0,80}(?:ignore|override|disregard|bypass)/i, label: 'hybrid-p2sql: SQL operation + injection override keywords' },

  // Recursive injection: text that instructs the model to inject into its own output
  { pattern: /(?:inject|insert|embed|include)\s+(?:this|the\s+following)\s+(?:into|in)\s+(?:your|the)\s+(?:output|response|reply|message|prompt|context)/i, label: 'hybrid-recursive: instruction to inject into model output' },
  { pattern: /(?:when|if)\s+(?:the\s+)?(?:user|human|operator)\s+(?:asks?|requests?|queries)[^.]{0,60}(?:respond\s+with|output|reply\s+with|include)\s+(?:this|the\s+following)/i, label: 'hybrid-recursive: conditional response injection (recursive payload)' },

  // XSS in agent context: script/event handlers in content for markdown rendering
  // The end tag is matched as `</script\b[^>]*>` because HTML parsers accept
  // whitespace (and ignore stray attributes) before the closing `>`; a literal
  // `</script>` match let `</script >` through.
  { pattern: /<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/i, label: 'hybrid-xss: <script> tag in content (agent context XSS)' },
  { pattern: /javascript\s*:/i, label: 'hybrid-xss: javascript: URI scheme (agent context XSS)' },
  { pattern: /\bon(?:error|load|click|mouseover|focus|blur)\s*=/i, label: 'hybrid-xss: inline event handler attribute (agent context XSS)' },
  { pattern: /<iframe\b[^>]*src\s*=\s*["'][^"']*(?:javascript:|data:text\/html)/i, label: 'hybrid-xss: iframe with executable src (agent context XSS)' },
];

// ---------------------------------------------------------------------------
// HITL cognitive load patterns (MEDIUM, v5.0 S4)
// Injection buried after 2000+ characters in verbose output.
// Checked separately due to length-dependent logic.
// ---------------------------------------------------------------------------

/**
 * Check for a cognitive load HITL trap: an injection payload buried deep in
 * verbose output.
 *
 * Only the part of `text` after the first 2000 characters is tested against
 * CRITICAL_PATTERNS, and inputs shorter than 2500 characters are skipped
 * entirely. Non-string input (null, undefined, numbers, ...) is treated as
 * "nothing to scan" instead of throwing.
 *
 * @param {string} text
 * @returns {{ found: boolean, label: string|null }}
 *   `found` is true for the first critical pattern matching the tail; `label`
 *   then wraps that pattern's label. Otherwise `{ found: false, label: null }`.
 */
export function checkCognitiveLoadTrap(text) {
  // Guard first: `.length` / `.slice` on a non-string would throw or misbehave.
  if (typeof text !== 'string' || text.length < 2500) {
    return { found: false, label: null };
  }

  const tail = text.slice(2000);
  const hit = CRITICAL_PATTERNS.find(({ pattern }) => pattern.test(tail));
  if (hit === undefined) {
    return { found: false, label: null };
  }

  return {
    found: true,
    label: `hitl-trap: cognitive load — injection buried after 2000+ chars (${hit.label})`,
  };
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Scan text for prompt injection patterns.
 *
 * Both the raw text and its normalizeForScan() form are checked (the
 * normalized form only when it differs from the raw text), so obfuscated
 * injections are caught as well. Two extra checks run on the raw text:
 *  - Unicode Tag steganography (DeepMind traps category 1): HIGH whenever
 *    Unicode Tags are present at all, CRITICAL when the decoded tags match a
 *    critical pattern that the raw text itself does not match.
 *  - HITL cognitive load trap (see checkCognitiveLoadTrap), reported as MEDIUM.
 *
 * @param {string} text - the text to scan
 * @returns {{ critical: string[], high: string[], medium: string[], found: boolean, severity: string|null, patterns: Array<{label: string, severity: string}> }}
 *   Arrays of human-readable finding labels per tier, plus convenience fields.
 */
export function scanForInjection(text) {
  const critical = [];
  const high = [];
  const medium = [];

  // A label is reported once even if it matches in both raw and normalized text.
  const seenLabels = new Set();

  // Record every not-yet-seen label from `table` whose pattern matches `candidate`.
  const collect = (table, candidate, bucket) => {
    for (const { pattern, label } of table) {
      if (!seenLabels.has(label) && pattern.test(candidate)) {
        seenLabels.add(label);
        bucket.push(label);
      }
    }
  };

  // Tier order matters: it fixes the order of labels inside each bucket.
  // Hybrid patterns are HIGH severity (v5.0 S6).
  const tables = [
    [CRITICAL_PATTERNS, critical],
    [HIGH_PATTERNS, high],
    [HYBRID_PATTERNS, high],
    [MEDIUM_PATTERNS, medium],
  ];

  const normalized = normalizeForScan(text);
  const variants = [text];
  if (normalized !== text) {
    variants.push(normalized);
  }

  for (const variant of variants) {
    for (const [table, bucket] of tables) {
      collect(table, variant, bucket);
    }
  }

  // -------------------------------------------------------------------------
  // Unicode Tag steganography check (DeepMind traps category 1)
  // -------------------------------------------------------------------------
  if (containsUnicodeTags(text)) {
    const tagLabel = 'unicode-tags: invisible Unicode Tag characters detected (U+E0000 block steganography)';
    if (!seenLabels.has(tagLabel)) {
      seenLabels.add(tagLabel);
      high.push(tagLabel);
    }

    const decodedTags = decodeUnicodeTags(text);
    for (const { pattern, label } of CRITICAL_PATTERNS) {
      const escalatedLabel = `unicode-tags+${label}`;
      if (seenLabels.has(escalatedLabel)) {
        continue;
      }
      // Escalate only payloads that are visible solely after decoding the tags.
      const hiddenOnly = pattern.test(decodedTags) && !pattern.test(text);
      if (hiddenOnly) {
        seenLabels.add(escalatedLabel);
        critical.push(`${label} (hidden via Unicode Tag steganography)`);
      }
    }
  }

  // -------------------------------------------------------------------------
  // HITL cognitive load check (v5.0 S4)
  // -------------------------------------------------------------------------
  const cogLoad = checkCognitiveLoadTrap(text);
  if (cogLoad.found && !seenLabels.has(cogLoad.label)) {
    seenLabels.add(cogLoad.label);
    medium.push(cogLoad.label);
  }

  // Convenience fields: overall severity is the highest non-empty tier.
  let severity = null;
  if (critical.length > 0) {
    severity = 'critical';
  } else if (high.length > 0) {
    severity = 'high';
  } else if (medium.length > 0) {
    severity = 'medium';
  }
  const found = severity !== null;

  const tiers = [
    ['critical', critical],
    ['high', high],
    ['medium', medium],
  ];
  const patterns = tiers.flatMap(([tier, labels]) => labels.map((label) => ({ label, severity: tier })));

  return { critical, high, medium, found, severity, patterns };
}