#!/usr/bin/env node
// Hook: pre-prompt-inject-scan.mjs
// Event: UserPromptSubmit
// Purpose: Scan user prompts for injection patterns before sending to model.
//
// Catches injection hidden in pasted content, piped input, or headless mode.
// Critical patterns (direct override, spoofed headers, identity redefinition) -> block.
// High patterns (subtle manipulation, context normalization) -> warn.
// Medium patterns (leetspeak, homoglyphs, zero-width, multi-language) -> advisory.
//
// v2.3.0: LLM_SECURITY_INJECTION_MODE env var (block/warn/off). Default: block.
// v5.0.0: MEDIUM patterns emit advisory (never block). Appended to existing advisory
// when critical/high patterns are also present.
//
// Protocol:
// - Read JSON from stdin: { session_id, message: { role, content } }
// - content may be a string or array of content blocks
// - Block: exit 2, stdout JSON { decision: "block", reason: "..." }
// - Allow: exit 0
// - Warn: exit 0, stdout JSON { decision: "allow", systemMessage: "..." }
import { readFileSync } from 'node:fs';
import { scanForInjection } from '../../scanners/lib/injection-patterns.mjs';
// ---------------------------------------------------------------------------
// Mode configuration
// ---------------------------------------------------------------------------
const VALID_MODES = new Set(['block', 'warn', 'off']);
const requestedMode = process.env.LLM_SECURITY_INJECTION_MODE;
// Fall back to the strictest mode when the env var is unset or unrecognized.
const mode = VALID_MODES.has(requestedMode) ? requestedMode : 'block';
// 'off' disables the hook entirely: exit before touching stdin.
if (mode === 'off') {
process.exit(0);
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Extract plaintext from the UserPromptSubmit input payload.
 * Handles multiple input shapes for robustness:
 *   1. { message: { content: "string" } }
 *   2. { message: { content: [{ type: "text", text: "..." }] } }
 *   3. { prompt: "string" } (fallback)
 * @param {unknown} input - parsed stdin payload (may be null or malformed)
 * @returns {string} concatenated prompt text, or '' when none is present
 */
function extractText(input) {
const message = input?.message;
if (!message) {
// Shape 3 fallback; guard against a non-string `prompt` value.
const prompt = input?.prompt;
return typeof prompt === 'string' ? prompt : '';
}
const content = message.content;
if (typeof content === 'string') return content;
if (Array.isArray(content)) {
return content
// Tolerate null/malformed blocks and a missing `text` field so one bad
// block can't throw or inject the literal string "undefined".
.filter((block) => block?.type === 'text' && typeof block.text === 'string')
.map((block) => block.text)
.join('\n');
}
return '';
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
let input;
try {
// fd 0 = stdin; the hook protocol delivers exactly one JSON document there.
input = JSON.parse(readFileSync(0, 'utf-8'));
} catch {
// Unreadable or malformed stdin: fail open rather than blocking the prompt.
process.exit(0);
}
const text = extractText(input);
// Empty or whitespace-only prompt — nothing to scan, allow silently.
if (text.trim() === '') {
process.exit(0);
}
const { critical, high, medium } = scanForInjection(text);
// CRITICAL findings in block mode: refuse the prompt outright (exit 2).
if (critical.length > 0 && mode === 'block') {
const reasonLines = [
'Blocked: prompt injection pattern detected (OWASP LLM01).',
...critical.map((c) => ` - ${c}`),
` This prompt contains patterns associated with prompt injection attacks.`,
` If intentional (testing, security research), set LLM_SECURITY_INJECTION_MODE=warn to allow with advisory.`,
];
process.stdout.write(JSON.stringify({ decision: 'block', reason: reasonLines.join('\n') }));
process.exit(2);
}
// Reached in warn mode (critical downgraded to advisory) or in block mode
// when only HIGH findings exist.
if (critical.length > 0 || high.length > 0) {
const allFindings = [...critical, ...high];
const severity = critical.length > 0 ? 'CRITICAL' : 'HIGH';
const numbered = allFindings.map((f, i) => `[${i + 1}] ${f}`).join('\n');
const parts = [
`SECURITY ADVISORY (prompt-inject-scan): ${severity} manipulation signals detected.\n\n`,
numbered,
'\n\n',
` These patterns may indicate prompt manipulation in pasted content.\n`,
` Review the source before proceeding.`,
];
if (mode === 'warn' && critical.length > 0) {
parts.push(`\n Note: blocking is disabled (LLM_SECURITY_INJECTION_MODE=warn).`);
}
// Append MEDIUM count if present (never list individual medium findings
// alongside critical/high).
if (medium.length > 0) {
parts.push(`\n Additionally, ${medium.length} lower-confidence signal(s) detected (MEDIUM).`);
}
process.stdout.write(JSON.stringify({ decision: 'allow', systemMessage: parts.join('') }));
process.exit(0);
}
// MEDIUM-only findings: emit an advisory, but never block.
if (medium.length > 0) {
const listing = medium.map((f, i) => `[${i + 1}] ${f}`).join('\n');
const message = [
`SECURITY ADVISORY (prompt-inject-scan): MEDIUM obfuscation/manipulation signals detected.\n\n`,
listing,
'\n\n',
` These patterns may indicate obfuscated prompt manipulation (leetspeak, homoglyphs, multi-language).\n`,
` Review the source before proceeding. MEDIUM signals are advisory-only and never block.`,
].join('');
process.stdout.write(JSON.stringify({ decision: 'allow', systemMessage: message }));
process.exit(0);
}
// No findings at any severity — allow silently.
process.exit(0);