New policy-loader.mjs reads .llm-security/policy.json with deep-merge against defaults that exactly match existing hardcoded values. Integrated into all 7 hooks: - pre-prompt-inject-scan: injection.mode (env var still takes precedence) - post-session-guard: trifecta.mode, window_size, long_horizon_window - pre-edit-secrets: secrets.additional_patterns - pre-bash-destructive: destructive.additional_blocked - pre-write-pathguard: pathguard.additional_protected - pre-install-supply-chain: supply_chain.additional_blocked_packages - post-mcp-verify: mcp.volume_threshold_bytes, mcp.trusted_servers Backward compatible: no policy file = identical behavior to v5.1.0. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
136 lines
5.2 KiB
JavaScript
136 lines
5.2 KiB
JavaScript
#!/usr/bin/env node
|
|
// Hook: pre-prompt-inject-scan.mjs
|
|
// Event: UserPromptSubmit
|
|
// Purpose: Scan user prompts for injection patterns before sending to model.
|
|
//
|
|
// Catches injection hidden in pasted content, piped input, or headless mode.
|
|
// Critical patterns (direct override, spoofed headers, identity redefinition) -> block.
|
|
// High patterns (subtle manipulation, context normalization) -> warn.
|
|
// Medium patterns (leetspeak, homoglyphs, zero-width, multi-language) -> advisory.
|
|
//
|
|
// v2.3.0: LLM_SECURITY_INJECTION_MODE env var (block/warn/off). Default: block.
|
|
// v5.0.0: MEDIUM patterns emit advisory (never block). Appended to existing advisory
|
|
// when critical/high patterns are also present.
|
|
//
|
|
// Protocol:
|
|
// - Read JSON from stdin: { session_id, message: { role, content } }
|
|
// - content may be a string or array of content blocks
|
|
// - Block: exit 2, stdout JSON { decision: "block", reason: "..." }
|
|
// - Allow: exit 0
|
|
// - Warn: exit 0, stdout JSON { systemMessage: "..." }
|
|
|
|
import { readFileSync } from 'node:fs';
|
|
import { scanForInjection } from '../../scanners/lib/injection-patterns.mjs';
|
|
import { getPolicyValue } from '../../scanners/lib/policy-loader.mjs';
|
|
|
|
// ---------------------------------------------------------------------------
// Mode configuration (env var takes precedence over policy file)
// ---------------------------------------------------------------------------
const VALID_MODES = new Set(['block', 'warn', 'off']);

// Return the candidate when it is a recognized mode, undefined otherwise,
// so the resolution below can be expressed as a ?? fallback chain.
const asValidMode = (candidate) => (VALID_MODES.has(candidate) ? candidate : undefined);

const policyMode = getPolicyValue('injection', 'mode', 'block');

// Resolution order: env var > policy file > hardcoded default 'block'.
const mode =
  asValidMode(process.env.LLM_SECURITY_INJECTION_MODE) ?? asValidMode(policyMode) ?? 'block';

// Off mode: skip scanning entirely
if (mode === 'off') {
  process.exit(0);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Extract plaintext from the UserPromptSubmit input payload.
 * Handles multiple input shapes for robustness:
 *   Shape 1: { message: { content: "string" } }
 *   Shape 2: { message: { content: [{ type: "text", text: "..." }] } }
 *   Shape 3: { prompt: "string" } (fallback when message is absent)
 *
 * @param {unknown} input - Parsed stdin payload; may be malformed.
 * @returns {string} Extracted text (array blocks joined with '\n'), or '' when
 *   no text can be extracted.
 */
function extractText(input) {
  const message = input?.message;
  if (!message) return input?.prompt ?? '';

  const content = message.content;
  if (typeof content === 'string') return content;
  if (Array.isArray(content)) {
    // Guard each entry: malformed payloads may contain null/non-object blocks
    // (block?.type avoids a TypeError) or text blocks missing `.text`
    // (?? '' avoids joining the literal string "undefined" into the output).
    return content
      .filter((block) => block?.type === 'text')
      .map((block) => block.text ?? '')
      .join('\n');
  }
  return '';
}
|
|
|
|
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

// Parse the hook payload from stdin. A malformed payload must never block the
// user, so any read/parse failure results in a silent allow (exit 0).
let input;
try {
  input = JSON.parse(readFileSync(0, 'utf-8'));
} catch {
  process.exit(0);
}

// Nothing to scan: empty or whitespace-only prompt is allowed silently.
const text = extractText(input);
if (text.trim() === '') {
  process.exit(0);
}
|
|
|
|
const { critical, high, medium } = scanForInjection(text);

// Write a hook response to stdout and terminate with the given exit code.
const respond = (payload, code) => {
  process.stdout.write(JSON.stringify(payload));
  process.exit(code);
};

// CRITICAL patterns in block mode: hard deny (exit 2 per the hook protocol).
if (mode === 'block' && critical.length > 0) {
  const reasonLines = [
    'Blocked: prompt injection pattern detected (OWASP LLM01).',
    ...critical.map((finding) => ` - ${finding}`),
    ' This prompt contains patterns associated with prompt injection attacks.',
    ' If intentional (testing, security research), set LLM_SECURITY_INJECTION_MODE=warn to allow with advisory.',
  ];
  respond({ decision: 'block', reason: reasonLines.join('\n') }, 2);
}

if (critical.length > 0 || high.length > 0) {
  // In warn mode, critical findings are downgraded to an advisory; in block
  // mode this branch is only reachable when critical is empty (high only).
  const findings = [...critical, ...high];
  const severity = critical.length > 0 ? 'CRITICAL' : 'HIGH';
  const numbered = findings.map((finding, i) => `[${i + 1}] ${finding}`).join('\n');

  let message =
    `SECURITY ADVISORY (prompt-inject-scan): ${severity} manipulation signals detected.\n\n${numbered}\n\n` +
    ` These patterns may indicate prompt manipulation in pasted content.\n` +
    ` Review the source before proceeding.`;

  if (mode === 'warn' && critical.length > 0) {
    message += `\n Note: blocking is disabled (LLM_SECURITY_INJECTION_MODE=warn).`;
  }

  // MEDIUM findings are summarized by count only — individual medium findings
  // are never listed alongside critical/high ones.
  if (medium.length > 0) {
    message += `\n Additionally, ${medium.length} lower-confidence signal(s) detected (MEDIUM).`;
  }

  respond({ decision: 'allow', systemMessage: message }, 0);
}

// MEDIUM-only findings: advisory with full listing; this severity never blocks.
if (medium.length > 0) {
  const numbered = medium.map((finding, i) => `[${i + 1}] ${finding}`).join('\n');
  const message =
    `SECURITY ADVISORY (prompt-inject-scan): MEDIUM obfuscation/manipulation signals detected.\n\n${numbered}\n\n` +
    ` These patterns may indicate obfuscated prompt manipulation (leetspeak, homoglyphs, multi-language).\n` +
    ` Review the source before proceeding. MEDIUM signals are advisory-only and never block.`;
  respond({ decision: 'allow', systemMessage: message }, 0);
}

// Clean — allow silently
process.exit(0);
|