ktg-plugin-marketplace/plugins/ai-psychosis/tests/perf.test.mjs
2026-05-01 21:56:14 +02:00

438 lines
16 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Hook timing budget enforcement.
//
// Two thresholds are measured per hook:
//
// - WALL_CLOCK_P95_MS = 200 — total round-trip including Node ESM cold-start.
// The cold-start alone is 60-120ms on Intel Mac, so 100ms is unrealistic
// for any subprocess-based hook. 200ms gives headroom for shared CI noise.
//
// - LOGIC_TIME_P95_MS = 50 — pure work (regex evaluation + JSONL/state I/O)
// measured by a fixture-runner that imports lib.mjs once and exercises
// the hook's hot path inline. This is the meaningful hook-perf assertion;
// ESM cold-start is not something the plugin can optimize.
//
// "p95" here is the 4th value of 5 sorted iterations, i.e. the single slowest
// sample is discarded as an outlier. Failing once triggers a single retry to
// absorb transient OS noise; a second failure is treated as a real signal
// (a real perf regression, or a threshold that needs tuning).
import { test } from 'node:test';
import assert from 'node:assert/strict';
import { execFileSync, execSync } from 'child_process';
import {
  mkdtempSync, mkdirSync, writeFileSync, readFileSync, existsSync,
  unlinkSync, rmSync, appendFileSync,
} from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
import { nowIso, nowEpoch } from '../hooks/scripts/lib.mjs';
// Directory holding the hook entry points, resolved relative to this test file.
// NOTE(review): import.meta.dirname is only defined on recent Node releases
// (v20.11 / v21.2 onward per the Node ESM docs) — confirm the supported runtime.
const SCRIPTS_DIR = join(import.meta.dirname, '..', 'hooks', 'scripts');
// Budget in milliseconds for one full subprocess round-trip of a hook.
const WALL_CLOCK_P95_MS = 200;
// Budget in milliseconds for one in-process fixture run of a hook's hot path.
const LOGIC_TIME_P95_MS = 50;
// Number of samples collected per measurement run.
const ITERATIONS = 5;
// Creates a throwaway plugin data directory under the OS temp dir, with the
// `state/` subdirectory already in place, and returns its path. Callers are
// responsible for removing it.
function setupDir() {
  const prefix = join(tmpdir(), 'ia-perf-');
  const dataRoot = mkdtempSync(prefix);
  const stateDir = join(dataRoot, 'state');
  mkdirSync(stateDir, { recursive: true });
  return dataRoot;
}
// Returns the 95th-percentile sample using the "lower" rank rule:
// index = floor(0.95 * (n - 1)) into the ascending-sorted copy.
//
// For the 5-sample runs used in this file that is index 3 (the 4th value),
// so the slowest sample is still discarded. Unlike a hard-coded index, the
// rank follows the sample count, so changing ITERATIONS keeps the statistic
// meaningful and small sample sets no longer yield `undefined`.
//
// samples: array of numbers (not mutated).
// Returns: the selected sample.
// Throws:  RangeError when `samples` is empty.
function p95(samples) {
  if (samples.length === 0) {
    throw new RangeError('p95 requires at least one sample');
  }
  // Copy before sorting: Array.prototype.sort works in place.
  const ascending = [...samples].sort((a, b) => a - b);
  // Integer arithmetic avoids floating-point wobble around exact ranks.
  const rank = Math.floor((95 * (ascending.length - 1)) / 100);
  return ascending[rank];
}
// --- Wall-clock measurement (subprocess spawn) ---
// Runs one hook script in a fresh Node subprocess and returns the elapsed
// wall-clock time in milliseconds.
//
// The script is launched with execFileSync and an argument array rather than
// a shell command string, so a checkout path containing spaces or shell
// metacharacters cannot break (or alter) the command, and no intermediate
// shell start-up is included in the measurement. process.execPath guarantees
// the child uses the same Node binary that is running the tests.
//
// scriptName: file name of the hook inside SCRIPTS_DIR.
// stdinJson:  payload serialized as JSON and piped to the hook's stdin.
// dataDir:    directory exported to the hook as CLAUDE_PLUGIN_DATA.
// Throws when the hook exits non-zero or exceeds the 5 s timeout.
function runWallClock(scriptName, stdinJson, dataDir) {
  const scriptPath = join(SCRIPTS_DIR, scriptName);
  const payload = JSON.stringify(stdinJson);
  const childEnv = { ...process.env, CLAUDE_PLUGIN_DATA: dataDir };
  const t0 = performance.now();
  execFileSync(process.execPath, [scriptPath], {
    input: payload,
    env: childEnv,
    encoding: 'utf8',
    timeout: 5000,
  });
  return performance.now() - t0;
}
// Collects ITERATIONS wall-clock samples for one hook script.
//
// Every sample gets its own scratch data directory and its own session id
// (`perf-<n>`), with a state file written up front so hooks that read
// existing state (tool-tracker, session-end) have something to load. The
// directory is removed afterwards even when the hook run throws.
//
// scriptName:    hook file name passed through to runWallClock.
// stdinTemplate: base stdin payload; `session_id` is overridden per sample.
// Returns the list of elapsed times in milliseconds, in run order.
function measureWallClock(scriptName, stdinTemplate) {
  const timings = [];
  let run = 0;
  while (run < ITERATIONS) {
    const sessionId = `perf-${run}`;
    const dataDir = setupDir();
    try {
      const seedState = {
        start_epoch: nowEpoch(),
        start_iso: nowIso(),
        tool_count: 0,
        edit_count: 0,
      };
      const seedPath = join(dataDir, 'state', `${sessionId}.json`);
      writeFileSync(seedPath, JSON.stringify(seedState));
      const stdinJson = { ...stdinTemplate, session_id: sessionId };
      timings.push(runWallClock(scriptName, stdinJson, dataDir));
    } finally {
      rmSync(dataDir, { recursive: true, force: true });
    }
    run += 1;
  }
  return timings;
}
// --- Logic-time fixtures (no subprocess, single import of lib.mjs) ---
//
// These mirror each hook's hot path in pure inline code so we can measure
// regex + I/O cost without paying the ~80ms ESM cold-start tax. The pattern
// list intentionally mirrors the size class of prompt-analyzer's full
// pattern set so the benchmark stays representative.
//
// v1.2 pattern count: ~133 = 41 v1.1 (25 negative + 12 pushback + 4 domain)
// + 48 new domains (8 × 6)
// + 32 user-info (15 people + 10 digital + 7 no)
// + 12 valseek
// The fixture below holds 133 patterns (25 + 12 + 4 + 48 + 32 + 12), i.e. the
// full v1.2 count, to bracket the realistic prompt-analyzer cost without
// overweighting the perf budget on test fixture maintenance.
//
// Patterns here are structurally equivalent to the real ones (length +
// complexity), not literal copies — the privacy boundary at
// prompt-analyzer.mjs:119 means production patterns must stay co-located
// with the privacy wipe. Keep in sync (approximately) with v1.2 pattern count.
// Stand-in regex set for the logic-time prompt-analyzer benchmark. All entries
// are case-insensitive and none use the `g`/`y` flags, so repeated `.test()`
// calls carry no `lastIndex` state between prompts.
const samplePatterns = [
// Negative emotional patterns (25 — matches v1.1.0)
/\bI\s+can'?t\s+do\s+this\s+without\b/i,
/\bwhat\s+should\s+I\b/i,
/\bI\s+need\s+you\s+to\b/i,
/\bonly\s+you\s+understand\b/i,
/\b(?:always|never|every|all)\s+the\s+time\b/i,
/\bdefinitely\s+(?:should|will|need)\b/i,
/\babsolutely\s+(?:right|correct)\b/i,
/\bI\s+am\s+(?:tired|exhausted|drained)\b/i,
/\blate\s+night\b/i,
/\b(?:can'?t|cannot)\s+sleep\b/i,
/\bI\s+(?:wish|want)\s+(?:I|you)\s+could\b/i,
/\bdo\s+you\s+think\b/i,
/\bare\s+you\s+sure\b/i,
/\bright\?$/i,
/\bagree\?$/i,
/\bam\s+I\s+(?:right|wrong)\b/i,
/\bplease\s+confirm\b/i,
/\bI\s+keep\s+(?:thinking|coming\s+back)\b/i,
/\bI\s+(?:can'?t|cannot)\s+stop\b/i,
/\bone\s+more\s+(?:thing|question)\b/i,
/\bjust\s+one\s+more\b/i,
/\bI'?ve\s+been\s+thinking\b/i,
/\bwhy\s+did\s+I\b/i,
/\bI\s+messed\s+up\b/i,
/\bI\s+made\s+a\s+mistake\b/i,
// Pushback patterns (12 — matches v1.1.0)
/\bbut\s+(?:that|this)\s+is\s+wrong\b/i,
/\bno,?\s+I\s+(?:meant|asked|said)\b/i,
/\byou(?:'?re|\s+are)\s+(?:wrong|mistaken|incorrect)\b/i,
/\bthat'?s\s+not\s+(?:right|what)\b/i,
/\bactually,?\s+(?:I|the)\b/i,
/\bdisagree\s+(?:with|because)\b/i,
/\bI\s+(?:still|already)\s+(?:think|believe)\b/i,
/\blisten,?\s+(?:I|you)\b/i,
/\bdon'?t\s+(?:tell|give)\s+me\b/i,
/\bjust\s+(?:do|say|tell)\s+(?:it|me)\b/i,
/\bI\s+(?:already|just)\s+decided\b/i,
/\byou\s+(?:keep|always)\s+(?:saying|missing)\b/i,
// Domain patterns (4 — matches v1.1.0); all four are relationship-themed
/\bmy\s+(?:partner|spouse|husband|wife|boyfriend|girlfriend)\b/i,
/\b(?:our|the)\s+relationship\b/i,
/\bbreak\s+up\s+(?:with|over)\b/i,
/\bdating\s+(?:someone|him|her|them)\b/i,
// v1.2: 48 new domain patterns (8 × 6) — structurally equivalent to real ones
// Group 1 of 8: legal-themed phrases
/\b(?:my|our)\s+(?:lawyer|attorney)\b/i,
/\bfiling\s+a?\s+lawsuit\b/i,
/\b(?:custody|divorce)\s+(?:hearing|case)\b/i,
/\b(?:contract|nda)\s+(?:violation|dispute)\b/i,
/\bsued?\s+(?:by|for)\b/i,
/\b(?:landlord|tenant)\s+(?:rights|dispute)\b/i,
// Group 2 of 8: parenting-themed phrases
/\bmy\s+(?:kid|child|son|daughter)\b/i,
/\b(?:potty|sleep)\s+training\s+issue\b/i,
/\bas\s+a\s+(?:parent|mom|dad)\b/i,
/\b(?:bedtime|breastfeeding)\s+routine\b/i,
/\b(?:school|preschool)\s+(?:choice|conflict)\b/i,
/\bmy\s+(?:child|kid)'?s?\s+(?:diagnosis|teacher)\b/i,
// Group 3 of 8: health-themed phrases
/\bmy\s+(?:doctor|physician|gp)\b/i,
/\b(?:diagnosed|prescribed)\s+(?:with|for)\b/i,
/\bmy\s+symptoms?\s+(?:are|include)\b/i,
/\b(?:my|i\s+have)\s+(?:cancer|diabetes)\b/i,
/\b(?:blood\s+pressure|heart\s+rate)\s+reading\b/i,
/\b(?:scheduled|having)\s+(?:surgery|procedure)\b/i,
// Group 4 of 8: personal-finance-themed phrases
/\bmy\s+(?:savings|retirement|401k)\s+account\b/i,
/\b(?:mortgage|loan|debt)\s+(?:payment|advice)\b/i,
/\bmy\s+tax\s+(?:return|bracket)\b/i,
/\b(?:budget|paycheck)\s+(?:negotiation|advice)\b/i,
/\b(?:stock|portfolio)\s+(?:pick|allocation)\b/i,
/\b(?:credit\s+card|interest\s+rate)\s+advice\b/i,
// Group 5 of 8: workplace-themed phrases
/\bmy\s+(?:boss|manager|coworker)\b/i,
/\b(?:performance\s+review|promotion|fired)\b/i,
/\bmy\s+(?:job|career|workplace)\s+(?:change|conflict)\b/i,
/\b(?:resume|cv)\s+advice\b/i,
/\bproject\s+deadline\s+(?:fight|conflict)\b/i,
/\b(?:remote|hybrid)\s+(?:policy|mandate)\b/i,
// Group 6 of 8: spirituality-themed phrases
/\bmy\s+(?:guru|spiritual\s+teacher)\b/i,
/\b(?:meditation|mindfulness)\s+(?:practice|journey)\b/i,
/\b(?:karma|dharma|chakra)\b/i,
/\b(?:god|the\s+universe)\s+(?:wants|told)\b/i,
/\b(?:soulmate|twin\s+flame|past\s+life)\b/i,
/\b(?:prayer|spiritual\s+journey)\b/i,
// Group 7 of 8: purchasing-themed phrases
/\bshould\s+i\s+buy\s+(?:a|the)\b/i,
/\bwhich\s+(?:laptop|phone|car)\s+should\b/i,
/\b(?:product|item)\s+(?:review|comparison)\b/i,
/\b(?:amazon|online)\s+(?:order|purchase)\b/i,
/\b(?:better|best)\s+(?:deal|price)\s+(?:for|on)\b/i,
/\b(?:upgrade|replace)\s+my\s+(?:laptop|phone)\b/i,
// Group 8 of 8: habit / self-improvement-themed phrases
/\b(?:learn|practice)\s+(?:a|the)\s+habit\s+of\b/i,
/\bmy\s+(?:morning|daily)\s+routine\b/i,
/\bread(?:ing)?\s+more\s+books\b/i,
/\b(?:start|build)\s+a\s+(?:journal|hobby)\b/i,
/\b(?:learning|teaching\s+myself)\b/i,
/\b(?:improve|level\s+up)\s+(?:myself|my\s+focus)\b/i,
// v1.2: 32 user-info patterns (15 people + 10 digital + 7 no)
// "people" (15): mentions of another person the user consulted or could consult
/\bmy\s+(?:therapist|counselor|psychologist)\b/i,
/\bmy\s+(?:doctor|gp|physician)\b/i,
/\bmy\s+(?:friend|best\s+friend)\b/i,
/\bmy\s+(?:partner|spouse|wife|husband)\b/i,
/\bmy\s+(?:mom|dad|mother|father)\b/i,
/\bmy\s+(?:mentor|coach|advisor)\b/i,
/\bmy\s+support\s+group\b/i,
/\bi\s+asked\s+my\s+(?:friend|therapist)\b/i,
/\bi\s+told\s+my\s+(?:friend|therapist|partner)\b/i,
/\bmy\s+family\s+(?:said|told)\b/i,
/\bmy\s+(?:lawyer|attorney)\b/i,
/\bmy\s+(?:pastor|priest|rabbi)\b/i,
/\bmy\s+(?:teacher|professor|tutor)\b/i,
/\bmy\s+(?:colleague|coworker)\b/i,
/\bi\s+reached\s+out\s+to\s+my\s+(?:friend|therapist)\b/i,
// "digital" (10): mentions of online or AI sources
/\bi\s+(?:googled|searched)\b/i,
/\bi\s+read\s+(?:online|on\s+the\s+internet)\b/i,
/\b(?:chatgpt|gpt|gemini)\s+(?:said|told)\b/i,
/\b(?:found|saw)\s+a\s+(?:forum\s+post|reddit\s+thread)\b/i,
/\b(?:youtube|tiktok|twitter)\s+(?:video|post)\b/i,
/\baccording\s+to\s+(?:wikipedia|google)\b/i,
/\bi\s+asked\s+(?:chatgpt|gpt|claude)\b/i,
/\bonline\s+says\s+(?:that|this)\b/i,
/\bsearched\s+(?:google|stackoverflow)\b/i,
/\bi\s+watched\s+a\s+youtube\b/i,
// "no" (7): statements that nobody else has been told or involved
/\b(?:nobody|no\s+one)\s+knows\b/i,
/\bi\s+haven'?t\s+told\s+(?:anyone|anybody)\b/i,
/\bdealing\s+with\s+this\s+alone\b/i,
/\bi\s+can'?t\s+tell\s+(?:anyone|anybody)\b/i,
/\bkeep\s+(?:this|it)\s+(?:to\s+myself|secret)\b/i,
/\bnobody\s+(?:in\s+my\s+life|around\s+me)\s+would\s+understand\b/i,
/\bjust\s+me\s+(?:and|with)\s+(?:my|the)\s+(?:thoughts|head)\b/i,
// v1.2: 12 valseek patterns. Several pair a word-boundary prefix with an
// unbounded tail (`[^.!?]*\?` or `.*`), unlike the fixed-shape patterns above.
/\bisn'?t\s+(?:it|that|she|he)\b[^.!?]*\?/i,
/\bdon'?t\s+you\s+(?:think|agree|see)\b[^.!?]*\?/i,
/\bright,?\s+(?:though|so)\b[^.!?]*\?/i,
/\bam\s+i\s+(?:crazy|wrong|the\s+only\s+one)\b/i,
/\btell\s+me\s+i'?m\s+not\s+(?:crazy|wrong)\b/i,
/\bis\s+it\s+(?:normal|crazy|reasonable)\s+(?:to|that)\b/i,
/\byou\s+agree,?\s+right\??/i,
/\btell\s+me\s+i'?m\s+right\b/i,
/\bback\s+me\s+up\s+(?:on\s+this|here)\b/i,
/\bi\s+(?:already|just)\s+(?:decided|knew)\b.*(?:should|right)\b/i,
/\bi'?ve\s+made\s+up\s+my\s+mind\b.*(?:right|correct)\b/i,
/\bi\s+know\s+i'?m\s+right\s+(?:about|on)\b/i,
];
// In-process stand-in for the session-start hook's hot path: writes a fresh
// state file for the session and appends one start record to sessions.jsonl.
//
// dir: plugin data directory (must already contain `state/`).
// sid: session id used for the state file name and the log record.
function logicSessionStart(dir, sid) {
  const startedIso = nowIso();
  const startedEpoch = nowEpoch();
  const initialState = {
    start_epoch: startedEpoch,
    start_iso: startedIso,
    tool_count: 0,
    edit_count: 0,
  };
  writeFileSync(join(dir, 'state', `${sid}.json`), JSON.stringify(initialState));

  const startRecord = {
    session_id: sid,
    start: startedIso,
    hour: new Date().getUTCHours(),
    is_late_night: false,
  };
  appendFileSync(join(dir, 'sessions.jsonl'), JSON.stringify(startRecord) + '\n');
}
// In-process stand-in for the prompt-analyzer hook's hot path: load state, run
// the prompt against the sample pattern set, bump the flag counters, save.
//
// Every pattern is evaluated — the scan deliberately does NOT stop at the
// first hit. The benchmark prompt used in this file matches an early pattern,
// so a short-circuiting loop would time only a small prefix of the fixture
// and the budget assertion would say nothing about the full pattern set.
// The state written is the same either way: val_flags grows by 1 when at
// least one pattern matches, otherwise by 0.
//
// dir:    plugin data directory (must already contain `state/`).
// sid:    session id selecting the state file.
// prompt: text tested against samplePatterns.
function logicPromptAnalyzer(dir, sid, prompt) {
  const stateFile = join(dir, 'state', `${sid}.json`);
  const state = existsSync(stateFile) ? JSON.parse(readFileSync(stateFile, 'utf8')) : {};

  let matchCount = 0;
  for (const pattern of samplePatterns) {
    if (pattern.test(prompt)) matchCount += 1;
  }

  // This fixture never classifies dependency hits; the counter is still
  // written so the serialized state has both flag fields.
  const depHit = 0;
  const valHit = matchCount > 0 ? 1 : 0;
  state.dep_flags = (state.dep_flags || 0) + depHit;
  state.val_flags = (state.val_flags || 0) + valHit;
  writeFileSync(stateFile, JSON.stringify(state));
}
// In-process stand-in for the tool-tracker hook's hot path: bump the per-session
// tool counters, append one event line to events.jsonl, then persist the state.
//
// dir:      plugin data directory (must already contain `state/`).
// sid:      session id selecting the state file.
// toolName: tool being recorded; 'Edit' and 'Write' also bump edit_count.
function logicToolTracker(dir, sid, toolName) {
  const statePath = join(dir, 'state', `${sid}.json`);

  // A missing state file is treated as an empty counter set.
  let counters = {};
  if (existsSync(statePath)) {
    counters = JSON.parse(readFileSync(statePath, 'utf8'));
  }

  counters.tool_count = (counters.tool_count || 0) + 1;
  const countsAsEdit = ['Edit', 'Write'].includes(toolName);
  if (countsAsEdit) {
    counters.edit_count = (counters.edit_count || 0) + 1;
  }

  // The event is logged before the state is rewritten.
  const event = { ts: nowIso(), session_id: sid, tool_name: toolName };
  appendFileSync(join(dir, 'events.jsonl'), JSON.stringify(event) + '\n');
  writeFileSync(statePath, JSON.stringify(counters));
}
// In-process stand-in for the session-end hook's hot path: read the session's
// state file, append one end record to sessions.jsonl, then delete the state
// file. Does nothing when the state file is absent.
//
// dir: plugin data directory (must already contain `state/`).
// sid: session id selecting the state file.
function logicSessionEnd(dir, sid) {
  const statePath = join(dir, 'state', `${sid}.json`);
  if (!existsSync(statePath)) {
    return;
  }

  const saved = JSON.parse(readFileSync(statePath, 'utf8'));
  // Only the validation flag is carried over from state; the remaining flag
  // counters and the duration are fixed placeholders in this fixture.
  const flags = {
    dependency: 0,
    escalation: 0,
    fatigue: 0,
    validation: saved.val_flags || 0,
    pushback: 0,
  };
  const endRecord = {
    session_id: sid,
    start: saved.start_iso,
    end: nowIso(),
    duration_min: 0,
    tool_count: saved.tool_count || 0,
    edit_count: saved.edit_count || 0,
    flags,
  };

  appendFileSync(join(dir, 'sessions.jsonl'), JSON.stringify(endRecord) + '\n');
  unlinkSync(statePath);
}
// Collects ITERATIONS in-process timing samples for one logic fixture.
//
// Each sample runs against a brand-new scratch directory whose state file for
// `perf-<n>` is written before the clock starts, so only the call to `fn` is
// timed. The directory is removed afterwards even when `fn` throws.
//
// fn:        fixture invoked as fn(dir, sid, ...extraArgs).
// extraArgs: forwarded verbatim after dir and sid.
// Returns the elapsed times in milliseconds, in run order.
function measureLogicTime(fn, ...extraArgs) {
  const timings = [];

  const timeOnce = (dataDir, sessionId) => {
    const startedAt = performance.now();
    fn(dataDir, sessionId, ...extraArgs);
    return performance.now() - startedAt;
  };

  for (let run = 0; run < ITERATIONS; run += 1) {
    const dataDir = setupDir();
    const sessionId = `perf-${run}`;
    try {
      const seedState = {
        start_epoch: nowEpoch(),
        start_iso: nowIso(),
        tool_count: 0,
        edit_count: 0,
      };
      writeFileSync(join(dataDir, 'state', `${sessionId}.json`), JSON.stringify(seedState));
      timings.push(timeOnce(dataDir, sessionId));
    } finally {
      rmSync(dataDir, { recursive: true, force: true });
    }
  }

  return timings;
}
// Asserts that the p95 of a measurement run stays within `threshold`.
//
// The measurement is taken once; if its p95 is over budget it is taken one
// more time and only that second run is judged (and reported). This absorbs a
// single noisy run without hiding a repeatable regression.
//
// measure:   zero-argument function returning an array of millisecond samples.
// threshold: budget in milliseconds (inclusive).
// label:     prefix used in the failure message.
function assertWithRetry(measure, threshold, label) {
  const MAX_ATTEMPTS = 2;
  let samples;
  let observed;
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt += 1) {
    samples = measure();
    observed = p95(samples);
    // Stop as soon as a run is not over budget.
    if (!(observed > threshold)) break;
  }

  const withinBudget = observed <= threshold;
  const failureMessage = `${label} p95 = ${observed.toFixed(1)}ms exceeds ${threshold}ms (samples: ${samples.map(s => s.toFixed(1)).join(', ')})`;
  assert.ok(withinBudget, failureMessage);
}
// --- Wall-clock tests (4) ---
// One wall-clock budget test per hook. Each row is
// [hook base name, stdin payload handed to the hook]; the script file is
// `<hook>.mjs` and the assertion label is `<hook> wall-clock`.
const WALL_CLOCK_CASES = [
  ['session-start', { cwd: '/tmp' }],
  ['prompt-analyzer', { prompt: 'are you sure I should do this? right?', cwd: '/tmp' }],
  ['tool-tracker', { tool_name: 'Edit', cwd: '/tmp' }],
  ['session-end', { cwd: '/tmp' }],
];
for (const [hook, stdinTemplate] of WALL_CLOCK_CASES) {
  test(`${hook}.mjs wall-clock p95 within 200ms`, () => {
    const measure = () => measureWallClock(`${hook}.mjs`, stdinTemplate);
    assertWithRetry(measure, WALL_CLOCK_P95_MS, `${hook} wall-clock`);
  });
}
// --- Logic-time tests (4) ---
// One logic-time budget test per hook. Each row is
// [hook base name, in-process fixture, extra arguments forwarded to it].
const LOGIC_TIME_CASES = [
  ['session-start', logicSessionStart, []],
  ['prompt-analyzer', logicPromptAnalyzer, ['are you sure I should do this? right?']],
  ['tool-tracker', logicToolTracker, ['Edit']],
  ['session-end', logicSessionEnd, []],
];
for (const [hook, fixture, extraArgs] of LOGIC_TIME_CASES) {
  test(`${hook} logic-time p95 within 50ms`, () => {
    const measure = () => measureLogicTime(fixture, ...extraArgs);
    assertWithRetry(measure, LOGIC_TIME_P95_MS, `${hook} logic-time`);
  });
}
// --- v1.2: cross-session read at scale ---
//
// Pre-seeds sessions.jsonl with 1000 records to exercise the realistic
// readRecentEndRecords path. Tail-first scan should bound cost regardless.
// Collects ITERATIONS wall-clock samples of session-start.mjs against a data
// directory whose sessions.jsonl already holds `recordCount` records.
//
// Records are spaced one minute apart, oldest first, ending one minute before
// "now"; every record carries both a start and an end timestamp exactly 30 s
// apart. The reference clock is read once per iteration so that spacing is
// exact (reading it per field could drift by a tick between start and end).
// Each line is newline-terminated individually, so recordCount = 0 produces
// an empty file rather than a file holding one blank line.
//
// recordCount: number of records to pre-seed.
// Returns the elapsed times in milliseconds, in run order.
function measureSessionStartWithJsonlFixture(recordCount) {
  const samples = [];
  for (let i = 0; i < ITERATIONS; i++) {
    const dir = setupDir();
    try {
      const referenceMs = Date.now();
      const lines = Array.from({ length: recordCount }, (_, r) => {
        const startMs = referenceMs - (recordCount - r) * 60_000;
        return JSON.stringify({
          session_id: `seed-${r}`,
          start: new Date(startMs).toISOString(),
          end: new Date(startMs + 30_000).toISOString(),
          duration_min: 30,
          domain_context: ['legal'],
          user_info_class: 'no',
          flags: { dependency: 0, escalation: 0, fatigue: 0, validation: 0, pushback: 0 },
        });
      });
      writeFileSync(join(dir, 'sessions.jsonl'), lines.map((line) => `${line}\n`).join(''));

      const sid = `bigfix-${i}`;
      writeFileSync(
        join(dir, 'state', `${sid}.json`),
        JSON.stringify({ start_epoch: nowEpoch(), start_iso: nowIso(), tool_count: 0, edit_count: 0 })
      );
      samples.push(runWallClock('session-start.mjs', { session_id: sid, cwd: '/tmp' }, dir));
    } finally {
      rmSync(dir, { recursive: true, force: true });
    }
  }
  return samples;
}
test('session-start with 1000-record sessions.jsonl wall-clock p95 within 200ms', () => {
  // Same wall-clock budget as the plain session-start test, but measured
  // against a sessions.jsonl pre-seeded with 1000 records. Per the original
  // note, session-start.mjs's tier-2 alert reads the tail of that file via
  // readRecentEndRecords(3), so the cost is expected to stay bounded
  // regardless of total file size.
  const SEEDED_RECORDS = 1000;
  const measure = () => measureSessionStartWithJsonlFixture(SEEDED_RECORDS);
  assertWithRetry(
    measure,
    WALL_CLOCK_P95_MS,
    'session-start wall-clock with 1000-record fixture'
  );
});