feat(ai-psychosis): add validation-seeking detector
This commit is contained in:
parent
ca6567b501
commit
b88cd8a978
2 changed files with 182 additions and 0 deletions
|
|
@ -232,6 +232,29 @@ const userInfoNoPatterns = [
|
||||||
/\bjust me (?:and|with) (?:my|the) (?:thoughts|head|computer|claude)\b/i,
|
/\bjust me (?:and|with) (?:my|the) (?:thoughts|head|computer|claude)\b/i,
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// v1.2: Validation-seeking patterns (paper Figure A2 — pressing for validation).
|
||||||
|
// Distinct from existing val_flags ("right?" tic) — valseek targets pre-committed
|
||||||
|
// stances and reality-testing rather than casual confirmation tics.
|
||||||
|
// Regexes that flag a prompt as validation-seeking. A prompt counts as a hit
// when at least one entry matches. All entries are case-insensitive and none
// use the /g or /y flags, so repeated .test() calls carry no lastIndex state.
// Apostrophes are optional and may be straight (') or typographic (’), since
// phone/desktop autocorrect commonly substitutes the latter.
const valseekPatterns = [
  // Tag-questions pressing for agreement — require a "?" later in the same
  // clause (no intervening . ! ?) so flat statements like
  // "this isn't that bad" do not match.
  /\bisn['’]?t (?:it|that|she|he|this|true)\b[^.!?]*\?/i,
  /\bdon['’]?t you (?:think|agree|see)\b[^.!?]*\?/i,
  /\bright,?\s+(?:though|so)\b[^.!?]*\?/i,

  // Reality-testing — am-I-the-only-one
  /\bam i (?:crazy|wrong|the only one|imagining)\b/i,
  /\btell me i['’]?m not (?:crazy|wrong|imagining)\b/i,
  /\bis it (?:normal|crazy|reasonable) (?:to|that|for)\b/i,

  // Side-taking pressing. The trailing \b keeps "rightly"/"rightfully" from
  // matching; a following "?" is allowed but not required.
  /\byou agree,?\s+right\b/i,
  /\btell me i['’]?m right\b/i,
  /\bback me up (?:on this|here)\b/i,

  // Pre-committed stance + confirmation. The confirmation word is bounded on
  // BOTH sides so substrings such as "bright", "copyright" or "incorrect"
  // are not mistaken for "right"/"correct".
  /\bi (?:already|just) (?:decided|knew|know).*\b(?:should|right|correct)\b/i,
  /\bI['’]?ve made up my mind.*\b(?:right|correct|good)\b/i,
  /\bI know I['’]?m right (?:about|on) (?:this|that)\b/i,
];
|
||||||
|
|
||||||
for (const p of depPatterns) { if (p.test(prompt)) { depHit = 1; break; } }
|
for (const p of depPatterns) { if (p.test(prompt)) { depHit = 1; break; } }
|
||||||
for (const p of escPatterns) { if (p.test(prompt)) { escHit = 1; break; } }
|
for (const p of escPatterns) { if (p.test(prompt)) { escHit = 1; break; } }
|
||||||
for (const p of fatPatterns) { if (p.test(prompt)) { fatHit = 1; break; } }
|
for (const p of fatPatterns) { if (p.test(prompt)) { fatHit = 1; break; } }
|
||||||
|
|
@ -256,6 +279,10 @@ let userInfoPeopleHit = 0; for (const p of userInfoPeoplePatterns) { if (p.tes
|
||||||
let userInfoDigitalHit = 0; for (const p of userInfoDigitalPatterns) { if (p.test(prompt)) { userInfoDigitalHit = 1; break; } }
|
let userInfoDigitalHit = 0; for (const p of userInfoDigitalPatterns) { if (p.test(prompt)) { userInfoDigitalHit = 1; break; } }
|
||||||
let userInfoNoHit = 0; for (const p of userInfoNoPatterns) { if (p.test(prompt)) { userInfoNoHit = 1; break; } }
|
let userInfoNoHit = 0; for (const p of userInfoNoPatterns) { if (p.test(prompt)) { userInfoNoHit = 1; break; } }
|
||||||
|
|
||||||
|
// v1.2: Validation-seeking detection — distinct from val_flags. valseekHit is
|
||||||
|
// a 0/1 indicator: 1 as soon as any valseek pattern matches this prompt.
|
||||||
|
// 1 when at least one valseek pattern matches the prompt, otherwise 0.
let valseekHit = valseekPatterns.some((pattern) => pattern.test(prompt)) ? 1 : 0;
|
||||||
|
|
||||||
// Clear prompt from memory
|
// Clear prompt from memory
|
||||||
prompt = '';
|
prompt = '';
|
||||||
|
|
||||||
|
|
@ -308,6 +335,13 @@ if (candidate) {
|
||||||
}
|
}
|
||||||
state.user_info_class = nextClass;
|
state.user_info_class = nextClass;
|
||||||
|
|
||||||
|
// v1.2: validation-seeking accumulator. On a hit, valseek_count grows by one
// (a missing or non-numeric stored value is treated as 0) and valseek_flag is
// set to 1. Nothing in this block ever clears the flag once it is set.
if (valseekHit) {
  const previousCount = Number(state.valseek_count) || 0;
  state.valseek_count = previousCount + 1;
  state.valseek_flag = 1;
}
|
||||||
|
|
||||||
// v1.2: domain_context is always an array. Coerce v1.1.0 string shape on read.
|
// v1.2: domain_context is always an array. Coerce v1.1.0 string shape on read.
|
||||||
const anyDomainHit = domainHit
|
const anyDomainHit = domainHit
|
||||||
|| domainLegalHit || domainParentingHit || domainHealthHit
|
|| domainLegalHit || domainParentingHit || domainHealthHit
|
||||||
|
|
|
||||||
148
plugins/ai-psychosis/tests/validation-seeking.test.mjs
Normal file
148
plugins/ai-psychosis/tests/validation-seeking.test.mjs
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
// validation-seeking.test.mjs — verifies v1.2 validation-seeking detector.
|
||||||
|
//
|
||||||
|
// Distinct from existing val_flags ("right?" tic). valseek targets:
|
||||||
|
// - tag-questions pressing for agreement
|
||||||
|
// - reality-testing ("am I crazy?", "is it normal?")
|
||||||
|
// - side-taking pressing ("back me up")
|
||||||
|
// - pre-committed stance + confirmation
|
||||||
|
//
|
||||||
|
// Step 11 will add the domain-gated alert; this file currently locks
|
||||||
|
// detection + count accumulation semantics.
|
||||||
|
|
||||||
|
import { describe, it, afterEach } from 'node:test';
|
||||||
|
import assert from 'node:assert/strict';
|
||||||
|
import { runHook, setupTestDir, cleanupTestDir, createStateFile, readState } from './test-helper.mjs';
|
||||||
|
|
||||||
|
// Scratch directory of the test that is currently running. It is assigned by
// whichever test (or helper) calls setupTestDir().
let dir;

afterEach(() => {
  if (!dir) return;
  cleanupTestDir(dir);
  // Forget the handle so a later test that never calls setupTestDir() does
  // not trigger a second cleanup of a directory that is already gone.
  dir = undefined;
});
|
||||||
|
|
||||||
|
/**
 * Builds a brand-new session-state fixture with every counter and flag zeroed.
 * start_epoch is placed 60 seconds before the current time; start_iso is a
 * fixed timestamp string. A new object (including a new user_info_flags
 * object) is returned on every call.
 *
 * @returns {object} the state fixture
 */
function freshState() {
  const nowSeconds = Math.floor(Date.now() / 1000);
  const userInfoFlags = { yes_people: 0, yes_digital: 0, no: 0 };

  return {
    start_epoch: nowSeconds - 60,
    start_iso: '2026-05-01T10:00:00Z',
    tool_count: 0,
    edit_count: 0,
    last_event_epoch: 0,
    burst_count: 0,
    dep_flags: 0,
    esc_flags: 0,
    fatigue_flags: 0,
    val_flags: 0,
    pushback_count: 0,
    domain_context: null,
    user_info_class: null,
    user_info_flags: userInfoFlags,
    turn_count: 0,
    valseek_count: 0,
    valseek_flag: 0,
    last_warning_epoch: 0,
  };
}
|
||||||
|
|
||||||
|
/**
 * Runs the prompt-analyzer hook once against a newly created state file for
 * session 'v1' and returns the state read back afterwards.
 *
 * Side effect: assigns the module-level `dir` to the directory returned by
 * setupTestDir().
 *
 * @param {string} prompt - prompt text passed to the hook
 * @param {object} [stateOverrides={}] - fields layered on top of freshState()
 * @returns {object} whatever readState() returns for the session
 */
function runPrompt(prompt, stateOverrides = {}) {
  const sessionId = 'v1';

  dir = setupTestDir();
  const initialState = Object.assign(freshState(), stateOverrides);
  createStateFile(dir, sessionId, initialState);

  const hookInput = { session_id: sessionId, prompt };
  runHook('prompt-analyzer.mjs', hookInput, dir);

  return readState(dir, sessionId);
}
|
||||||
|
|
||||||
|
// --- Pattern detection ---
|
||||||
|
|
||||||
|
// Tag-question prompts: each must register exactly one valseek hit. Test
// titles quote the tag that the prompt actually contains.
describe('valseek: tag-questions pressing for agreement', () => {
  it('matches "isn\'t she?"', () => {
    const s = runPrompt("she's wrong, isn't she?");
    assert.equal(s.valseek_count, 1);
    // The flag is set together with the first counted hit.
    assert.equal(s.valseek_flag, 1);
  });

  it('matches "don\'t you think?"', () => {
    const s = runPrompt("this approach is better, don't you think?");
    assert.equal(s.valseek_count, 1);
  });

  it('matches "right, though?"', () => {
    const s = runPrompt("I should leave him, right, though?");
    assert.equal(s.valseek_count, 1);
  });
});
|
||||||
|
|
||||||
|
// Reality-testing prompts: every row is [test title, prompt] and each prompt
// must leave valseek_count at exactly 1.
describe('valseek: reality-testing patterns', () => {
  const cases = [
    [`matches "am I crazy"`, "am I crazy for thinking this?"],
    [`matches "tell me I'm not crazy"`, "tell me I'm not crazy for feeling betrayed"],
    [`matches "is it normal to"`, "is it normal to feel this way after years?"],
  ];

  for (const [title, prompt] of cases) {
    it(title, () => {
      const state = runPrompt(prompt);
      assert.equal(state.valseek_count, 1);
    });
  }
});
|
||||||
|
|
||||||
|
// Side-taking prompts: every row is [test title, prompt] and each prompt must
// leave valseek_count at exactly 1.
describe('valseek: side-taking pressing', () => {
  const cases = [
    [`matches "you agree, right?"`, "you agree, right?"],
    [`matches "back me up here"`, "she lied to me — back me up here"],
  ];

  for (const [title, prompt] of cases) {
    it(title, () => {
      const state = runPrompt(prompt);
      assert.equal(state.valseek_count, 1);
    });
  }
});
|
||||||
|
|
||||||
|
// Pre-committed-stance prompts: every row is [test title, prompt] and each
// prompt must leave valseek_count at exactly 1.
describe('valseek: pre-committed stance', () => {
  const cases = [
    [`matches "I already decided ... right"`, "I already decided to quit, that's right?"],
    [`matches "I know I'm right about this"`, "I know I'm right about this whole thing"],
  ];

  for (const [title, prompt] of cases) {
    it(title, () => {
      const state = runPrompt(prompt);
      assert.equal(state.valseek_count, 1);
    });
  }
});
|
||||||
|
|
||||||
|
// --- Negative cases ---
|
||||||
|
|
||||||
|
// Prompts that must NOT register a valseek hit.
describe('valseek: false-positive guards', () => {
  it('does NOT match casual "right?" tic alone', () => {
    const s = runPrompt('the function returns true, right?');
    // Casual right? hits the existing val_flags pattern but NOT valseek.
    assert.equal(s.valseek_count, 0);
  });

  it('does NOT match technical question without pressing pattern', () => {
    const s = runPrompt('what does this regex do?');
    assert.equal(s.valseek_count, 0);
  });

  it('does NOT match a flat statement containing a tag-question stem', () => {
    // "isn't that" appears, but there is no "?" in the clause, so this is a
    // plain statement rather than a request for agreement.
    const s = runPrompt("this isn't that bad");
    assert.equal(s.valseek_count, 0);
  });
});
|
||||||
|
|
||||||
|
// --- Accumulation ---
|
||||||
|
|
||||||
|
// Multi-prompt behaviour: the counter grows once per matching prompt and the
// flag, once set, survives later prompts that do not match.
describe('valseek: count accumulation', () => {
  // Creates a fresh state file for `sessionId`, sends each prompt through the
  // hook in order, and returns the state read back at the end.
  const feedPrompts = (sessionId, prompts) => {
    dir = setupTestDir();
    createStateFile(dir, sessionId, freshState());
    prompts.forEach((prompt) => {
      runHook('prompt-analyzer.mjs', { session_id: sessionId, prompt }, dir);
    });
    return readState(dir, sessionId);
  };

  it('accumulates across multiple prompts', () => {
    const finalState = feedPrompts('v-acc', [
      "am I crazy for staying?",
      "you agree, right?",
      "isn't she wrong?",
      "I know I'm right on this",
      "tell me I'm not crazy",
    ]);

    assert.equal(finalState.valseek_count, 5);
    assert.equal(finalState.valseek_flag, 1);
  });

  it('valseek_flag is sticky once set, even if later prompt has no hit', () => {
    const finalState = feedPrompts('v-sticky', ['am I crazy?', 'refactor this code']);

    assert.equal(finalState.valseek_count, 1, 'count is unchanged by later non-matching prompt');
    assert.equal(finalState.valseek_flag, 1, 'flag stays 1 once set');
  });
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue