ktg-plugin-marketplace/plugins/voyage/lib/parsers/profile-jaccard.mjs

// lib/parsers/profile-jaccard.mjs
// String-normalisation helper for cross-tier Jaccard smoke-test (Step 18).
//
// Plan steps from different model tiers (sonnet vs opus) often differ
// only in punctuation, casing, or trivial wording (`logger.info` vs
// "logger.info" vs `logger info`). To avoid trivial false-negatives in
// cross-tier Jaccard, every step title passes through `normalizeStep`
// before set membership is computed.
//
// Normalisation rules (per research/02 §3.4):
//   1. Lowercase the entire string.
//   2. Strip backtick and parenthesis characters (the text they enclose is kept).
//   3. Collapse runs of whitespace to a single space.
//   4. Trim leading + trailing whitespace.
//
// We do NOT stem or lemmatize — that would over-normalize and mask real
// disagreement (e.g. "Add tests for X" vs "Verify tests for X" should
// remain distinct).

/**
 * Canonicalise one step title so cosmetic differences do not affect
 * set membership.
 *
 * Transformations are applied in this order: lowercase, delete every
 * backtick and parenthesis character, collapse each whitespace run to a
 * single space, trim both ends.
 *
 * @param {unknown} step - Candidate step title.
 * @returns {string} The normalised title, or '' when `step` is not a string.
 */
export function normalizeStep(step) {
  if (typeof step === 'string') {
    const lowered = step.toLowerCase();
    // Only the delimiter characters are removed; enclosed text survives.
    const withoutDelimiters = lowered.replace(/[`()]/g, '');
    // Collapse after stripping so gaps left by removed delimiters merge.
    const singleSpaced = withoutDelimiters.replace(/\s+/g, ' ');
    return singleSpaced.trim();
  }
  return '';
}

/**
 * Normalise a list of step titles, discarding entries that normalise to
 * the empty string.
 *
 * @param {string[]} steps - Step titles; any non-array value is treated as
 *   an empty list.
 * @returns {string[]} Normalised, non-empty titles in their original order.
 */
export function normalizeSteps(steps) {
  const kept = [];
  if (Array.isArray(steps)) {
    for (const rawTitle of steps) {
      const title = normalizeStep(rawTitle);
      // Titles that normalise to '' carry no information; drop them.
      if (title.length > 0) {
        kept.push(title);
      }
    }
  }
  return kept;
}

/**
 * Check whether two step lists have similar lengths.
 *
 * The ratio is |a - b| / max(a, b), where a and b are the list lengths.
 * A non-array argument counts as length 0. If either length is 0 the check
 * fails with ratio 0 and an "empty input" message.
 *
 * @param {string[]} stepsA - First list of steps.
 * @param {string[]} stepsB - Second list of steps.
 * @param {number} tolerance - Maximum allowed ratio, inclusive
 *   (default 0.20 = ±20%).
 * @returns {{ok: boolean, ratio: number, message: string}} Verdict, the
 *   computed ratio, and a human-readable summary.
 */
export function checkStepCountParity(stepsA, stepsB, tolerance = 0.2) {
  const lengthOf = (steps) => (Array.isArray(steps) ? steps.length : 0);
  const countA = lengthOf(stepsA);
  const countB = lengthOf(stepsB);

  // Guard first: an empty side would otherwise divide by zero or be meaningless.
  if (countA === 0 || countB === 0) {
    const message = `step-count parity failed: empty input (a=${countA}, b=${countB})`;
    return { ok: false, ratio: 0, message };
  }

  const larger = Math.max(countA, countB);
  const ratio = Math.abs(countA - countB) / larger;
  const shownRatio = ratio.toFixed(3);

  if (ratio <= tolerance) {
    return {
      ok: true,
      ratio,
      message: `step-count parity OK (a=${countA}, b=${countB}, ratio=${shownRatio})`,
    };
  }
  return {
    ok: false,
    ratio,
    message: `step-count parity exceeded ${tolerance}: a=${countA}, b=${countB}, ratio=${shownRatio}`,
  };
}