Session 5 step 20 — templates for recurring feedback patterns with VFM-compatible scoring. Adds FEEDBACK.md append-only log, PostToolUse hook that detects 3+ recurring pattern tags, and per-agent scoring that tracks trends against prior window. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
183 lines · 5.9 KiB · Bash
#!/bin/bash
# Performance scorer: per-agent metrics from FEEDBACK.md + cost-events.jsonl.
# Bash 3.2 compatible. Uses python3 for all metrics computation.
#
# Metrics per agent:
# - Average score (0-100)
# - Error rate (rows with score < threshold / total rows)
# - Cost per run (from cost-events.jsonl, rough proxy)
# - Improvement trend: avg of last 10 scores vs. previous 10
#
# Flags agents below threshold (default 60/100).
#
# Usage:
#   ./performance-scorer.sh                      # Score all agents
#   ./performance-scorer.sh --agent {{AGENT}}    # Score specific agent
#   ./performance-scorer.sh --threshold 70       # Custom threshold
#
# Placeholders:
#   {{WORKING_DIR}} - absolute path to project directory

WORKING_DIR="{{WORKING_DIR}}"
FEEDBACK_FILE="$WORKING_DIR/FEEDBACK.md"
COST_LOG="$WORKING_DIR/budget/cost-events.jsonl"

# Defaults only. NOTE: a previous revision initialized THRESHOLD from "$2",
# which picked up whatever word followed the first flag (e.g.
# `--agent builder` made THRESHOLD="builder"). The option loop below is the
# single source of truth for flag values.
THRESHOLD=60
AGENT_FILTER=""

# Parse arguments (bash 3.2 compatible -- no associative arrays).
# Guard each value-taking flag: `shift 2` with fewer than two positional
# args fails WITHOUT shifting in bash, which would spin this loop forever.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --agent)
      if [ "$#" -lt 2 ]; then
        echo "Error: --agent requires a value" >&2
        exit 1
      fi
      AGENT_FILTER="$2"; shift 2 ;;
    --threshold)
      if [ "$#" -lt 2 ]; then
        echo "Error: --threshold requires a value" >&2
        exit 1
      fi
      THRESHOLD="$2"; shift 2 ;;
    *) shift ;;  # ignore unknown arguments
  esac
done

# Validate the threshold here rather than letting int() raise a traceback
# inside the python heredoc.
case "$THRESHOLD" in
  ''|*[!0-9]*)
    echo "Error: --threshold must be a non-negative integer (got '$THRESHOLD')" >&2
    exit 1 ;;
esac

if [ ! -f "$FEEDBACK_FILE" ]; then
  echo "No feedback file found at $FEEDBACK_FILE"
  exit 0
fi
# Heredoc delimiter is deliberately unquoted: "$FEEDBACK_FILE" etc. below
# are interpolated by the shell before python3 sees the script.
python3 << PYEOF
import re, json, os, sys
from collections import defaultdict

feedback_file = "$FEEDBACK_FILE"
cost_log = "$COST_LOG"
agent_filter = "$AGENT_FILTER"

# Defensive parse: the shell validates --threshold, but if a non-numeric
# value ever leaks through, fall back to the default instead of dying
# with a traceback.
try:
    threshold = int("$THRESHOLD")
except ValueError:
    sys.stderr.write("Warning: invalid threshold '$THRESHOLD'; using 60\n")
    threshold = 60

# ---- Parse FEEDBACK.md table rows --------------------------------------
# Expected columns: Date, Pipeline, Agent, Score, Issue, Resolution, Pattern
feedback_rows = []
with open(feedback_file) as f:
    in_table = False
    for line in f:
        line = line.strip()
        if '| Date |' in line:
            # Header row: data rows follow.
            in_table = True
            continue
        if in_table and line.startswith('|---'):
            # Markdown separator row under the header.
            continue
        # Skip template rows that still contain {{placeholders}}.
        if in_table and line.startswith('|') and '{{' not in line:
            cols = [c.strip() for c in line.strip('|').split('|')]
            if len(cols) >= 7:
                try:
                    # Score cell may be "75/100" or bare "75"; take the
                    # leading integer, defaulting to 0 when absent.
                    score_m = re.match(r'(\d+)', cols[3])
                    feedback_rows.append({
                        'date': cols[0],
                        'pipeline': cols[1],
                        'agent': cols[2],
                        'score': int(score_m.group(1)) if score_m else 0,
                        'issue': cols[4],
                        'pattern': cols[6],
                    })
                except (ValueError, IndexError):
                    # Malformed row: skip rather than abort the report.
                    pass

# Filter by agent if specified
if agent_filter:
    feedback_rows = [r for r in feedback_rows if r['agent'] == agent_filter]

if not feedback_rows:
    print("No feedback rows found.")
    sys.exit(0)

# ---- Read cost events if available -------------------------------------
cost_by_agent = defaultdict(int)
if os.path.exists(cost_log):
    with open(cost_log) as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    event = json.loads(line)
                    # Event count stands in for real cost (rough proxy).
                    cost_by_agent[event.get('agent', 'unknown')] += 1
                except Exception:
                    pass  # tolerate malformed JSONL lines

# ---- Compute per-agent metrics ------------------------------------------
agents = list(set(r['agent'] for r in feedback_rows))

print("PERFORMANCE SCORECARD")
print("=" * 60)
print(f"Threshold: {threshold}/100")
print(f"Total feedback rows: {len(feedback_rows)}")
print()

flagged = []

for agent in sorted(agents):
    rows = [r for r in feedback_rows if r['agent'] == agent]
    scores = [r['score'] for r in rows]

    avg_score = sum(scores) / len(scores) if scores else 0
    error_rate = len([s for s in scores if s < threshold]) / len(scores) if scores else 0
    cost_events = cost_by_agent.get(agent, 0)
    cost_per_run = cost_events / len(rows) if rows else 0

    # Improvement trend: average of the last 10 scores vs. the 10 before.
    trend_str = "n/a (fewer than 20 runs)"
    if len(scores) >= 20:
        prev10 = scores[-20:-10]
        last10 = scores[-10:]
        delta = sum(last10) / len(last10) - sum(prev10) / len(prev10)
        if delta > 5:
            trend_str = f"improving (+{delta:.1f})"
        elif delta < -5:
            trend_str = f"declining ({delta:.1f})"
        else:
            trend_str = f"stable ({delta:+.1f})"
    elif len(scores) >= 10:
        last10 = scores[-10:]
        trend_str = f"recent avg: {sum(last10)/len(last10):.1f} (need 20 runs for trend)"

    # Top 3 recurring pattern tags for this agent.
    patterns = defaultdict(int)
    for r in rows:
        if r['pattern']:
            patterns[r['pattern']] += 1
    top_patterns = sorted(patterns.items(), key=lambda x: -x[1])[:3]

    print(f"Agent: {agent}")
    print(f" Runs: {len(rows)}")
    print(f" Avg score: {avg_score:.1f}/100")
    print(f" Error rate: {error_rate*100:.0f}% (score < {threshold})")
    print(f" Cost/run: ~{cost_per_run:.1f} events (rough proxy)")
    print(f" Trend: {trend_str}")
    if top_patterns:
        print(f" Top patterns: {', '.join(f'{p}({c})' for p, c in top_patterns)}")
    print()

    if avg_score < threshold:
        flagged.append((agent, avg_score))

# ---- Summary of flagged agents ------------------------------------------
if flagged:
    print("FLAGGED AGENTS (below threshold)")
    print("-" * 40)
    for agent, avg in flagged:
        print(f" {agent}: avg {avg:.1f} < {threshold}")
    print()
    print("Recommended actions:")
    print(" 1. Review feedback rows for top patterns")
    print(" 2. Iterate on agent system prompt")
    print(" 3. Consider pipeline redesign if pattern is structural")
    print(" 4. Run pipeline-optimizer.sh for bottleneck analysis")
else:
    print("All agents above threshold.")
PYEOF