#!/bin/bash
# Performance scorer: per-agent metrics from FEEDBACK.md + cost-events.jsonl.
# Bash 3.2 compatible. Uses python3 for all metrics computation.
#
# Metrics per agent:
#   - Average score (0-100)
#   - Error rate (rows with score < threshold / total rows)
#   - Cost per run (from cost-events.jsonl, rough proxy)
#   - Improvement trend: avg of last 10 scores vs. previous 10
#
# Flags agents below threshold (default 60/100).
#
# Usage:
#   ./performance-scorer.sh                    # Score all agents
#   ./performance-scorer.sh --agent {{AGENT}}  # Score specific agent
#   ./performance-scorer.sh --threshold 70     # Custom threshold
#
# Placeholders:
#   {{WORKING_DIR}} - absolute path to project directory

WORKING_DIR="{{WORKING_DIR}}"
FEEDBACK_FILE="$WORKING_DIR/FEEDBACK.md"
COST_LOG="$WORKING_DIR/budget/cost-events.jsonl"
THRESHOLD=60  # default; overridden by --threshold below
AGENT_FILTER=""

# Parse arguments (bash 3.2 compatible -- no associative arrays)
while [ "$#" -gt 0 ]; do
  case "$1" in
    --agent) AGENT_FILTER="$2"; shift 2 ;;
    --threshold) THRESHOLD="$2"; shift 2 ;;
    *) shift ;;
  esac
done

if [ ! -f "$FEEDBACK_FILE" ]; then
  echo "No feedback file found at $FEEDBACK_FILE"
  exit 0
fi

python3 << PYEOF
import re, json, os, sys
from collections import defaultdict

feedback_file = "$FEEDBACK_FILE"
cost_log = "$COST_LOG"
threshold = int("$THRESHOLD")
agent_filter = "$AGENT_FILTER"

# Parse FEEDBACK.md table rows
# Expected columns: Date, Pipeline, Agent, Score, Issue, Resolution, Pattern
feedback_rows = []
with open(feedback_file) as f:
    in_table = False
    header_seen = False
    for line in f:
        line = line.strip()
        if '| Date |' in line:
            in_table = True
            header_seen = True
            continue
        if in_table and line.startswith('|---'):
            continue
        if in_table and line.startswith('|') and '{{' not in line and header_seen:
            cols = [c.strip() for c in line.strip('|').split('|')]
            if len(cols) >= 7:
                try:
                    date = cols[0]
                    pipeline = cols[1]
                    agent = cols[2]
                    score_str = cols[3]
                    issue = cols[4]
                    resolution = cols[5]
                    pattern = cols[6]
                    # Parse score: "75/100" or "75"
                    score_m = re.match(r'(\d+)', score_str)
                    score = int(score_m.group(1)) if score_m else 0
                    feedback_rows.append({
                        'date': date,
                        'pipeline': pipeline,
                        'agent': agent,
                        'score': score,
                        'issue': issue,
                        'pattern': pattern
                    })
                except (ValueError, IndexError):
                    pass

# Filter by agent if specified
if agent_filter:
    feedback_rows = [r for r in feedback_rows if r['agent'] == agent_filter]

if not feedback_rows:
    print("No feedback rows found.")
    sys.exit(0)

# Read cost events if available
cost_by_agent = defaultdict(int)
if os.path.exists(cost_log):
    with open(cost_log) as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    event = json.loads(line)
                    agent = event.get('agent', 'unknown')
                    cost_by_agent[agent] += 1  # event count as proxy
                except Exception:
                    pass

# Compute per-agent metrics
agents = list(set(r['agent'] for r in feedback_rows))

print("PERFORMANCE SCORECARD")
print("=" * 60)
print(f"Threshold: {threshold}/100")
print(f"Total feedback rows: {len(feedback_rows)}")
print()

flagged = []
for agent in sorted(agents):
    rows = [r for r in feedback_rows if r['agent'] == agent]
    scores = [r['score'] for r in rows]
    avg_score = sum(scores) / len(scores) if scores else 0
    error_rate = len([s for s in scores if s < threshold]) / len(scores) if scores else 0
    cost_events = cost_by_agent.get(agent, 0)
    cost_per_run = cost_events / len(rows) if rows else 0

    # Improvement trend: last 10 vs. prev 10
    trend_str = "n/a (fewer than 20 runs)"
    if len(scores) >= 20:
        prev10 = scores[-20:-10]
        last10 = scores[-10:]
        prev_avg = sum(prev10) / len(prev10)
        last_avg = sum(last10) / len(last10)
        delta = last_avg - prev_avg
        if delta > 5:
            trend_str = f"improving (+{delta:.1f})"
        elif delta < -5:
            trend_str = f"declining ({delta:.1f})"
        else:
            trend_str = f"stable ({delta:+.1f})"
    elif len(scores) >= 10:
        last10 = scores[-10:]
        trend_str = f"recent avg: {sum(last10)/len(last10):.1f} (need 20 runs for trend)"

    # Pattern frequency
    patterns = defaultdict(int)
    for r in rows:
        if r['pattern']:
            patterns[r['pattern']] += 1
    top_patterns = sorted(patterns.items(), key=lambda x: -x[1])[:3]

    print(f"Agent: {agent}")
    print(f"  Runs: {len(rows)}")
    print(f"  Avg score: {avg_score:.1f}/100")
    print(f"  Error rate: {error_rate*100:.0f}% (score < {threshold})")
    print(f"  Cost/run: ~{cost_per_run:.1f} events (rough proxy)")
    print(f"  Trend: {trend_str}")
    if top_patterns:
        print(f"  Top patterns: {', '.join(f'{p}({c})' for p, c in top_patterns)}")
    print()

    if avg_score < threshold:
        flagged.append((agent, avg_score))

# Summary of flagged agents
if flagged:
    print("FLAGGED AGENTS (below threshold)")
    print("-" * 40)
    for agent, avg in flagged:
        print(f"  {agent}: avg {avg:.1f} < {threshold}")
    print()
    print("Recommended actions:")
    print("  1. Review feedback rows for top patterns")
    print("  2. Iterate on agent system prompt")
    print("  3. Consider pipeline redesign if pattern is structural")
    print("  4. Run pipeline-optimizer.sh for bottleneck analysis")
else:
    print("All agents above threshold.")
PYEOF
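
# Example of the FEEDBACK.md table this script parses. Column order matches
# the "Expected columns" comment in the parser above; the row values below
# are hypothetical, shown only to illustrate the layout and the "75/100"
# score format the regex accepts:
#
#   | Date | Pipeline | Agent | Score | Issue | Resolution | Pattern |
#   |------|----------|-------|-------|-------|------------|---------|
#   | 2025-01-15 | daily-brief | writer | 75/100 | missed citation | added source check | vague-citations |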
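#
# Example cost-events.jsonl line (fields other than "agent" are hypothetical;
# the script reads only the "agent" key and counts one event per line as a
# rough cost proxy):
#
#   {"agent": "writer", "event": "llm_call", "tokens": 1542}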