agent-builder/scripts/templates/feedback/performance-scorer.sh
Kjell Tore Guttormsen d743ec7fbf feat(templates): add feedback loop and performance scoring templates
Session 5 step 20 — templates for recurring feedback patterns with
VFM-compatible scoring. Adds FEEDBACK.md append-only log, PostToolUse
hook that detects 3+ recurring pattern tags, and per-agent scoring
that tracks trends against prior window.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 06:51:37 +02:00

#!/bin/bash
# Performance scorer: per-agent metrics from FEEDBACK.md + cost-events.jsonl.
# Bash 3.2 compatible. Uses python3 for all metrics computation.
#
# Metrics per agent:
#   - Average score (0-100)
#   - Error rate (rows with score < threshold / total rows)
#   - Cost per run (from cost-events.jsonl, rough proxy)
#   - Improvement trend: avg of last 10 scores vs. previous 10
#
# Flags agents below threshold (default 60/100).
#
# Usage:
#   ./performance-scorer.sh                     # Score all agents
#   ./performance-scorer.sh --agent {{AGENT}}   # Score specific agent
#   ./performance-scorer.sh --threshold 70      # Custom threshold
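#   ./performance-scorer.sh --agent {{AGENT}} --threshold 70   # Flags can be combined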
#
# Placeholders:
#   {{WORKING_DIR}} - absolute path to project directory
WORKING_DIR="{{WORKING_DIR}}"
FEEDBACK_FILE="$WORKING_DIR/FEEDBACK.md"
COST_LOG="$WORKING_DIR/budget/cost-events.jsonl"
THRESHOLD=60   # default; overridden by --threshold below
AGENT_FILTER=""

# Parse arguments (bash 3.2 compatible -- no associative arrays)
while [ "$#" -gt 0 ]; do
  case "$1" in
    --agent) AGENT_FILTER="$2"; shift 2 || break ;;
    --threshold) THRESHOLD="$2"; shift 2 || break ;;
    *) shift ;;
  esac
done

if [ ! -f "$FEEDBACK_FILE" ]; then
  echo "No feedback file found at $FEEDBACK_FILE"
  exit 0
fi
python3 << PYEOF
import re, json, os, sys
from collections import defaultdict
feedback_file = "$FEEDBACK_FILE"
cost_log = "$COST_LOG"
threshold = int("$THRESHOLD")
agent_filter = "$AGENT_FILTER"
# Parse FEEDBACK.md table rows
# Expected columns: Date, Pipeline, Agent, Score, Issue, Resolution, Pattern
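# Illustrative row (values invented for the example; real rows come from the FEEDBACK.md template):
#   | 2026-04-10 | review | code-reviewer | 72/100 | Missed edge case | Added test | recurring:edge-cases |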
feedback_rows = []
with open(feedback_file) as f:
    in_table = False
    header_seen = False
    for line in f:
        line = line.strip()
        if '| Date |' in line:
            in_table = True
            header_seen = True
            continue
        if in_table and line.startswith('|---'):
            continue
        if in_table and line.startswith('|') and '{{' not in line and header_seen:
            cols = [c.strip() for c in line.strip('|').split('|')]
            if len(cols) >= 7:
                try:
                    date = cols[0]
                    pipeline = cols[1]
                    agent = cols[2]
                    score_str = cols[3]
                    issue = cols[4]
                    resolution = cols[5]
                    pattern = cols[6]
                    # Parse score: "75/100" or "75"
                    score_m = re.match(r'(\d+)', score_str)
                    score = int(score_m.group(1)) if score_m else 0
                    feedback_rows.append({
                        'date': date,
                        'pipeline': pipeline,
                        'agent': agent,
                        'score': score,
                        'issue': issue,
                        'pattern': pattern
                    })
                except (ValueError, IndexError):
                    pass
# Filter by agent if specified
if agent_filter:
    feedback_rows = [r for r in feedback_rows if r['agent'] == agent_filter]

if not feedback_rows:
    print("No feedback rows found.")
    sys.exit(0)
# Read cost events if available
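# Assumed line shape (only the 'agent' key is read here; other keys are ignored):
#   {"ts": "2026-04-12T06:51:37+02:00", "agent": "code-reviewer", "event": "PostToolUse"}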
cost_by_agent = defaultdict(int)
if os.path.exists(cost_log):
    with open(cost_log) as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    event = json.loads(line)
                    agent = event.get('agent', 'unknown')
                    cost_by_agent[agent] += 1  # event count as proxy
                except Exception:
                    pass
# Compute per-agent metrics
agents = list(set(r['agent'] for r in feedback_rows))

print("PERFORMANCE SCORECARD")
print("=" * 60)
print(f"Threshold: {threshold}/100")
print(f"Total feedback rows: {len(feedback_rows)}")
print()

flagged = []
for agent in sorted(agents):
    rows = [r for r in feedback_rows if r['agent'] == agent]
    scores = [r['score'] for r in rows]
    avg_score = sum(scores) / len(scores) if scores else 0
    error_rate = len([s for s in scores if s < threshold]) / len(scores) if scores else 0
    cost_events = cost_by_agent.get(agent, 0)
    cost_per_run = cost_events / len(rows) if rows else 0
    # Improvement trend: last 10 vs. prev 10
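    # e.g. prev10 avg 61.0, last10 avg 67.5 -> delta +6.5 -> "improving (+6.5)";
    # a delta within +/-5 points is reported as "stable".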
    trend_str = "n/a (fewer than 20 runs)"
    if len(scores) >= 20:
        prev10 = scores[-20:-10]
        last10 = scores[-10:]
        prev_avg = sum(prev10) / len(prev10)
        last_avg = sum(last10) / len(last10)
        delta = last_avg - prev_avg
        if delta > 5:
            trend_str = f"improving (+{delta:.1f})"
        elif delta < -5:
            trend_str = f"declining ({delta:.1f})"
        else:
            trend_str = f"stable ({delta:+.1f})"
    elif len(scores) >= 10:
        last10 = scores[-10:]
        trend_str = f"recent avg: {sum(last10)/len(last10):.1f} (need 20 runs for trend)"

    # Pattern frequency
    patterns = defaultdict(int)
    for r in rows:
        if r['pattern']:
            patterns[r['pattern']] += 1
    top_patterns = sorted(patterns.items(), key=lambda x: -x[1])[:3]

    print(f"Agent: {agent}")
    print(f" Runs: {len(rows)}")
    print(f" Avg score: {avg_score:.1f}/100")
    print(f" Error rate: {error_rate*100:.0f}% (score < {threshold})")
    print(f" Cost/run: ~{cost_per_run:.1f} events (rough proxy)")
    print(f" Trend: {trend_str}")
    if top_patterns:
        print(f" Top patterns: {', '.join(f'{p}({c})' for p, c in top_patterns)}")
    print()

    if avg_score < threshold:
        flagged.append((agent, avg_score))
# Summary of flagged agents
if flagged:
    print("FLAGGED AGENTS (below threshold)")
    print("-" * 40)
    for agent, avg in flagged:
        print(f" {agent}: avg {avg:.1f} < {threshold}")
    print()
    print("Recommended actions:")
    print(" 1. Review feedback rows for top patterns")
    print(" 2. Iterate on agent system prompt")
    print(" 3. Consider pipeline redesign if pattern is structural")
    print(" 4. Run pipeline-optimizer.sh for bottleneck analysis")
else:
    print("All agents above threshold.")
PYEOF
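
# Example output (illustrative values only; 14 runs, so no 20-run trend yet):
#   PERFORMANCE SCORECARD
#   ============================================================
#   Threshold: 60/100
#   Total feedback rows: 14
#
#   Agent: code-reviewer
#    Runs: 14
#    Avg score: 71.3/100
#    Error rate: 21% (score < 60)
#    Cost/run: ~2.1 events (rough proxy)
#    Trend: recent avg: 74.0 (need 20 runs for trend)
#    Top patterns: recurring:edge-cases(4), recurring:scope-creep(2)
#
#   All agents above threshold.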