diff --git a/scripts/templates/feedback/FEEDBACK.md b/scripts/templates/feedback/FEEDBACK.md
new file mode 100644
index 0000000..eff6721
--- /dev/null
+++ b/scripts/templates/feedback/FEEDBACK.md
@@ -0,0 +1,28 @@
+# Feedback Log: {{PROJECT_NAME}}
+
+> Append-only. One row per pipeline run. Reviewed by performance-scorer.sh.
+
+## Feedback Table
+
+| Date | Pipeline | Agent | Score | Issue | Resolution | Pattern |
+|------|----------|-------|-------|-------|------------|---------|
+| {{DATE}} | {{PIPELINE_NAME}} | {{AGENT_NAME}} | {{SCORE}}/100 | {{ISSUE_DESCRIPTION}} | {{RESOLUTION}} | {{PATTERN_TAG}} |
+
+## Pattern Tags
+
+Use consistent tags so performance-scorer.sh can detect recurring issues:
+
+- `quality-low` — output below acceptance threshold
+- `loop-excess` — more revision iterations than expected
+- `timeout` — agent exceeded time budget
+- `tool-fail` — tool call failed or returned unexpected result
+- `cost-spike` — single run cost exceeded 3x average
+- `scope-drift` — agent worked outside defined scope
+- `hallucination` — output contained factual errors
+
+## Notes
+
+Scores are 0–100 as assigned by the reviewer agent or human reviewer.
+A score below 60 triggers a flag in performance-scorer.sh.
+Three or more rows with the same Pattern tag = recurring issue.
+Recurring issues should drive prompt iteration or pipeline redesign.
diff --git a/scripts/templates/feedback/README.md b/scripts/templates/feedback/README.md
new file mode 100644
index 0000000..0ae83d3
--- /dev/null
+++ b/scripts/templates/feedback/README.md
@@ -0,0 +1,73 @@
+# Feedback Loop
+
+Systematic feedback collection and performance scoring for agent pipelines.
+
+## How it works
+
+1. After each pipeline run, a reviewer agent (or human) assigns a score (0–100)
+   and categorizes any issues with a pattern tag.
+2. `feedback-collector.sh` runs as a PostToolUse hook on `review_pipeline` or
+   `score_output` tool calls. It appends a row to `FEEDBACK.md`.
+3. When 3+ rows share the same pattern tag, a recurring-pattern alert fires.
+4. `performance-scorer.sh` reads `FEEDBACK.md` and `budget/cost-events.jsonl`
+   to compute per-agent metrics: average score, error rate, cost per run,
+   improvement trend (last 10 vs. previous 10 runs).
+5. Agents scoring below the threshold (default 60/100) are flagged for review.
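+
+For step 2 to record anything useful, the reviewer's tool result has to carry
+the score in a form `feedback-collector.sh` can parse. A JSON result is the
+most reliable; the field names below are the ones the collector reads, and the
+values are only illustrative:
+
+```json
+{
+  "agent": "agent-writer",
+  "score": 72,
+  "issue": "Missing edge-case coverage",
+  "resolution": "Added a test matrix section",
+  "pattern": "quality-low"
+}
+```
+
+If the result is not valid JSON, the collector falls back to scanning plain
+text for `score: N` and `pattern: <tag>`.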
+
+## Pattern tags
+
+Consistent tags are required for pattern detection to work. Use the tags
+defined in `FEEDBACK.md`. Add project-specific tags as needed — but be
+consistent. Inconsistent tagging produces false negatives.
+
+## Scoring → self-improvement connection
+
+Feedback scores are the input to VFM (Value-for-Money) pre-scoring
+defined in `scripts/templates/proactive/VFM-SCORING.md` (Step 11).
+A low-scoring agent gets a lower VFM pre-score for future pipeline tasks,
+making it less likely to be selected until its performance improves.
+
+The feedback loop closes the improvement cycle:
+1. Pipeline runs → reviewer assigns score + pattern tag
+2. `feedback-collector.sh` appends to FEEDBACK.md
+3. `performance-scorer.sh` flags underperforming agents
+4. Developer reviews top patterns → iterates on agent prompt
+5. New runs produce new feedback → trend shows improvement
+6. VFM scores update automatically on next pipeline selection
+
+## Example: prompt iteration driven by feedback
+
+Suppose `agent-writer` repeatedly scores 45/100 with pattern `quality-low`:
+
+```
+| 2025-01-10 | doc-pipeline | agent-writer | 45/100 | Output too brief | Added detail requirement | quality-low |
+| 2025-01-11 | doc-pipeline | agent-writer | 42/100 | Still too brief | Repeated instruction | quality-low |
+| 2025-01-12 | doc-pipeline | agent-writer | 48/100 | Slightly better | — | quality-low |
+```
+
+After 3 rows: feedback-collector.sh fires the recurring-pattern alert.
+performance-scorer.sh shows avg 45/100, error rate 100%.
+Action: update agent-writer's system prompt with explicit length and
+depth requirements. Once the agent has 20 scored runs, the trend can read "improving (+18.3)".
+
+## Integration
+
+Add feedback-collector.sh as a PostToolUse hook in `.claude/settings.json`:
+
+```json
+{
+  "hooks": {
+    "PostToolUse": [{
+      "matcher": "review_pipeline",
+      "hooks": [{"type": "command", "command": "bash feedback/feedback-collector.sh"}]
+    }]
+  }
+}
+```
+
+Run performance-scorer.sh on demand or as a scheduled report:
+
+```bash
+./feedback/performance-scorer.sh
+./feedback/performance-scorer.sh --agent agent-writer --threshold 70
+```
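+
+For the `agent-writer` example above, the report looks roughly like this
+(abridged; exact numbers depend on your feedback rows and cost log):
+
+```
+PERFORMANCE SCORECARD
+============================================================
+Threshold: 60/100
+Total feedback rows: 3
+
+Agent: agent-writer
+  Runs: 3
+  Avg score: 45.0/100
+  Error rate: 100% (score < 60)
+  Cost/run: ~0.0 events (rough proxy)
+  Trend: n/a (fewer than 20 runs)
+  Top patterns: quality-low(3)
+
+FLAGGED AGENTS (below threshold)
+----------------------------------------
+  agent-writer: avg 45.0 < 60
+```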
diff --git a/scripts/templates/feedback/feedback-collector.sh b/scripts/templates/feedback/feedback-collector.sh
new file mode 100644
index 0000000..0f62fba
--- /dev/null
+++ b/scripts/templates/feedback/feedback-collector.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# PostToolUse hook: Collect feedback after pipeline completion.
+# Bash 3.2 compatible. Uses python3 for JSON parsing and the Markdown append.
+#
+# Triggered after a designated "review" tool call completes.
+# Reads pipeline output and reviewer score, appends to FEEDBACK.md,
+# and detects recurring patterns (3+ rows with same tag = recurring).
+#
+# Placeholders:
+#   {{WORKING_DIR}} - absolute path to project directory
+#   {{PIPELINE_NAME}} - name of the pipeline being tracked
+# Env: SCORE_THRESHOLD - minimum acceptable score (default: 60)
+
+WORKING_DIR="{{WORKING_DIR}}"
+PIPELINE_NAME="{{PIPELINE_NAME}}"
+SCORE_THRESHOLD="${SCORE_THRESHOLD:-60}"
+FEEDBACK_FILE="$WORKING_DIR/FEEDBACK.md"
+export HOOK_INPUT="$(cat)"
+
+# Only act on review tool calls
+TOOL_NAME=$(echo "$HOOK_INPUT" | python3 -c "
+import sys, json
+try:
+    data = json.load(sys.stdin)
+    print(data.get('tool_name', ''))
+except:
+    print('')
+" 2>/dev/null)
+
+if [ "$TOOL_NAME" != "review_pipeline" ] && [ "$TOOL_NAME" != "score_output" ]; then
+  exit 0
+fi
+
+# Extract score, agent, issue, resolution, pattern from hook input
+python3 << PYEOF
+import sys, json, re, os
+from datetime import datetime
+
+hook_input = os.environ.get("HOOK_INPUT", "")
+feedback_file = "$FEEDBACK_FILE"
+pipeline_name = "$PIPELINE_NAME"
+score_threshold = int("$SCORE_THRESHOLD")
+
+try:
+    data = json.loads(hook_input)
+except Exception:
+    sys.exit(0)
+
+tool_result = data.get('tool_result', '')
+if isinstance(tool_result, dict):
+    tool_result = json.dumps(tool_result)
+
+# Parse structured fields from tool result (expects JSON or key:value)
+agent_name = os.environ.get('AGENT_NAME', 'unknown')
+score = 0
+issue = ''
+resolution = ''
+pattern = ''
+
+try:
+    result_data = json.loads(tool_result)
+    agent_name = result_data.get('agent', agent_name)
+    score = int(result_data.get('score', 0))
+    issue = result_data.get('issue', '')
+    resolution = result_data.get('resolution', '')
+    pattern = result_data.get('pattern', '')
+except Exception:
+    # Fallback: look for score: N in plain text
+    m = re.search(r'score[:\s]+(\d+)', tool_result, re.IGNORECASE)
+    if m:
+        score = int(m.group(1))
+    m = re.search(r'pattern[:\s]+(\S+)', tool_result, re.IGNORECASE)
+    if m:
+        pattern = m.group(1)
+
+if score == 0 and not issue:
+    sys.exit(0)
+
+date_str = datetime.utcnow().strftime('%Y-%m-%d')
+row = f"| {date_str} | {pipeline_name} | {agent_name} | {score}/100 | {issue} | {resolution} | {pattern} |"
+
+# Append to feedback table
+if not os.path.exists(feedback_file):
+    print(f"Warning: {feedback_file} not found -- skipping feedback append")
+    sys.exit(0)
+
+with open(feedback_file, 'r') as f:
+    content = f.read()
+
+# Insert row after the header row of the table
+table_header = '| Date | Pipeline | Agent | Score | Issue | Resolution | Pattern |'
+separator = '|------|----------|-------|-------|-------|------------|---------|'
+placeholder_row = '| {{DATE}} | {{PIPELINE_NAME}} | {{AGENT_NAME}} | {{SCORE}}/100 | {{ISSUE_DESCRIPTION}} | {{RESOLUTION}} | {{PATTERN_TAG}} |'
+
+if placeholder_row in content:
+    # Replace placeholder with real row + keep placeholder for next time
+    content = content.replace(placeholder_row, row + '\n' + placeholder_row)
+elif separator in content:
+    content = content.replace(separator, separator + '\n' + row)
+else:
+    content += '\n' + row + '\n'
+
+with open(feedback_file, 'w') as f:
+    f.write(content)
+
+print(f"Feedback recorded: score={score}, pattern={pattern}")
+
+# Detect recurring patterns
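+# Note: 'content' already includes the row appended above. The tag is counted
+# as a full table cell, '| <tag> |', which in this table is the Pattern column.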
+if pattern:
+    pattern_count = content.count(f'| {pattern} |')
+    if pattern_count >= 3:
+        print(f"RECURRING PATTERN DETECTED: '{pattern}' appears {pattern_count} times")
+        print(f"Action required: review prompt or pipeline for '{pipeline_name}'")
+
+# Flag low scores
+if score < score_threshold and score > 0:
+    print(f"LOW SCORE: {score} < threshold {score_threshold} for agent {agent_name}")
+PYEOF
+
+exit 0
diff --git a/scripts/templates/feedback/performance-scorer.sh b/scripts/templates/feedback/performance-scorer.sh
new file mode 100644
index 0000000..a5b7d7d
--- /dev/null
+++ b/scripts/templates/feedback/performance-scorer.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+# Performance scorer: per-agent metrics from FEEDBACK.md + cost-events.jsonl.
+# Bash 3.2 compatible. Uses python3 for all metrics computation.
+#
+# Metrics per agent:
+#   - Average score (0-100)
+#   - Error rate (rows with score < threshold / total rows)
+#   - Cost per run (from cost-events.jsonl, rough proxy)
+#   - Improvement trend: avg of last 10 scores vs. previous 10
+#
+# Flags agents below threshold (default 60/100).
+#
+# Usage:
+#   ./performance-scorer.sh                     # Score all agents
+#   ./performance-scorer.sh --agent {{AGENT}}   # Score specific agent
+#   ./performance-scorer.sh --threshold 70      # Custom threshold
+#
+# Placeholders:
+#   {{WORKING_DIR}} - absolute path to project directory
+
+WORKING_DIR="{{WORKING_DIR}}"
+FEEDBACK_FILE="$WORKING_DIR/FEEDBACK.md"
+COST_LOG="$WORKING_DIR/budget/cost-events.jsonl"
+THRESHOLD=60
+AGENT_FILTER=""
+
+# Parse arguments (bash 3.2 compatible -- no associative arrays)
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --agent) AGENT_FILTER="$2"; shift 2 ;;
+    --threshold) THRESHOLD="$2"; shift 2 ;;
+    *) shift ;;
+  esac
+done
+
+if [ ! -f "$FEEDBACK_FILE" ]; then
+  echo "No feedback file found at $FEEDBACK_FILE"
+  exit 0
+fi
+
+python3 << PYEOF
+import re, json, os, sys
+from collections import defaultdict
+
+feedback_file = "$FEEDBACK_FILE"
+cost_log = "$COST_LOG"
+threshold = int("$THRESHOLD")
+agent_filter = "$AGENT_FILTER"
+
+# Parse FEEDBACK.md table rows
+# Expected columns: Date, Pipeline, Agent, Score, Issue, Resolution, Pattern
+feedback_rows = []
+with open(feedback_file) as f:
+    in_table = False
+    header_seen = False
+    for line in f:
+        line = line.strip()
+        if '| Date |' in line:
+            in_table = True
+            header_seen = True
+            continue
+        if in_table and line.startswith('|---'):
+            continue
+        if in_table and line.startswith('|') and '{{' not in line and header_seen:
+            cols = [c.strip() for c in line.strip('|').split('|')]
+            if len(cols) >= 7:
+                try:
+                    date = cols[0]
+                    pipeline = cols[1]
+                    agent = cols[2]
+                    score_str = cols[3]
+                    issue = cols[4]
+                    resolution = cols[5]
+                    pattern = cols[6]
+                    # Parse score: "75/100" or "75"
+                    score_m = re.match(r'(\d+)', score_str)
+                    score = int(score_m.group(1)) if score_m else 0
+                    feedback_rows.append({
+                        'date': date,
+                        'pipeline': pipeline,
+                        'agent': agent,
+                        'score': score,
+                        'issue': issue,
+                        'pattern': pattern
+                    })
+                except (ValueError, IndexError):
+                    pass
+
+# Filter by agent if specified
+if agent_filter:
+    feedback_rows = [r for r in feedback_rows if r['agent'] == agent_filter]
+
+if not feedback_rows:
+    print("No feedback rows found.")
+    sys.exit(0)
+
+# Read cost events if available
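+# Each line of cost-events.jsonl is assumed to be a JSON object; only its
+# 'agent' field is read, and lines are counted as a rough per-agent cost proxy.
+# Illustrative line (fields other than 'agent' are not used here):
+#   {"agent": "agent-writer", "event": "tool_call", "tokens": 1200}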
+cost_by_agent = defaultdict(int)
+if os.path.exists(cost_log):
+    with open(cost_log) as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                try:
+                    event = json.loads(line)
+                    agent = event.get('agent', 'unknown')
+                    cost_by_agent[agent] += 1  # event count as proxy
+                except Exception:
+                    pass
+
+# Compute per-agent metrics
+agents = list(set(r['agent'] for r in feedback_rows))
+
+print("PERFORMANCE SCORECARD")
+print("=" * 60)
+print(f"Threshold: {threshold}/100")
+print(f"Total feedback rows: {len(feedback_rows)}")
+print()
+
+flagged = []
+
+for agent in sorted(agents):
+    rows = [r for r in feedback_rows if r['agent'] == agent]
+    scores = [r['score'] for r in rows]
+
+    avg_score = sum(scores) / len(scores) if scores else 0
+    error_rate = len([s for s in scores if s < threshold]) / len(scores) if scores else 0
+    cost_events = cost_by_agent.get(agent, 0)
+    cost_per_run = cost_events / len(rows) if rows else 0
+
+    # Improvement trend: last 10 vs. prev 10
+    trend_str = "n/a (fewer than 20 runs)"
+    if len(scores) >= 20:
+        prev10 = scores[-20:-10]
+        last10 = scores[-10:]
+        prev_avg = sum(prev10) / len(prev10)
+        last_avg = sum(last10) / len(last10)
+        delta = last_avg - prev_avg
+        if delta > 5:
+            trend_str = f"improving (+{delta:.1f})"
+        elif delta < -5:
+            trend_str = f"declining ({delta:.1f})"
+        else:
+            trend_str = f"stable ({delta:+.1f})"
+    elif len(scores) >= 10:
+        last10 = scores[-10:]
+        trend_str = f"recent avg: {sum(last10)/len(last10):.1f} (need 20 runs for trend)"
+
+    # Pattern frequency
+    patterns = defaultdict(int)
+    for r in rows:
+        if r['pattern']:
+            patterns[r['pattern']] += 1
+    top_patterns = sorted(patterns.items(), key=lambda x: -x[1])[:3]
+
+    print(f"Agent: {agent}")
+    print(f"  Runs: {len(rows)}")
+    print(f"  Avg score: {avg_score:.1f}/100")
+    print(f"  Error rate: {error_rate*100:.0f}% (score < {threshold})")
+    print(f"  Cost/run: ~{cost_per_run:.1f} events (rough proxy)")
+    print(f"  Trend: {trend_str}")
+    if top_patterns:
+        print(f"  Top patterns: {', '.join(f'{p}({c})' for p, c in top_patterns)}")
+    print()
+
+    if avg_score < threshold:
+        flagged.append((agent, avg_score))
+
+# Summary of flagged agents
+if flagged:
+    print("FLAGGED AGENTS (below threshold)")
+    print("-" * 40)
+    for agent, avg in flagged:
+        print(f"  {agent}: avg {avg:.1f} < {threshold}")
+    print()
+    print("Recommended actions:")
+    print("  1. Review feedback rows for top patterns")
+    print("  2. Iterate on agent system prompt")
+    print("  3. Consider pipeline redesign if pattern is structural")
+    print("  4. Run pipeline-optimizer.sh for bottleneck analysis")
+else:
+    print("All agents above threshold.")
+PYEOF
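+
+# Report-only: like feedback-collector.sh, always exit 0 so a flagged agent
+# never fails the calling pipeline.
+exit 0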