feat(templates): add pipeline optimization and self-healing templates
Session 5 step 21 — pipeline-optimizer writes RECOMMENDATIONS.md with VFM pre-scores (never modifies pipeline files directly). self-healing categorizes errors and applies recovery strategies with 5-attempt hard cap, logging to healing-log.jsonl. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d743ec7fbf
commit
fa8bc86897
3 changed files with 456 additions and 0 deletions
147
scripts/templates/optimization/self-healing.sh
Normal file
147
scripts/templates/optimization/self-healing.sh
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
#!/bin/bash
|
||||
# Self-healing: categorize errors and apply recovery strategies.
|
||||
# Bash 3.2 compatible. Uses python3 for JSON/log parsing.
|
||||
#
|
||||
# Error categories and recovery strategies:
|
||||
# timeout -> retry with shorter task scope
|
||||
# permission-denied -> log and skip (do not retry)
|
||||
# tool-not-found -> log and alert, do not retry
|
||||
# api-error -> exponential backoff, max 3 retries
|
||||
# content-quality -> re-run with stricter prompt, max 2 retries
|
||||
#
|
||||
# Max total attempts: 5 (OpenClaw pattern -- hard cap regardless of category).
|
||||
# All recovery events logged to healing-log.jsonl.
|
||||
#
|
||||
# Usage:
|
||||
# ./self-healing.sh --error-type <type> --agent <name> --attempt <n> --context <msg>
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 -- recovery action taken (caller should retry)
|
||||
# 1 -- no recovery possible (caller should abort)
|
||||
# 2 -- max attempts reached (caller should escalate)
|
||||
#
|
||||
# Placeholders:
|
||||
# {{WORKING_DIR}} - absolute path to project directory
|
||||
|
||||
# --- Configuration -----------------------------------------------------------
# {{WORKING_DIR}} is a template placeholder substituted at install time.
WORKING_DIR="{{WORKING_DIR}}"
HEALING_LOG="$WORKING_DIR/healing-log.jsonl"
MAX_ATTEMPTS=5

# --- Defaults for command-line options ---------------------------------------
ERROR_TYPE=""
AGENT_NAME=""
ATTEMPT=1
CONTEXT_MSG=""

# Parse arguments (bash 3.2 compatible)
#
# parse_args [--error-type T] [--agent A] [--attempt N] [--context MSG] ...
#   Globals written: ERROR_TYPE, AGENT_NAME, ATTEMPT, CONTEXT_MSG
#   Unrecognised arguments are ignored.
#   Returns 0 on success; 1 (with a message on stderr) when an option is
#   missing its value or --attempt is not a non-negative integer.
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --error-type | --agent | --attempt | --context)
        # "shift 2" with a single remaining argument fails WITHOUT shifting,
        # which would make this loop spin forever. Reject the call instead.
        if [ "$#" -lt 2 ]; then
          echo "Error: option $1 requires a value" >&2
          return 1
        fi
        case "$1" in
          --error-type) ERROR_TYPE="$2" ;;
          --agent) AGENT_NAME="$2" ;;
          --attempt) ATTEMPT="$2" ;;
          --context) CONTEXT_MSG="$2" ;;
        esac
        shift 2
        ;;
      *)
        shift
        ;;
    esac
  done

  # ATTEMPT is later used in numeric tests, so it must be a plain
  # non-negative decimal integer.
  case "$ATTEMPT" in
    '' | *[!0-9]*)
      echo "Error: --attempt must be a non-negative integer (got '$ATTEMPT')" >&2
      return 1
      ;;
  esac
  # Force base 10 so values such as "08" are not treated as invalid octal,
  # and strip leading zeros.
  ATTEMPT=$((10#$ATTEMPT))
  return 0
}

# Exit code 1 = "no recovery possible" (same code as the usage error).
parse_args "$@" || exit 1
|
||||
|
||||
# An error type is mandatory: without one, show the usage line and stop with
# exit code 1 (no recovery possible).
[ -n "$ERROR_TYPE" ] || {
  printf '%s\n' "Usage: $0 --error-type <type> --agent <name> --attempt <n> --context <msg>"
  exit 1
}
|
||||
|
||||
# Hard cap: max 5 attempts total

# log_escalation
#   Appends one "escalate" event (reason: max_attempts_reached) as a JSON line
#   to $HEALING_LOG and prints the same JSON line on stdout.
#   Globals read: AGENT_NAME, ERROR_TYPE, ATTEMPT, CONTEXT_MSG, HEALING_LOG
#   Returns: the exit status of python3 (non-zero if the log cannot be written).
#
#   Values are handed to Python through the environment rather than being
#   spliced into the Python source text, so quotes, backslashes or newlines in
#   the agent name or context message cannot break the log write or run as code.
log_escalation() {
  HEAL_AGENT="$AGENT_NAME" \
    HEAL_ERROR_TYPE="$ERROR_TYPE" \
    HEAL_ATTEMPT="$ATTEMPT" \
    HEAL_CONTEXT="$CONTEXT_MSG" \
    HEAL_LOG="$HEALING_LOG" \
    python3 -c '
import json, os, time

env = os.environ
raw_attempt = env.get("HEAL_ATTEMPT", "")
try:
    attempt = int(raw_attempt)
except ValueError:
    # Keep the raw text rather than losing the event entirely.
    attempt = raw_attempt

event = {
    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "agent": env.get("HEAL_AGENT", ""),
    "error_type": env.get("HEAL_ERROR_TYPE", ""),
    "attempt": attempt,
    "action": "escalate",
    "reason": "max_attempts_reached",
    "context": env.get("HEAL_CONTEXT", ""),
}
with open(env.get("HEAL_LOG", ""), "a") as f:
    f.write(json.dumps(event) + "\n")
print(json.dumps(event))
'
}

if [ "$ATTEMPT" -gt "$MAX_ATTEMPTS" ]; then
  echo "MAX ATTEMPTS REACHED ($MAX_ATTEMPTS) for $AGENT_NAME. Escalating."
  log_escalation
  # Exit code 2 tells the caller to escalate, even if logging failed.
  exit 2
fi
|
||||
|
||||
# Determine recovery action per category

# backoff_seconds <attempt>
#   Prints the exponential backoff delay in whole seconds for a 1-based
#   attempt number: 2^(attempt-1), capped at 16. Missing, non-numeric or
#   below-1 input is treated as attempt 1 so the result is always a positive
#   integer that is safe to hand to sleep.
#   Computed with shell arithmetic on validated digits; the attempt value is
#   never spliced into code run by another interpreter.
backoff_seconds() {
  local attempt
  case "$1" in
    '' | *[!0-9]*) attempt=1 ;;
    *) attempt=$((10#$1)) ;; # force base 10 ("08" is not octal)
  esac
  if [ "$attempt" -lt 1 ]; then
    echo 1
  elif [ "$attempt" -ge 5 ]; then
    echo 16 # 2^4 and above hit the cap
  else
    echo $((1 << (attempt - 1)))
  fi
}

# select_recovery
#   Maps the error category to a recovery strategy.
#   Globals read:    ERROR_TYPE, ATTEMPT
#   Globals written: RECOVERY_ACTION, RECOVERY_DETAIL, EXIT_CODE, and
#                    BACKOFF_SECS (api-error retries only)
#   Side effect:     for a retryable api-error it sleeps for the backoff delay
#                    before returning.
#   EXIT_CODE: 0 = caller should retry, 1 = abort, 2 = escalate.
select_recovery() {
  RECOVERY_ACTION=""
  RECOVERY_DETAIL=""
  EXIT_CODE=0

  case "$ERROR_TYPE" in
    timeout)
      RECOVERY_ACTION="retry_shorter"
      RECOVERY_DETAIL="Re-run with reduced task scope. Split task if attempt >= 3."
      if [ "$ATTEMPT" -ge 3 ]; then
        RECOVERY_DETAIL="Attempt $ATTEMPT: recommend splitting task before retry."
      fi
      EXIT_CODE=0
      ;;
    permission-denied)
      RECOVERY_ACTION="skip"
      RECOVERY_DETAIL="Permission errors cannot be auto-resolved. Log and skip. Notify operator."
      EXIT_CODE=1
      ;;
    tool-not-found)
      RECOVERY_ACTION="alert"
      RECOVERY_DETAIL="Tool not found -- check agent config and hook registrations. Do not retry."
      EXIT_CODE=1
      ;;
    api-error)
      # Exponential backoff: 2^(attempt-1) seconds, max 3 retries
      if [ "$ATTEMPT" -le 3 ]; then
        BACKOFF_SECS=$(backoff_seconds "$ATTEMPT")
        RECOVERY_ACTION="retry_backoff"
        RECOVERY_DETAIL="API error -- wait ${BACKOFF_SECS}s then retry (attempt $ATTEMPT/3)."
        sleep "$BACKOFF_SECS"
        EXIT_CODE=0
      else
        RECOVERY_ACTION="abort"
        RECOVERY_DETAIL="API error persists after 3 retries. Aborting."
        EXIT_CODE=1
      fi
      ;;
    content-quality)
      # Max 2 retries for quality issues
      if [ "$ATTEMPT" -le 2 ]; then
        RECOVERY_ACTION="retry_strict"
        RECOVERY_DETAIL="Re-run with stricter prompt. Add explicit quality criteria (attempt $ATTEMPT/2)."
        EXIT_CODE=0
      else
        RECOVERY_ACTION="escalate_quality"
        RECOVERY_DETAIL="Content quality below threshold after 2 retries. Escalate to human review."
        EXIT_CODE=2
      fi
      ;;
    *)
      RECOVERY_ACTION="unknown"
      RECOVERY_DETAIL="Unknown error type '$ERROR_TYPE'. Logging and aborting."
      EXIT_CODE=1
      ;;
  esac
}

select_recovery
|
||||
|
||||
# Log recovery event

# log_recovery_event
#   Appends one recovery event as a JSON line to $HEALING_LOG and prints the
#   event, pretty-printed with a 2-space indent, on stdout.
#   Globals read: AGENT_NAME, ERROR_TYPE, ATTEMPT, RECOVERY_ACTION,
#                 RECOVERY_DETAIL, CONTEXT_MSG, HEALING_LOG
#   Returns: the exit status of python3 (non-zero if the log cannot be written).
#
#   Values are handed to Python through the environment rather than being
#   spliced into the Python source text. Splicing broke whenever a value
#   contained a single quote -- which the "Unknown error type '...'" detail
#   text always does -- and let arbitrary text run as Python code.
log_recovery_event() {
  HEAL_AGENT="$AGENT_NAME" \
    HEAL_ERROR_TYPE="$ERROR_TYPE" \
    HEAL_ATTEMPT="$ATTEMPT" \
    HEAL_ACTION="$RECOVERY_ACTION" \
    HEAL_DETAIL="$RECOVERY_DETAIL" \
    HEAL_CONTEXT="$CONTEXT_MSG" \
    HEAL_LOG="$HEALING_LOG" \
    python3 -c '
import json, os, time

env = os.environ
raw_attempt = env.get("HEAL_ATTEMPT", "")
try:
    attempt = int(raw_attempt)
except ValueError:
    # Keep the raw text rather than losing the event entirely.
    attempt = raw_attempt

event = {
    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "agent": env.get("HEAL_AGENT", ""),
    "error_type": env.get("HEAL_ERROR_TYPE", ""),
    "attempt": attempt,
    "action": env.get("HEAL_ACTION", ""),
    "detail": env.get("HEAL_DETAIL", ""),
    "context": env.get("HEAL_CONTEXT", ""),
}
with open(env.get("HEAL_LOG", ""), "a") as f:
    f.write(json.dumps(event) + "\n")
print(json.dumps(event, indent=2))
'
}

log_recovery_event
|
||||
|
||||
# Print a one-line summary of the chosen strategy, then hand the caller the
# status code (0 = retry, 1 = abort, 2 = escalate).
printf 'Recovery: %s -- %s\n' "$RECOVERY_ACTION" "$RECOVERY_DETAIL"
exit $EXIT_CODE
|
||||
Loading…
Add table
Add a link
Reference in a new issue