#!/bin/bash
# Self-healing: categorize errors and apply recovery strategies.
# Bash 3.2 compatible. Uses python3 for JSON serialization of log events.
#
# Error categories and recovery strategies:
#   timeout           -> retry with shorter task scope
#   permission-denied -> log and skip (do not retry)
#   tool-not-found    -> log and alert, do not retry
#   api-error         -> exponential backoff, max 3 retries
#   content-quality   -> re-run with stricter prompt, max 2 retries
#
# Max total attempts: 5 (OpenClaw pattern -- hard cap regardless of category).
# All recovery events are appended to healing-log.jsonl, one JSON object per
# line, and echoed to stdout.
#
# Usage:
#   ./self-healing.sh --error-type <type> --agent <name> --attempt <n> --context <message>
#
# Exit codes:
#   0 -- recovery action taken (caller should retry)
#   1 -- no recovery possible, or invalid arguments (caller should abort)
#   2 -- max attempts reached / quality escalation (caller should escalate)
#
# Placeholders:
#   {{WORKING_DIR}} - absolute path to project directory

set -u

WORKING_DIR="{{WORKING_DIR}}"
HEALING_LOG="$WORKING_DIR/healing-log.jsonl"
MAX_ATTEMPTS=5

ERROR_TYPE=""
AGENT_NAME=""
ATTEMPT=1
CONTEXT_MSG=""

# Print the command-line synopsis to stderr.
usage() {
  echo "Usage: $0 --error-type <type> --agent <name> --attempt <n> --context <message>" >&2
}

# Append one recovery event to $HEALING_LOG and echo it to stdout.
# Arguments:
#   $1 - action name stored under the "action" key
#   $2 - name of the category-specific key ("reason" or "detail")
#   $3 - value stored under that key
#   $4 - indent width for the stdout copy; empty string means compact JSON
# All values reach python3 through the environment instead of being spliced
# into the Python source, so quotes, backslashes and newlines in agent names
# or context messages can neither break the logger nor inject code.
# A failed write is reported on stderr but does not change the exit code:
# logging is best-effort and must not mask the recovery decision.
log_event() {
  HEAL_LOG="$HEALING_LOG" \
  HEAL_AGENT="$AGENT_NAME" \
  HEAL_ERROR_TYPE="$ERROR_TYPE" \
  HEAL_ATTEMPT="$ATTEMPT" \
  HEAL_CONTEXT="$CONTEXT_MSG" \
  HEAL_ACTION="$1" \
  HEAL_EXTRA_KEY="$2" \
  HEAL_EXTRA_VALUE="$3" \
  HEAL_INDENT="$4" \
  python3 -c '
import json, os, time

env = os.environ
event = {
    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "agent": env["HEAL_AGENT"],
    "error_type": env["HEAL_ERROR_TYPE"],
    "attempt": int(env["HEAL_ATTEMPT"]),
    "action": env["HEAL_ACTION"],
    env["HEAL_EXTRA_KEY"]: env["HEAL_EXTRA_VALUE"],
    "context": env["HEAL_CONTEXT"],
}
with open(env["HEAL_LOG"], "a") as f:
    f.write(json.dumps(event) + "\n")
indent = env.get("HEAL_INDENT", "")
print(json.dumps(event, indent=int(indent) if indent else None))
' || echo "Warning: failed to write recovery event to $HEALING_LOG" >&2
}

# Parse arguments (bash 3.2 compatible). Unknown arguments are ignored.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --error-type|--agent|--attempt|--context)
      # "shift 2" fails WITHOUT shifting when only one argument is left,
      # which would spin this loop forever; reject a missing value up front.
      if [ "$#" -lt 2 ]; then
        echo "Missing value for $1" >&2
        usage
        exit 1
      fi
      case "$1" in
        --error-type) ERROR_TYPE="$2" ;;
        --agent)      AGENT_NAME="$2" ;;
        --attempt)    ATTEMPT="$2" ;;
        --context)    CONTEXT_MSG="$2" ;;
      esac
      shift 2
      ;;
    *)
      shift
      ;;
  esac
done

if [ -z "$ERROR_TYPE" ]; then
  usage
  exit 1
fi

# The attempt counter feeds numeric comparisons below; anything but plain
# decimal digits would make them error out and silently bypass the hard cap.
case "$ATTEMPT" in
  ''|*[!0-9]*)
    echo "Invalid --attempt value '$ATTEMPT': expected a non-negative integer." >&2
    usage
    exit 1
    ;;
esac

# Hard cap: max 5 attempts total
if [ "$ATTEMPT" -gt "$MAX_ATTEMPTS" ]; then
  echo "MAX ATTEMPTS REACHED ($MAX_ATTEMPTS) for $AGENT_NAME. Escalating."
  log_event "escalate" "reason" "max_attempts_reached" ""
  exit 2
fi

# Determine recovery action per category
RECOVERY_ACTION=""
RECOVERY_DETAIL=""
EXIT_CODE=0

case "$ERROR_TYPE" in
  timeout)
    RECOVERY_ACTION="retry_shorter"
    RECOVERY_DETAIL="Re-run with reduced task scope. Split task if attempt >= 3."
    if [ "$ATTEMPT" -ge 3 ]; then
      RECOVERY_DETAIL="Attempt $ATTEMPT: recommend splitting task before retry."
    fi
    EXIT_CODE=0
    ;;
  permission-denied)
    RECOVERY_ACTION="skip"
    RECOVERY_DETAIL="Permission errors cannot be auto-resolved. Log and skip. Notify operator."
    EXIT_CODE=1
    ;;
  tool-not-found)
    RECOVERY_ACTION="alert"
    RECOVERY_DETAIL="Tool not found -- check agent config and hook registrations. Do not retry."
    EXIT_CODE=1
    ;;
  api-error)
    # Exponential backoff: 2^(attempt-1) seconds, max 3 retries
    if [ "$ATTEMPT" -le 3 ]; then
      # The attempt number is passed as an argument, not interpolated into code.
      BACKOFF_SECS=$(python3 -c 'import sys; print(min(2 ** (int(sys.argv[1]) - 1), 16))' "$ATTEMPT")
      RECOVERY_ACTION="retry_backoff"
      RECOVERY_DETAIL="API error -- wait ${BACKOFF_SECS}s then retry (attempt $ATTEMPT/3)."
      # The wait happens here, before the event is logged and before the
      # caller is told to retry.
      sleep "$BACKOFF_SECS"
      EXIT_CODE=0
    else
      RECOVERY_ACTION="abort"
      RECOVERY_DETAIL="API error persists after 3 retries. Aborting."
      EXIT_CODE=1
    fi
    ;;
  content-quality)
    # Max 2 retries for quality issues
    if [ "$ATTEMPT" -le 2 ]; then
      RECOVERY_ACTION="retry_strict"
      RECOVERY_DETAIL="Re-run with stricter prompt. Add explicit quality criteria (attempt $ATTEMPT/2)."
      EXIT_CODE=0
    else
      RECOVERY_ACTION="escalate_quality"
      RECOVERY_DETAIL="Content quality below threshold after 2 retries. Escalate to human review."
      EXIT_CODE=2
    fi
    ;;
  *)
    RECOVERY_ACTION="unknown"
    RECOVERY_DETAIL="Unknown error type '$ERROR_TYPE'. Logging and aborting."
    EXIT_CODE=1
    ;;
esac

# Log recovery event (pretty-printed on stdout, compact in the log file)
log_event "$RECOVERY_ACTION" "detail" "$RECOVERY_DETAIL" "2"

echo "Recovery: $RECOVERY_ACTION -- $RECOVERY_DETAIL"
exit "$EXIT_CODE"