flowchart TD
%% ── Stage 1: Metric Collection ──────────────────────────────
%% Entry point: the schedule trigger hands off to FETCH, the first node of Stage 1.
START([⏰ Schedule Trigger
Every 15-30 min]) --> FETCH
subgraph STAGE1["Stage 1 — Gather Metrics"]
%% FETCH fans out into eight metric nodes, all fetched from Prometheus.
FETCH[Fetch from Prometheus]
%% Per-host resource metrics.
FETCH --> CPU[CPU Usage
per host]
FETCH --> RAM[Memory Available
per host]
FETCH --> DISK[Disk Usage
per host]
FETCH --> IO[I/O Pressure
per host]
%% Service-state metrics.
FETCH --> SVC_FAIL[Failed Services
all hosts]
FETCH --> SVC_INACTIVE[Inactive Critical
Services]
%% Top-process metrics.
FETCH --> PROCS_CPU[Top CPU
Processes]
FETCH --> PROCS_MEM[Top Memory
Processes]
%% Fan-in: all eight branches join at MERGE before the threshold / dedup step.
CPU & RAM & DISK & IO & SVC_FAIL & SVC_INACTIVE & PROCS_CPU & PROCS_MEM --> MERGE[Merge All Metrics]
%% PROCESS is the exit node of this stage; its outgoing edge is declared after the subgraph.
MERGE --> PROCESS[Process Metrics
JS: threshold checks,
alert cache dedup]
end
%% Gate after Stage 1: only issues that are found and not already cached continue
%% to Loki log enrichment; otherwise the run ends at DONE_CLEAN.
PROCESS --> CACHE_CHECK{Issues found
& not cached?}
CACHE_CHECK -- No --> DONE_CLEAN([✅ No Issues])
CACHE_CHECK -- Yes --> LOKI_Q
%% ── Log Enrichment (between Stage 1 and Stage 2) ────────────
%% Linear chain: build the Loki query, fetch the logs, format them as context.
%% LOKI_FMT is the exit node; it feeds the AI node of Stage 2.
subgraph LOKI["Loki Log Enrichment"]
LOKI_Q[Build Loki Query
failed/inactive services] --> LOKI_FETCH[Fetch Logs from Loki]
LOKI_FETCH --> LOKI_FMT[Format Loki Context
last 20 entries per stream,
truncate at 3000 chars]
end
%% ── Stage 2: AI Decision Engine ─────────────────────────────
%% Handoff: the formatted Loki context enters the AI stage.
LOKI_FMT --> AI
subgraph STAGE2["Stage 2 — AI Decision Engine"]
%% Per the labels: AI takes metrics, logs and the prompt; PARSE has a fallback for malformed JSON.
AI[Claude Analysis
metrics + logs + prompt]
AI --> PARSE[Parse JSON Response
with malformed JSON fallback]
%% ROUTE is the exit of this stage; its three branches are declared after the subgraph.
PARSE --> ROUTE{Response type?}
end
%% ROUTE branch 1: both lists empty, so the run ends with no action.
ROUTE -- "actions: []
manual_issues: []" --> DONE_OK([✅ No Action Needed])
%% ROUTE branch 2: manual issues only. They are posted to Discord, then flow to LOOP_CHECK.
ROUTE -- "manual_issues only" --> MANUAL[📋 Post Manual Issues
to Discord]
MANUAL --> LOOP_CHECK
%% ROUTE branch 3: actions present. A summary is posted only when there are multiple actions,
%% then SPLIT emits one item per action into the approval loop.
%% NOTE(review): no branch shows manual issues being posted when actions are also present; confirm intended.
ROUTE -- "actions present" --> SUMMARY_CHECK
SUMMARY_CHECK{Multiple
actions?} -- Yes --> SUMMARY[📊 Post Summary
to Discord]
SUMMARY_CHECK -- No --> SPLIT
SUMMARY --> SPLIT
SPLIT[Split Actions
one item per action] --> LOOP_START
%% ── Stage 3: Discord Approval ───────────────────────────────
subgraph STAGE3["Stage 3 — Discord Approval Gate"]
%% LOOP_START is the head of the per-action loop; SPLIT and LOOP_CHECK both lead here.
LOOP_START[Next Action] --> PREP[Prepare Discord Message
risk level → emoji, color,
timeout, default action]
PREP --> POST_MSG[Post to Discord
via Bot API]
POST_MSG --> ADD_REACT[Add ✅ and ❌
Reactions]
%% Polling loop: wait, read both reaction sets, evaluate. Pending loops back to WAIT_POLL.
ADD_REACT --> WAIT_POLL[⏳ Wait 20s]
WAIT_POLL --> CHECK_APPROVE[Check ✅ Reactions]
WAIT_POLL --> CHECK_REJECT[Check ❌ Reactions]
CHECK_APPROVE & CHECK_REJECT --> EVALUATE[Evaluate Approval
filter bot reactions,
check timeout]
EVALUATE --> DECISION{Decision?}
DECISION -- Pending --> WAIT_POLL
%% Rejected is handled inside this stage; the Approved branch is declared after the subgraph.
DECISION -- Rejected --> REJECTED[❌ Post Rejection
to Discord]
end
%% Approved actions leave the approval gate and enter Stage 4.
DECISION -- Approved --> LAUNCH
%% ── Stage 4: Automation Execution ───────────────────────────
subgraph STAGE4["Stage 4 — Automation Execution"]
%% Launch sequence: build the platform config, launch the job, extract its job ID.
LAUNCH[Build Platform Config
AWX or Semaphore
URLs, body, auth]
LAUNCH --> FIRE[Launch Job
POST to API]
FIRE --> EXTRACT_ID[Extract Job ID]
%% Polling loop: wait, poll, check. Running loops back to JOB_WAIT.
EXTRACT_ID --> JOB_WAIT[⏳ Wait 20s]
JOB_WAIT --> JOB_POLL[Poll Job Status
GET job endpoint]
JOB_POLL --> JOB_CHECK[Check Job Status
JS: poll counter,
max wait enforcement]
JOB_CHECK --> JOB_DONE{Status?}
JOB_DONE -- Running --> JOB_WAIT
%% Timeout exits to JOB_FAIL; the Complete branch is declared after the subgraph.
JOB_DONE -- Timed out --> JOB_FAIL[⚠️ Job Timeout]
end
%% Completed jobs, whether they succeeded or failed, go on to verification.
JOB_DONE -- "Complete
(success or failed)" --> VERIFY
%% ── Stage 5: Verification ───────────────────────────────────
subgraph STAGE5["Stage 5 — Verification & Closing"]
%% Linear chain: format job output, fetch fresh logs, have the AI verify, post the result.
%% NOTE(review): the VERIFY label mentions querying Loki and LOKI_POST also fetches from Loki;
%% presumably VERIFY only builds the query. Confirm against the workflow.
VERIFY[Format Job Output
extract stdout,
query Loki for
post-remediation logs]
VERIFY --> LOKI_POST[Fetch Post-Remediation
Logs from Loki
last 5 minutes]
LOKI_POST --> AI_VERIFY[Claude Verifies Result
job output + fresh logs]
%% DISCORD_RESULT is the exit node; its edge to LOOP_CHECK is declared after the subgraph.
AI_VERIFY --> DISCORD_RESULT[📋 Post Result
to Discord
structured feedback]
end
%% ── Loop Back ───────────────────────────────────────────────
%% The three per-action outcomes (result posted, rejected, job timeout) converge on LOOP_CHECK.
%% The MANUAL path declared earlier also arrives at this node.
DISCORD_RESULT --> LOOP_CHECK{More actions
in queue?}
REJECTED --> LOOP_CHECK
JOB_FAIL --> LOOP_CHECK
%% Yes re-enters the approval gate at LOOP_START; No ends the workflow.
LOOP_CHECK -- Yes --> LOOP_START
LOOP_CHECK -- No --> DONE_FINAL([✅ Workflow Complete])
%% ── Styling ─────────────────────────────────────────────────
%% One class per subgraph, plus a dashed style for the start / end nodes.
classDef stage1 fill:#e8f4f8,stroke:#2196F3,stroke-width:2px
%% The Loki enrichment subgraph previously had no class and fell back to the
%% default theme style; it now gets its own colour like every other subgraph.
classDef loki fill:#e0f2f1,stroke:#009688,stroke-width:2px
classDef stage2 fill:#fef3e2,stroke:#FF9800,stroke-width:2px
classDef stage3 fill:#f3e5f5,stroke:#9C27B0,stroke-width:2px
classDef stage4 fill:#e8f5e9,stroke:#4CAF50,stroke-width:2px
classDef stage5 fill:#fff3e0,stroke:#FF5722,stroke-width:2px
classDef terminal fill:#f5f5f5,stroke:#607D8B,stroke-width:2px,stroke-dasharray: 5 5
%% Class assignments target subgraph ids (STAGE1, LOKI, STAGE2, ...), not individual nodes.
class STAGE1 stage1
class LOKI loki
class STAGE2 stage2
class STAGE3 stage3
class STAGE4 stage4
class STAGE5 stage5
%% Start and end nodes share the dashed terminal style.
class DONE_CLEAN,DONE_OK,DONE_FINAL,START terminal