Build an AI-Powered SLO Breach Predictor with Claude and Prometheus
Build an SLO breach predictor that reads error budget burn rate from Prometheus, uses Claude to analyze patterns, and sends Slack alerts before SLOs breach — not after.
SLO alerts fire when you've already missed the target. An SLO breach predictor fires when you're burning error budget 3x faster than normal — giving the on-call team time to act before the SLO is violated.
Architecture
Prometheus → Error Budget Burn Rate → Predictor Service
↓
Claude API (pattern analysis)
↓
Prediction: breach in X hours
↓
Slack alert with context + runbook
SLO Concepts Refresher
Error Budget = 1 - SLO target
For 99.9% availability SLO:
Error budget = 0.1% = ~43 minutes/month
Burn rate = how fast you're consuming that budget
Burn rate 1x = consuming budget at exactly the SLO rate
Burn rate 14.4x = will exhaust 30-day budget in ~50 hours (2% of the budget consumed every hour)
Standard fast-burn alert condition (from the Google SRE Workbook; the full multi-window alert pairs this 1h window with a short 5m window):
# 2% budget consumed in 1 hour (fast burn)
(
sum(rate(http_requests_total{code=~"5.."}[1h])) /
sum(rate(http_requests_total[1h]))
) > 14.4 * 0.001 # 14.4 * (1 - 0.999)
Step 1 — Prometheus Metrics Fetcher
# prometheus_client.py
import requests
from dataclasses import dataclass
from typing import Optional
@dataclass
class SLOMetrics:
    """Snapshot of one service's SLO health, built by PrometheusClient.get_slo_metrics."""

    service: str  # value of the Prometheus `job` label that was queried
    error_rate_1h: float  # 5xx / total request ratio over the last hour
    error_rate_5m: float  # same ratio over the last 5 minutes
    burn_rate_1h: float  # error_rate_1h divided by the error budget
    burn_rate_6h: float  # 6-hour error ratio divided by the error budget
    budget_remaining_pct: float  # share of the 30-day error budget left (0-100)
    request_rate: float  # sum(rate(http_requests_total[5m])) for the service


class PrometheusClient:
    """Small wrapper around the Prometheus instant-query HTTP endpoint."""

    def __init__(self, url: str):
        self.url = url

    def query(self, promql: str) -> Optional[float]:
        """Run an instant query and return the first sample's value.

        Args:
            promql: PromQL expression to evaluate.

        Returns:
            The first result's value as a float, or None when the query
            returned no series.

        Raises:
            requests.HTTPError: if Prometheus answers with an error status.
                Failing loudly matters here: treating a broken Prometheus as
                "no data" would later be read as a 0% error rate.
        """
        r = requests.get(
            f"{self.url.rstrip('/')}/api/v1/query",
            params={"query": promql},
            timeout=10,
        )
        r.raise_for_status()
        data = r.json()
        results = (data.get("data") or {}).get("result") or []
        if results:
            return float(results[0]["value"][1])
        return None

    def _query_or_zero(self, promql: str) -> float:
        """Return the query result, mapping "no data" and non-finite values to 0.0."""
        import math

        value = self.query(promql)
        # Prometheus yields NaN for 0/0 (idle service) and +Inf for x/0; NaN is
        # truthy, so a plain `or 0.0` would let it leak into every derived metric.
        if value is None or not math.isfinite(value):
            return 0.0
        return value

    def _error_ratio(self, selector: str, window: str, func: str = "rate") -> float:
        """Return the 5xx / total request ratio for `selector` over `window`."""
        return self._query_or_zero(
            f"sum({func}(http_requests_total{{{selector},code=~'5..'}}[{window}])) / "
            f"sum({func}(http_requests_total{{{selector}}}[{window}]))"
        )

    def get_slo_metrics(self, service: str, slo_target: float = 0.999) -> SLOMetrics:
        """Collect error rates, burn rates and remaining budget for one service.

        Args:
            service: value of the `job` label identifying the service.
            slo_target: availability target as a fraction (0.999 = 99.9%).

        Returns:
            An SLOMetrics snapshot. Burn rates are reported as 0.0 when the
            target leaves no error budget (slo_target >= 1), and queries with
            no data count as 0.0.
        """
        error_budget = 1 - slo_target
        base = f'job="{service}"'

        def burn(ratio: float) -> float:
            # Same zero-budget guard for every window (the 6h path used to
            # divide unconditionally and raised ZeroDivisionError).
            return ratio / error_budget if error_budget > 0 else 0.0

        error_rate_1h = self._error_ratio(base, "1h")
        error_rate_5m = self._error_ratio(base, "5m")
        error_rate_6h = self._error_ratio(base, "6h")

        # Budget consumed over the monthly window = observed error ratio
        # divided by the allowed error ratio (1.0 means fully spent).
        error_rate_30d = self._error_ratio(base, "30d", func="increase")
        budget_consumed = burn(error_rate_30d)
        budget_remaining_pct = min(100.0, max(0.0, (1 - budget_consumed) * 100))

        request_rate = self._query_or_zero(
            f"sum(rate(http_requests_total{{{base}}}[5m]))"
        )

        return SLOMetrics(
            service=service,
            error_rate_1h=error_rate_1h,
            error_rate_5m=error_rate_5m,
            burn_rate_1h=burn(error_rate_1h),
            burn_rate_6h=burn(error_rate_6h),
            budget_remaining_pct=budget_remaining_pct,
            request_rate=request_rate,
        )
# Step 2 — Claude-Powered Breach Predictor
# predictor.py
import anthropic
from prometheus_client import PrometheusClient, SLOMetrics
from dataclasses import dataclass
# Anthropic API client used by predict_breach. It is built with no explicit
# arguments, so credentials presumably come from the environment (the CronJob
# manifest sets ANTHROPIC_API_KEY) — TODO confirm.
client = anthropic.Anthropic()
# NOTE(review): module-level Prometheus client with a hard-coded URL; nothing in
# the visible predictor code references it (main.py builds its own client).
prom = PrometheusClient("http://prometheus.monitoring.svc:9090")
# System prompt sent with every prediction request: asks for a concise JSON
# answer covering breach yes/no, hours until breach, severity, cause and action.
SYSTEM_PROMPT = """You are an SRE analyzing SLO error budget burn rates.
Given metrics, predict:
1. Will the SLO breach? (yes/no)
2. Time until breach (hours)
3. Severity (critical/warning/ok)
4. Likely cause (based on patterns)
5. Recommended immediate action (1-2 sentences)
Be concise. Format as JSON."""
@dataclass
class Prediction:
    """Breach forecast for one service, as parsed from the model's JSON reply."""

    will_breach: bool  # whether the SLO is expected to be violated
    hours_until_breach: float  # estimated hours until the breach
    severity: str  # "critical", "warning" or "ok"
    likely_cause: str  # short explanation of the probable cause
    recommended_action: str  # immediate action suggested to on-call
def _parse_prediction(text: str) -> Prediction:
    """Turn the model's reply text into a Prediction with normalized field types."""
    import json
    import re

    json_match = re.search(r"\{.*\}", text, re.DOTALL)
    data = json.loads(json_match.group()) if json_match else {}

    # The system prompt asks "yes/no", so the flag may arrive as a string;
    # a bare "no" must not be treated as truthy.
    raw_flag = data.get("will_breach", False)
    if isinstance(raw_flag, str):
        will_breach = raw_flag.strip().lower() in ("yes", "true", "y", "1")
    else:
        will_breach = bool(raw_flag)

    # A null or non-numeric estimate means "no breach in sight"; keep the
    # field a float so callers can compare and format it safely.
    try:
        hours_until_breach = float(data.get("hours_until_breach"))
    except (TypeError, ValueError):
        hours_until_breach = float("inf")

    # Callers compare severity against lowercase literals.
    severity = str(data.get("severity") or "ok").strip().lower()

    return Prediction(
        will_breach=will_breach,
        hours_until_breach=hours_until_breach,
        severity=severity,
        likely_cause=data.get("likely_cause") or "Unknown",
        recommended_action=data.get("recommended_action") or "Monitor",
    )


def predict_breach(metrics: SLOMetrics, historical_context: str = "") -> Prediction:
    """Ask Claude whether the service's SLO is about to breach.

    Args:
        metrics: current SLO metrics for the service.
        historical_context: optional free text inserted into the prompt.

    Returns:
        The parsed Prediction. When the reply contains no JSON object the
        defaults apply (no breach, severity "ok").

    Raises:
        json.JSONDecodeError: if the reply contains a malformed JSON object.
    """
    prompt = (
        f"Current SLO metrics for {metrics.service}:\n"
        f"- Error rate (5m): {metrics.error_rate_5m:.4%}\n"
        f"- Error rate (1h): {metrics.error_rate_1h:.4%}\n"
        f"- Burn rate (1h): {metrics.burn_rate_1h:.2f}x\n"
        f"- Burn rate (6h): {metrics.burn_rate_6h:.2f}x\n"
        f"- Error budget remaining: {metrics.budget_remaining_pct:.1f}%\n"
        f"- Request rate: {metrics.request_rate:.1f} req/s\n"
        f"{historical_context}\n"
        "SLO target: 99.9% (error budget: 0.1% = ~43 min/month)\n"
        "Current date/time context: analyze trend based on burn rates.\n"
        # Name the exact keys the parser reads; "Format as JSON" alone leaves
        # the schema up to the model.
        "Respond with a single JSON object with exactly these keys: "
        "will_breach (boolean), hours_until_breach (number, or null if no "
        'breach is expected), severity ("critical", "warning" or "ok"), '
        "likely_cause (string), recommended_action (string)."
    )
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": prompt}],
    )
    text = response.content[0].text if response.content else ""
    return _parse_prediction(text)
# Step 3 — Slack Alerting
# alerter.py
import requests
import os
from predictor import Prediction
from prometheus_client import SLOMetrics
# Emoji prefix used in the Slack message for each known severity value.
SEVERITY_EMOJI = dict(
    critical="🔴",
    warning="🟡",
    ok="🟢",
)
def send_slack_alert(metrics: SLOMetrics, prediction: Prediction):
    """Post a breach prediction to the Slack webhook in SLACK_WEBHOOK_URL.

    Nothing is sent when the prediction's severity is "ok".

    Args:
        metrics: SLO metrics the prediction was based on.
        prediction: the breach forecast to report.

    Raises:
        KeyError: if SLACK_WEBHOOK_URL is not set.
        requests.HTTPError: if Slack answers with an error status, so a
            dropped alert is visible to the caller instead of silently lost.
    """
    if prediction.severity == "ok":
        return  # no alert needed

    emoji = SEVERITY_EMOJI.get(prediction.severity, "⚠️")

    # The estimate comes from model output; only treat it as a number when it
    # is one, otherwise fall back to the generic wording.
    hours = prediction.hours_until_breach
    if isinstance(hours, (int, float)) and hours < 24:
        breach_text = f"*{hours:.1f} hours* to breach"
    else:
        breach_text = "breach likely this month"

    payload = {
        "text": f"{emoji} *SLO Breach Prediction — {metrics.service}*",
        "blocks": [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": (
                        f"{emoji} *SLO Breach Prediction*\n"
                        f"Service: `{metrics.service}`\n"
                        f"Severity: *{str(prediction.severity).upper()}*\n"
                        f"Prediction: {breach_text}\n"
                        f"Error budget remaining: *{metrics.budget_remaining_pct:.1f}%*\n"
                        f"Burn rate (1h): *{metrics.burn_rate_1h:.1f}x*\n\n"
                        f"*Likely cause:* {prediction.likely_cause}\n"
                        f"*Action:* {prediction.recommended_action}"
                    ),
                },
            }
        ],
    }
    # Bounded wait so a hung webhook cannot stall the whole run.
    response = requests.post(
        os.environ["SLACK_WEBHOOK_URL"], json=payload, timeout=10
    )
    response.raise_for_status()
# Step 4 — Kubernetes CronJob
# main.py
import os
from prometheus_client import PrometheusClient
from predictor import predict_breach
from alerter import send_slack_alert
# Service names (Prometheus `job` labels) to evaluate, from the comma-separated
# SERVICES variable. Surrounding whitespace and empty entries are dropped so
# "api, payments," does not produce bogus job labels.
SERVICES = [
    name.strip()
    for name in os.environ.get("SERVICES", "api,payments,auth").split(",")
    if name.strip()
]


def run():
    """Evaluate every configured service once and alert on risky predictions.

    Reads the Prometheus base URL from PROMETHEUS_URL (KeyError if unset).
    A failure while processing one service is printed and does not stop the
    remaining services from being checked.
    """
    prom = PrometheusClient(os.environ["PROMETHEUS_URL"])
    for service in SERVICES:
        try:
            metrics = prom.get_slo_metrics(service)
            prediction = predict_breach(metrics)
            print(f"{service}: burn_rate={metrics.burn_rate_1h:.2f}x "
                  f"severity={prediction.severity}")
            if prediction.severity in ("warning", "critical"):
                send_slack_alert(metrics, prediction)
        except Exception as e:
            # Top-level boundary: report and move on to the next service.
            print(f"Error processing {service}: {e}")


if __name__ == "__main__":
    run()
# k8s-cronjob.yaml
# CronJob that runs the SLO breach predictor every 15 minutes.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: slo-breach-predictor
  namespace: monitoring
spec:
  schedule: "*/15 * * * *" # every 15 minutes
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: predictor
              image: your-registry/slo-predictor:latest
              env:
                - name: PROMETHEUS_URL
                  value: "http://prometheus.monitoring.svc:9090"
                - name: SERVICES
                  value: "api,payments,auth,checkout"
                # Secrets are injected from Kubernetes Secret objects.
                - name: SLACK_WEBHOOK_URL
                  valueFrom:
                    secretKeyRef:
                      name: slack-secret
                      key: webhook-url
                - name: ANTHROPIC_API_KEY
                  valueFrom:
                    secretKeyRef:
                      name: anthropic-secret
                      key: api-key
          restartPolicy: OnFailure
# Alert Example
🔴 SLO Breach Prediction — payments
Service: `payments`
Severity: CRITICAL
Prediction: 2.3 hours to breach
Error budget remaining: 8.2%
Burn rate (1h): 18.4x
Likely cause: Spike in 5xx errors from checkout flow, burn rate
escalated 3x in the last 20 minutes, consistent with a downstream
database connection issue or recent deployment.
Action: Check payments pod logs for database connection errors,
review recent deployments in the last 30 minutes.
This gives on-call engineers context before they even open a dashboard — they know what to look at the moment the alert fires.
Today I Fixed
Short real fixes from production — posted daily
Stay ahead of the curve
Get the latest DevOps, Kubernetes, AWS, and AI/ML guides delivered straight to your inbox. No spam — just practical engineering content.
Related Articles
AI-Powered Kubernetes Anomaly Detection: Beyond Static Thresholds
Static alerts miss 40% of real incidents. Learn how AI and ML-based anomaly detection — using tools like Prometheus + ML, Dynatrace, and custom LLM runbooks — catches what thresholds can't.
Build an AI Alert Classifier for Grafana Using LLMs (2026)
Tired of noisy Grafana alerts that wake you up for nothing? Build an AI layer that classifies incoming alerts as actionable or noise, enriches them with context, and routes them intelligently — using Claude or GPT-4 as the reasoning engine.
AI-Powered Log Analysis Is Replacing Manual Debugging in DevOps (2026)
How LLMs and AI are transforming log analysis, anomaly detection, and root cause analysis — and the tools DevOps engineers should know about in 2026.