Build an AI-Powered Incident Postmortem Generator with Claude
Build a tool that takes PagerDuty incident data, Slack messages, and system metrics and generates a structured blameless postmortem — timeline, root cause, action items — in minutes instead of days.
Postmortems take hours to write and often get skipped or delayed. This tool pulls incident context from PagerDuty, Slack, and metrics automatically, then uses Claude to draft a complete blameless postmortem that your team can review and refine.
What It Generates
- Executive summary (2-3 sentences)
- Incident timeline (from alert to resolution)
- Root cause analysis (5 Whys)
- Impact assessment (users/services affected)
- Contributing factors
- Action items with owners and priorities
- Lessons learned
Architecture
PagerDuty API → Incident data + alert timeline
Slack API → #incidents channel messages
Prometheus → Error rate, latency during window
↓
Context Builder
↓
Claude (analysis + writing)
↓
Structured Postmortem (Markdown or Notion page)
Step 1 — PagerDuty Data Fetcher
# pagerduty_client.py
import requests
from dataclasses import dataclass
from typing import List
from datetime import datetime
@dataclass
class IncidentData:
    """Flattened snapshot of a single PagerDuty incident.

    Instances are produced by ``PagerDutyClient.get_incident`` and carry
    only the fields needed to assemble the postmortem context.
    """

    # Incident identifier, exactly as passed to ``get_incident``.
    id: str
    # Incident title taken from the API payload.
    title: str
    # Populated from the incident's ``urgency`` field.
    severity: str
    # Raw ``created_at`` timestamp string from the API.
    started_at: str
    # Raw ``resolved_at`` timestamp string from the API.
    resolved_at: str
    # Whole minutes between ``started_at`` and ``resolved_at``.
    duration_minutes: int
    # Summary of the affected service ("unknown" when absent).
    service: str
    # Summary of the first trigger log entry ("" when absent).
    triggered_by: str
    # One dict per incident note, with keys "time" and "content".
    notes: List[dict]
    # One dict per alert, with keys "summary" and "time".
    alerts: List[dict]
class PagerDutyClient:
    """Read-only helper that pulls one incident from the PagerDuty REST API.

    Args:
        api_key: PagerDuty API token, sent as ``Token token=<key>``.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the tool indefinitely.
        session: Optional ``requests.Session``-compatible object (anything
            exposing ``get(url, headers=..., timeout=...)``). A new
            ``requests.Session`` is created when omitted.
    """

    def __init__(self, api_key: str, timeout: float = 30.0, session=None):
        self.headers = {
            "Authorization": f"Token token={api_key}",
            "Accept": "application/vnd.pagerduty+json;version=2",
        }
        self.base = "https://api.pagerduty.com"
        self.timeout = timeout
        # A shared session reuses the connection across the three requests
        # that get_incident() issues.
        self.session = session if session is not None else requests.Session()

    def _get(self, path: str) -> dict:
        """GET ``path`` under the API base and return the decoded JSON body.

        Raises whatever ``raise_for_status()`` raises on a non-2xx response,
        so auth/404 problems are reported as HTTP errors instead of a
        confusing ``KeyError`` further down.
        """
        response = self.session.get(
            f"{self.base}{path}", headers=self.headers, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _parse_timestamp(value: str) -> datetime:
        """Parse an ISO-8601 timestamp, accepting a trailing ``Z`` for UTC."""
        # datetime.fromisoformat() only understands "Z" on newer Pythons.
        return datetime.fromisoformat(value.replace("Z", "+00:00"))

    def get_incident(self, incident_id: str) -> "IncidentData":
        """Fetch an incident plus its notes and alerts.

        Args:
            incident_id: PagerDuty incident ID.

        Returns:
            An ``IncidentData`` with the raw start/end timestamps, the
            duration in whole minutes, and simplified notes/alerts lists.

        Raises:
            ValueError: If the incident has no ``resolved_at`` timestamp yet
                (a postmortem window cannot be computed).
            Exception: HTTP errors raised by ``raise_for_status()``.
        """
        inc = self._get(f"/incidents/{incident_id}")["incident"]

        # Validate before fetching notes/alerts: an open incident has no end
        # timestamp, and failing here gives a clear message instead of an
        # AttributeError on None.
        resolved_at = inc.get("resolved_at")
        if not resolved_at:
            raise ValueError(
                f"Incident {incident_id} has no resolved_at timestamp; "
                "generate the postmortem after the incident is resolved."
            )

        started = self._parse_timestamp(inc["created_at"])
        resolved = self._parse_timestamp(resolved_at)
        duration = int((resolved - started).total_seconds() / 60)

        # NOTE(review): both endpoints are read as a single response page;
        # confirm that is sufficient for incidents with many notes/alerts.
        notes = self._get(f"/incidents/{incident_id}/notes").get("notes", [])
        alerts = self._get(f"/incidents/{incident_id}/alerts").get("alerts", [])

        # "or {}" also covers keys that are present but null.
        service = inc.get("service") or {}
        trigger = inc.get("first_trigger_log_entry") or {}

        return IncidentData(
            id=incident_id,
            title=inc["title"],
            severity=inc["urgency"],
            started_at=inc["created_at"],
            resolved_at=resolved_at,
            duration_minutes=duration,
            service=service.get("summary", "unknown"),
            triggered_by=trigger.get("summary", ""),
            notes=[{"time": n["created_at"], "content": n["content"]} for n in notes],
            alerts=[{"summary": a["summary"], "time": a["created_at"]} for a in alerts],
        )


# Step 2 — Slack Message Fetcher
# slack_client.py
from datetime import datetime, timezone
from typing import List

from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
class SlackFetcher:
    """Reads the conversation from a Slack channel for a given time window."""

    def __init__(self, token: str):
        self.client = WebClient(token=token)

    def _display_name(self, msg: dict, cache: dict) -> str:
        """Return a human-readable author for ``msg``.

        ``cache`` maps user IDs to already-resolved names so each user is
        looked up at most once per fetch.
        """
        user_id = msg.get("user")
        if not user_id:
            # Bot/system messages carry no user ID; looking up "" would make
            # the API call fail, so use the message's own username if any.
            return msg.get("username") or "Unknown"

        if user_id not in cache:
            try:
                info = self.client.users_info(user=user_id)
            except SlackApiError:
                # Best effort: one unresolvable user must not abort the run.
                info = None
            name = None
            if info is not None and info["ok"]:
                profile = info["user"]
                name = profile.get("real_name") or profile.get("name")
            cache[user_id] = name or "Unknown"
        return cache[user_id]

    def get_incident_messages(
        self, channel_id: str, start_ts: str, end_ts: str
    ) -> List[dict]:
        """Fetch messages from incident channel during incident window.

        Args:
            channel_id: Slack channel ID to read.
            start_ts: ISO-8601 start of the window (a trailing ``Z`` is accepted).
            end_ts: ISO-8601 end of the window (a trailing ``Z`` is accepted).

        Returns:
            Dicts with keys ``user``, ``time`` (UTC ISO-8601) and ``text``,
            ordered oldest first.
        """
        start = datetime.fromisoformat(start_ts.replace("Z", "+00:00"))
        end = datetime.fromisoformat(end_ts.replace("Z", "+00:00"))

        # Follow the pagination cursor so incidents with more than one page
        # of chatter are not silently truncated.
        raw_messages = []
        cursor = None
        while True:
            page = self.client.conversations_history(
                channel=channel_id,
                oldest=str(start.timestamp()),
                latest=str(end.timestamp()),
                limit=200,
                cursor=cursor,
            )
            raw_messages.extend(page.get("messages", []))
            cursor = (page.get("response_metadata") or {}).get("next_cursor")
            if not cursor:
                break

        names = {}
        messages = []
        # Sort on the numeric Slack timestamp rather than its formatted string.
        for msg in sorted(raw_messages, key=lambda m: float(m["ts"])):
            sent_at = datetime.fromtimestamp(float(msg["ts"]), tz=timezone.utc)
            messages.append({
                "user": self._display_name(msg, names),
                "time": sent_at.isoformat(),
                "text": msg.get("text", ""),
            })
        return messages


# Step 3 — Metrics Context
# metrics_collector.py
import requests
from datetime import datetime, timedelta
class MetricsCollector:
    """Summarises Prometheus metrics for a service over an incident window.

    Args:
        prometheus_url: Base URL of the Prometheus server.
        timeout: Per-request timeout in seconds.
        session: Optional ``requests.Session``-compatible object (anything
            exposing ``get(url, params=..., timeout=...)``). A new
            ``requests.Session`` is created when omitted.
    """

    def __init__(self, prometheus_url: str, timeout: float = 30.0, session=None):
        self.url = prometheus_url
        self.timeout = timeout
        self.session = session if session is not None else requests.Session()

    @staticmethod
    def _escape_label_value(value: str) -> str:
        """Escape ``value`` for use inside a double-quoted PromQL string."""
        return (
            value.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    def _query_range(self, promql: str, start_ts: float, end_ts: float) -> dict:
        """Run a range query at a 60-step and return ``{"max", "avg"}``.

        Only the first returned series is summarised. Returns ``{}`` when
        the query yields no series or no finite samples.
        """
        import math

        response = self.session.get(
            f"{self.url.rstrip('/')}/api/v1/query_range",
            params={
                "query": promql,
                "start": start_ts,
                "end": end_ts,
                "step": "60",
            },
            timeout=self.timeout,
        )
        response.raise_for_status()

        series = response.json().get("data", {}).get("result", [])
        if not series:
            return {}

        # Ratio queries produce NaN samples when the denominator is zero;
        # drop non-finite samples so they cannot poison max()/avg.
        samples = [float(raw) for _, raw in series[0]["values"]]
        finite = [s for s in samples if math.isfinite(s)]
        if not finite:
            return {}
        return {"max": max(finite), "avg": sum(finite) / len(finite)}

    def get_incident_metrics(self, service: str, start: str, end: str) -> dict:
        """Get key metrics during incident window.

        Args:
            service: Value matched against the ``job`` label.
            start: ISO-8601 start of the window (a trailing ``Z`` is accepted).
            end: ISO-8601 end of the window (a trailing ``Z`` is accepted).

        Returns:
            Dict with keys ``error_rate``, ``p99_latency_ms`` and
            ``request_rate``; each value is ``{"max": ..., "avg": ...}`` or
            ``{}`` when no data was returned.
        """
        start_ts = datetime.fromisoformat(start.replace("Z", "+00:00")).timestamp()
        end_ts = datetime.fromisoformat(end.replace("Z", "+00:00")).timestamp()
        job = self._escape_label_value(service)

        queries = {
            "error_rate": (
                f'sum(rate(http_requests_total{{job="{job}",code=~"5.."}}[5m])) / '
                f'sum(rate(http_requests_total{{job="{job}"}}[5m]))'
            ),
            "p99_latency_ms": (
                f'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{{job="{job}"}}[5m])) by (le)) * 1000'
            ),
            "request_rate": f'sum(rate(http_requests_total{{job="{job}"}}[5m]))',
        }
        return {
            name: self._query_range(promql, start_ts, end_ts)
            for name, promql in queries.items()
        }


# Step 4 — Postmortem Generator
# generator.py
import anthropic
import json
from pagerduty_client import PagerDutyClient, IncidentData
from slack_client import SlackFetcher
from metrics_collector import MetricsCollector
# Module-level Anthropic client used by generate_postmortem(). It is built
# with no arguments; presumably the SDK picks up ANTHROPIC_API_KEY from the
# environment (the usage section exports it) — confirm against the SDK docs.
client = anthropic.Anthropic()
# System prompt passed as `system=` in generate_postmortem(). It fixes the
# section layout (summary, timeline, root cause, contributing factors,
# impact, action items, lessons learned) and the blameless tone.
POSTMORTEM_PROMPT = """You are an experienced SRE writing a blameless postmortem.
Using the incident data provided, write a complete postmortem with these sections:
## Executive Summary
2-3 sentences: what happened, impact, duration.
## Timeline
Chronological events in table format: | Time | Event |
## Root Cause
What actually caused the incident. Use 5 Whys if appropriate.
## Contributing Factors
What made this worse or harder to detect/resolve.
## Impact
- Users/customers affected
- Services affected
- Error budget consumed (if SLO data available)
## Action Items
Table: | Priority | Action | Owner | Due Date |
Priority: P1 (immediate), P2 (this sprint), P3 (this quarter)
## Lessons Learned
What we learned that we didn't know before.
Write in a blameless tone — systems failed, not people. Be specific and actionable."""
def _build_incident_context(
    incident, slack_messages, metrics, team_name, max_slack_messages=50
):
    """Render the incident, Slack and metrics data as the user-message text."""
    shown = slack_messages[:max_slack_messages]
    slack_header = "SLACK MESSAGES (incident channel):"
    if len(shown) < len(slack_messages):
        # Tell the model the transcript is partial so it does not treat the
        # last visible message as the end of the incident.
        slack_header = (
            f"SLACK MESSAGES (incident channel, first {len(shown)} "
            f"of {len(slack_messages)}):"
        )

    def stat(metric_name, key):
        # "or {}" also covers a metric that is present but None.
        return (metrics.get(metric_name) or {}).get(key, "N/A")

    lines = [
        "INCIDENT DATA:",
        f"ID: {incident.id}",
        f"Title: {incident.title}",
        f"Severity: {incident.severity}",
        f"Service: {incident.service}",
        f"Responding team: {team_name}",
        f"Duration: {incident.duration_minutes} minutes",
        f"Started: {incident.started_at}",
        f"Resolved: {incident.resolved_at}",
        f"Triggered by: {incident.triggered_by}",
        "PAGERDUTY ALERTS:",
        json.dumps(incident.alerts, indent=2, ensure_ascii=False),
        "PAGERDUTY NOTES:",
        json.dumps(incident.notes, indent=2, ensure_ascii=False),
        slack_header,
        json.dumps(shown, indent=2, ensure_ascii=False),
        "METRICS DURING INCIDENT:",
        f"Error rate peak: {stat('error_rate', 'max')}",
        f"P99 latency peak: {stat('p99_latency_ms', 'max')} ms",
        f"Request rate: {stat('request_rate', 'avg')} req/s",
    ]
    return "\n".join(lines)


def generate_postmortem(
    incident: IncidentData,
    slack_messages: list,
    metrics: dict,
    team_name: str = "on-call team",
) -> str:
    """Draft a blameless postmortem for ``incident`` with Claude.

    Args:
        incident: Incident details, notes and alerts from PagerDuty.
        slack_messages: Incident-channel messages (dicts); only the first 50
            are sent to the model.
        metrics: Mapping with optional ``error_rate``, ``p99_latency_ms``
            and ``request_rate`` entries, each a dict with ``max``/``avg``.
        team_name: Name of the responding team, included in the context.

    Returns:
        The generated postmortem text.

    Raises:
        RuntimeError: If the response contains no text content.
    """
    context = _build_incident_context(incident, slack_messages, metrics, team_name)
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=4096,
        system=POSTMORTEM_PROMPT,
        messages=[{"role": "user", "content": context}],
    )
    # Collect every text block instead of assuming the first block exists
    # and is the only one carrying text.
    parts = [
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    ]
    if not parts:
        raise RuntimeError("Claude returned no text content for the postmortem")
    return "".join(parts)


# Step 5 — CLI
# main.py
import click
import os
from generator import generate_postmortem
from pagerduty_client import PagerDutyClient
from slack_client import SlackFetcher
from metrics_collector import MetricsCollector
@click.command()
@click.option("--incident-id", required=True, help="PagerDuty incident ID")
@click.option("--slack-channel", default="C01234567", help="Slack channel ID for #incidents")
@click.option("--output", default="postmortem.md", help="Output file")
def generate(incident_id, slack_channel, output):
    """Generate a postmortem for a PagerDuty incident."""
    # Fail with a readable usage error (not a KeyError traceback) when a
    # credential is missing, and do so before any network call is made.
    missing = [
        name
        for name in ("PAGERDUTY_API_KEY", "SLACK_TOKEN")
        if not os.environ.get(name)
    ]
    if missing:
        raise click.UsageError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    pd = PagerDutyClient(os.environ["PAGERDUTY_API_KEY"])
    slack = SlackFetcher(os.environ["SLACK_TOKEN"])
    metrics = MetricsCollector(os.environ.get("PROMETHEUS_URL", "http://localhost:9090"))

    click.echo(f"Fetching incident {incident_id}...")
    incident = pd.get_incident(incident_id)

    click.echo(f"Fetching Slack messages from {incident.started_at} to {incident.resolved_at}...")
    messages = slack.get_incident_messages(slack_channel, incident.started_at, incident.resolved_at)

    click.echo("Fetching metrics...")
    metric_data = metrics.get_incident_metrics(incident.service, incident.started_at, incident.resolved_at)

    click.echo("Generating postmortem with Claude...")
    postmortem = generate_postmortem(incident, messages, metric_data)

    # Create the target directory first so a path such as
    # "postmortems/<name>.md" does not fail after the draft was generated.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Explicit UTF-8: the draft can contain non-ASCII characters that the
    # platform default encoding may not be able to write.
    with open(output, "w", encoding="utf-8") as f:
        f.write(postmortem)

    click.echo(f"✅ Postmortem saved to {output}")
    click.echo(f"Duration: {incident.duration_minutes} minutes | Service: {incident.service}")


if __name__ == "__main__":
    generate()


# Usage
export PAGERDUTY_API_KEY="your-key"
export SLACK_TOKEN="xoxb-your-token"
export ANTHROPIC_API_KEY="your-key"
# Generate postmortem
python main.py --incident-id P1ABC23 --output postmortems/2026-06-09-payment-outage.md
# View output
cat postmortems/2026-06-09-payment-outage.md
The draft postmortem is ready in 2 minutes instead of 2 hours. The team reviews, adjusts attribution and action items, and publishes. The AI does the synthesis; the humans validate and own the action items.
Today I Fixed
Short real fixes from production — posted daily
Stay ahead of the curve
Get the latest DevOps, Kubernetes, AWS, and AI/ML guides delivered straight to your inbox. No spam — just practical engineering content.
Related Articles
AI-Powered Kubernetes Anomaly Detection: Beyond Static Thresholds
Static alerts miss 40% of real incidents. Learn how AI and ML-based anomaly detection — using tools like Prometheus + ML, Dynatrace, and custom LLM runbooks — catches what thresholds can't.
AI-Powered Log Analysis Is Replacing Manual Debugging in DevOps (2026)
How LLMs and AI are transforming log analysis, anomaly detection, and root cause analysis — and the tools DevOps engineers should know about in 2026.
AI-Powered Log Analysis — How LLMs Are Replacing grep for DevOps Engineers
How to use LLMs and AI tools for intelligent log analysis in DevOps. Covers practical workflows, open-source tools, prompt engineering for logs, and building custom log analysis agents.