🎉 DevOps Interview Prep Bundle is live — 1000+ Q&A across 20 topics. Get it →
All Articles

Build an AI-Powered Incident Postmortem Generator with Claude

Build a tool that takes PagerDuty incident data, Slack messages, and system metrics and generates a structured blameless postmortem — timeline, root cause, action items — in minutes instead of days.

DevOpsBoys · Jun 9, 2026 · 5 min read
Share: Tweet

Postmortems take hours to write and often get skipped or delayed. This tool pulls incident context from PagerDuty, Slack, and metrics automatically, then uses Claude to draft a complete blameless postmortem that your team can review and refine.


What It Generates

  • Executive summary (2-3 sentences)
  • Incident timeline (from alert to resolution)
  • Root cause analysis (5 Whys)
  • Impact assessment (users/services affected)
  • Contributing factors
  • Action items with owners and priorities
  • Lessons learned

Architecture

PagerDuty API → Incident data + alert timeline
Slack API     → #incidents channel messages
Prometheus    → Error rate, latency during window
      ↓
Context Builder
      ↓
Claude (analysis + writing)
      ↓
Structured Postmortem (Markdown or Notion page)

Step 1 — PagerDuty Data Fetcher

python
# pagerduty_client.py
import requests
from dataclasses import dataclass
from typing import List
from datetime import datetime
 
 
@dataclass
class IncidentData:
    """Snapshot of one PagerDuty incident, used as input for the postmortem."""

    id: str  # incident ID exactly as passed to PagerDutyClient.get_incident()
    title: str
    severity: str  # filled from the incident's "urgency" field
    started_at: str  # raw "created_at" timestamp string from the API
    resolved_at: str  # raw "resolved_at" timestamp string from the API
    duration_minutes: int  # whole minutes between started_at and resolved_at
    service: str  # service summary, "unknown" when the API gives none
    triggered_by: str  # summary of the first trigger log entry, "" when absent
    notes: List[dict]  # [{"time": ..., "content": ...}, ...]
    alerts: List[dict]  # [{"summary": ..., "time": ...}, ...]


class PagerDutyClient:
    """Small read-only client for the PagerDuty REST API (v2)."""

    def __init__(self, api_key: str, timeout: float = 10.0):
        """Store auth headers and request settings.

        Args:
            api_key: PagerDuty REST API token.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.headers = {
            "Authorization": f"Token token={api_key}",
            "Accept": "application/vnd.pagerduty+json;version=2",
        }
        self.base = "https://api.pagerduty.com"
        self.timeout = timeout

    def _get(self, path: str) -> dict:
        """GET ``path`` relative to the API base and return the decoded JSON body."""
        response = requests.get(
            f"{self.base}{path}", headers=self.headers, timeout=self.timeout
        )
        # Fail loudly on 4xx/5xx instead of a confusing KeyError further down.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _parse_timestamp(value: str) -> datetime:
        """Parse an ISO-8601 timestamp, accepting a trailing ``Z`` for UTC."""
        return datetime.fromisoformat(value.replace("Z", "+00:00"))

    def get_incident(self, incident_id: str) -> IncidentData:
        """Fetch an incident together with its notes and alerts.

        Args:
            incident_id: PagerDuty incident ID.

        Returns:
            An ``IncidentData`` populated from the incident, notes and alerts
            endpoints.

        Raises:
            ValueError: If the incident has no ``resolved_at`` timestamp yet,
                so no duration can be computed.
            requests.HTTPError: If any PagerDuty request returns an error status.
        """
        inc = self._get(f"/incidents/{incident_id}")["incident"]

        resolved_at = inc.get("resolved_at")
        if not resolved_at:
            raise ValueError(
                f"Incident {incident_id} is not resolved yet; "
                "a postmortem needs a resolved incident"
            )

        notes = self._get(f"/incidents/{incident_id}/notes").get("notes", [])
        alerts = self._get(f"/incidents/{incident_id}/alerts").get("alerts", [])

        started = self._parse_timestamp(inc["created_at"])
        resolved = self._parse_timestamp(resolved_at)
        duration = int((resolved - started).total_seconds() / 60)

        # `or {}` covers keys that are present but null, not just missing keys.
        service = inc.get("service") or {}
        first_trigger = inc.get("first_trigger_log_entry") or {}

        return IncidentData(
            id=incident_id,
            title=inc["title"],
            severity=inc["urgency"],
            started_at=inc["created_at"],
            resolved_at=resolved_at,
            duration_minutes=duration,
            service=service.get("summary", "unknown"),
            triggered_by=first_trigger.get("summary", ""),
            notes=[{"time": n["created_at"], "content": n["content"]} for n in notes],
            alerts=[{"summary": a["summary"], "time": a["created_at"]} for a in alerts],
        )

Step 2 — Slack Message Fetcher

python
# slack_client.py
from slack_sdk import WebClient
from datetime import datetime, timezone
 
 
class SlackFetcher:
    """Reads the messages posted in a Slack channel during an incident."""

    def __init__(self, token: str):
        """Create a Slack Web API client authenticated with ``token``."""
        self.client = WebClient(token=token)

    def get_incident_messages(
        self, channel_id: str, start_ts: str, end_ts: str
    ) -> list[dict]:
        """Fetch messages from incident channel during incident window.

        Args:
            channel_id: Slack channel ID to read.
            start_ts: ISO-8601 start of the window (a trailing ``Z`` is accepted).
            end_ts: ISO-8601 end of the window (a trailing ``Z`` is accepted).

        Returns:
            Messages in chronological order, each as
            ``{"user": display name, "time": ISO-8601 UTC, "text": body}``.
        """
        start = datetime.fromisoformat(start_ts.replace("Z", "+00:00"))
        end = datetime.fromisoformat(end_ts.replace("Z", "+00:00"))

        # Follow the pagination cursor so busy incidents are not cut off
        # after the first page of 200 messages.
        raw_messages = []
        cursor = None
        while True:
            request = {
                "channel": channel_id,
                "oldest": str(start.timestamp()),
                "latest": str(end.timestamp()),
                "limit": 200,
            }
            if cursor:
                request["cursor"] = cursor
            page = self.client.conversations_history(**request)
            raw_messages.extend(page.get("messages", []))
            cursor = (page.get("response_metadata") or {}).get("next_cursor")
            if not cursor:
                break

        # Sort on the numeric Slack timestamp rather than on formatted text.
        raw_messages.sort(key=lambda m: float(m["ts"]))

        names = {}  # user ID -> display name, so each user is looked up once
        messages = []
        for msg in raw_messages:
            user_id = msg.get("user")
            if not user_id:
                # Messages without a user ID (e.g. integrations) cannot be
                # resolved via users_info; use their own username if present.
                username = msg.get("username") or "Unknown"
            else:
                if user_id not in names:
                    info = self.client.users_info(user=user_id)
                    profile = info.get("user") or {}
                    names[user_id] = (
                        profile.get("real_name") or profile.get("name") or "Unknown"
                        if info.get("ok")
                        else "Unknown"
                    )
                username = names[user_id]

            ts = datetime.fromtimestamp(float(msg["ts"]), tz=timezone.utc)
            messages.append({
                "user": username,
                "time": ts.isoformat(),
                "text": msg.get("text", ""),
            })

        return messages

Step 3 — Metrics Context

python
# metrics_collector.py
import requests
from datetime import datetime, timedelta
 
 
class MetricsCollector:
    """Summarises Prometheus range-query results over an incident window."""

    def __init__(self, prometheus_url: str, timeout: float = 10.0):
        """Remember where Prometheus lives.

        Args:
            prometheus_url: Base URL of the Prometheus server.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = prometheus_url
        self.timeout = timeout

    @staticmethod
    def _escape_label_value(value: str) -> str:
        """Escape ``value`` for use inside a double-quoted PromQL label matcher."""
        return (
            value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
        )

    @staticmethod
    def _summarize(result: list) -> dict:
        """Reduce the first returned series to ``{"max": ..., "avg": ...}``.

        Returns an empty dict when there is no series or no finite sample.
        """
        import math

        if not result:
            return {}
        samples = [float(v[1]) for v in result[0]["values"]]
        # A ratio query yields NaN (0/0) while there is no traffic; NaN would
        # otherwise poison both max() and the average.
        finite = [s for s in samples if math.isfinite(s)]
        if not finite:
            return {}
        return {"max": max(finite), "avg": sum(finite) / len(finite)}

    def _query_range(self, promql: str, start_ts: float, end_ts: float) -> list:
        """Run a range query at 60-second resolution and return the result list."""
        r = requests.get(
            f"{self.url}/api/v1/query_range",
            params={
                "query": promql,
                "start": start_ts,
                "end": end_ts,
                "step": "60",
            },
            timeout=self.timeout,
        )
        return r.json().get("data", {}).get("result", [])

    def get_incident_metrics(self, service: str, start: str, end: str) -> dict:
        """Get key metrics during incident window.

        Args:
            service: Value matched against the ``job`` label.
            start: ISO-8601 start of the window (a trailing ``Z`` is accepted).
            end: ISO-8601 end of the window (a trailing ``Z`` is accepted).

        Returns:
            A dict with keys ``error_rate``, ``p99_latency_ms`` and
            ``request_rate``; each value is ``{"max": float, "avg": float}``
            or ``{}`` when Prometheus returned no usable samples.
        """
        start_ts = datetime.fromisoformat(start.replace("Z", "+00:00")).timestamp()
        end_ts = datetime.fromisoformat(end.replace("Z", "+00:00")).timestamp()

        # The service name comes from an external system; escape it so quotes
        # or backslashes cannot break out of the label matcher.
        job = self._escape_label_value(service)

        queries = {
            "error_rate": (
                f'sum(rate(http_requests_total{{job="{job}",code=~"5.."}}[5m])) / '
                f'sum(rate(http_requests_total{{job="{job}"}}[5m]))'
            ),
            "p99_latency_ms": (
                f'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{{job="{job}"}}[5m])) by (le)) * 1000'
            ),
            "request_rate": (
                f'sum(rate(http_requests_total{{job="{job}"}}[5m]))'
            ),
        }
        return {
            name: self._summarize(self._query_range(promql, start_ts, end_ts))
            for name, promql in queries.items()
        }

Step 4 — Postmortem Generator

python
# generator.py
import anthropic
import json
from pagerduty_client import PagerDutyClient, IncidentData
from slack_client import SlackFetcher
from metrics_collector import MetricsCollector
 
# Module-level Anthropic client, created once at import time and used by
# generate_postmortem(). It is constructed without arguments.
# NOTE(review): presumably picks up ANTHROPIC_API_KEY from the environment — confirm.
client = anthropic.Anthropic()
 
# System prompt passed as `system=` to the Messages API call. It fixes the
# section layout of the generated postmortem and asks for a blameless tone.
POSTMORTEM_PROMPT = """You are an experienced SRE writing a blameless postmortem.
Using the incident data provided, write a complete postmortem with these sections:
 
## Executive Summary
2-3 sentences: what happened, impact, duration.
 
## Timeline
Chronological events in table format: | Time | Event |
 
## Root Cause
What actually caused the incident. Use 5 Whys if appropriate.
 
## Contributing Factors
What made this worse or harder to detect/resolve.
 
## Impact
- Users/customers affected
- Services affected
- Error budget consumed (if SLO data available)
 
## Action Items
Table: | Priority | Action | Owner | Due Date |
Priority: P1 (immediate), P2 (this sprint), P3 (this quarter)
 
## Lessons Learned
What we learned that we didn't know before.
 
Write in a blameless tone — systems failed, not people. Be specific and actionable."""
 
 
def generate_postmortem(
    incident: IncidentData,
    slack_messages: list,
    metrics: dict,
    team_name: str = "on-call team",
) -> str:
    """Draft a blameless postmortem for ``incident`` with Claude.

    Args:
        incident: Incident details, notes and alerts from PagerDuty.
        slack_messages: Channel messages from the incident window; only the
            first 50 are sent to keep the prompt bounded.
        metrics: Mapping with optional ``error_rate``, ``p99_latency_ms`` and
            ``request_rate`` entries, each a dict that may hold ``max``/``avg``.
        team_name: Name of the responding team, included in the context.

    Returns:
        The generated postmortem text.

    Raises:
        RuntimeError: If the model response contains no text content.
    """
    # ensure_ascii=False keeps names and non-English text readable in the
    # prompt instead of turning them into \uXXXX escapes.
    notes_json = json.dumps(incident.notes, indent=2, ensure_ascii=False)
    alerts_json = json.dumps(incident.alerts, indent=2, ensure_ascii=False)
    slack_json = json.dumps(slack_messages[:50], indent=2, ensure_ascii=False)

    context = f"""INCIDENT DATA:
ID: {incident.id}
Title: {incident.title}
Severity: {incident.severity}
Service: {incident.service}
Responding team: {team_name}
Duration: {incident.duration_minutes} minutes
Started: {incident.started_at}
Resolved: {incident.resolved_at}
Triggered by: {incident.triggered_by}
 
PAGERDUTY NOTES:
{notes_json}
 
PAGERDUTY ALERTS:
{alerts_json}
 
SLACK MESSAGES (incident channel):
{slack_json}
 
METRICS DURING INCIDENT:
Error rate peak: {metrics.get('error_rate', {}).get('max', 'N/A')}
P99 latency peak: {metrics.get('p99_latency_ms', {}).get('max', 'N/A')} ms
Request rate: {metrics.get('request_rate', {}).get('avg', 'N/A')} req/s"""

    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=4096,
        system=POSTMORTEM_PROMPT,
        messages=[{"role": "user", "content": context}],
    )

    # Collect every text block rather than assuming the first content block
    # exists and is text.
    text = "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )
    if not text:
        raise RuntimeError("Claude returned no text content for the postmortem")
    return text

Step 5 — CLI

python
# main.py
import click
import os
from generator import generate_postmortem
from pagerduty_client import PagerDutyClient
from slack_client import SlackFetcher
from metrics_collector import MetricsCollector
 
 
@click.command()
@click.option("--incident-id", required=True, help="PagerDuty incident ID")
@click.option("--slack-channel", default="C01234567", help="Slack channel ID for #incidents")
@click.option("--output", default="postmortem.md", help="Output file")
def generate(incident_id, slack_channel, output):
    """Generate a postmortem for a PagerDuty incident."""
    # The docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Flow: fetch the incident from PagerDuty, fetch Slack messages and
    # Prometheus metrics for the same time window, ask Claude for a draft,
    # then write the draft to `output`.

    # Report missing credentials as a usage error instead of a KeyError traceback.
    missing = [
        name for name in ("PAGERDUTY_API_KEY", "SLACK_TOKEN")
        if not os.environ.get(name)
    ]
    if missing:
        raise click.UsageError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    pd = PagerDutyClient(os.environ["PAGERDUTY_API_KEY"])
    slack = SlackFetcher(os.environ["SLACK_TOKEN"])
    metrics = MetricsCollector(os.environ.get("PROMETHEUS_URL", "http://localhost:9090"))

    click.echo(f"Fetching incident {incident_id}...")
    incident = pd.get_incident(incident_id)

    click.echo(f"Fetching Slack messages from {incident.started_at} to {incident.resolved_at}...")
    messages = slack.get_incident_messages(slack_channel, incident.started_at, incident.resolved_at)

    click.echo("Fetching metrics...")
    metric_data = metrics.get_incident_metrics(incident.service, incident.started_at, incident.resolved_at)

    click.echo("Generating postmortem with Claude...")
    postmortem = generate_postmortem(incident, messages, metric_data)

    # Create the target directory if needed so paths like
    # "postmortems/<name>.md" work on a fresh checkout.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8: the draft may contain non-ASCII text and the platform
    # default encoding is not guaranteed to handle it.
    with open(output, "w", encoding="utf-8") as f:
        f.write(postmortem)

    click.echo(f"✅ Postmortem saved to {output}")
    click.echo(f"Duration: {incident.duration_minutes} minutes | Service: {incident.service}")
 
 
# Invoke the click command only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    generate()

Usage

bash
export PAGERDUTY_API_KEY="your-key"
export SLACK_TOKEN="xoxb-your-token"
export ANTHROPIC_API_KEY="your-key"
# Optional: main.py falls back to http://localhost:9090 when this is unset
export PROMETHEUS_URL="http://localhost:9090"

# Make sure the output directory exists
mkdir -p postmortems

# Generate postmortem
python main.py --incident-id P1ABC23 --output postmortems/2026-06-09-payment-outage.md

# View output
cat postmortems/2026-06-09-payment-outage.md

The draft postmortem is ready in 2 minutes instead of 2 hours. The team reviews, adjusts attribution and action items, and publishes. The AI does the synthesis; the humans validate and own the action items.

🔧

Today I Fixed

Short real fixes from production — posted daily

Browse fixes
Newsletter

Stay ahead of the curve

Get the latest DevOps, Kubernetes, AWS, and AI/ML guides delivered straight to your inbox. No spam — just practical engineering content.

Related Articles

Comments