Build an AI-Powered Incident Postmortem Generator with Claude

Build a tool that takes PagerDuty incident data, Slack messages, and system metrics and generates a structured blameless postmortem — timeline, root cause, action items — in minutes instead of days.

Postmortems take hours to write and often get skipped or delayed. This tool pulls incident context from PagerDuty, Slack, and metrics automatically, then uses Claude to draft a complete blameless postmortem that your team can review and refine.

What It Generates

Executive summary (2-3 sentences)
Incident timeline (from alert to resolution)
Root cause analysis (5 Whys)
Impact assessment (users/services affected)
Contributing factors
Action items with owners and priorities
Lessons learned

Architecture

PagerDuty API → Incident data + alert timeline
Slack API     → #incidents channel messages
Prometheus    → Error rate, latency during window
      ↓
Context Builder
      ↓
Claude (analysis + writing)
      ↓
Structured Postmortem (Markdown file; publishing to Notion is not covered here)

Step 1 — PagerDuty Data Fetcher

python

# pagerduty_client.py
import requests
from dataclasses import dataclass
from typing import List
from datetime import datetime
 
 
@dataclass
class IncidentData:
    """Flattened view of one PagerDuty incident, as assembled by PagerDutyClient.get_incident."""

    # Incident ID exactly as passed to get_incident().
    id: str
    # Incident title from the PagerDuty incident record.
    title: str
    # Filled from the incident's "urgency" field, not a separate severity/priority field.
    severity: str
    # Raw "created_at" timestamp string from PagerDuty (parsed elsewhere as ISO 8601).
    started_at: str
    # Raw "resolved_at" timestamp string from PagerDuty.
    resolved_at: str
    # Whole minutes between started_at and resolved_at; the fraction is dropped by int().
    duration_minutes: int
    # Summary of the incident's service, or "unknown" when the record has none.
    service: str
    # Summary of the first trigger log entry, or "" when absent.
    triggered_by: str
    # One dict per incident note, with keys "time" and "content".
    notes: List[dict]
    # One dict per alert, with keys "summary" and "time".
    alerts: List[dict]
 
 
class PagerDutyClient:
    """Read-only client for the PagerDuty REST API v2 incident endpoints.

    Args:
        api_key: PagerDuty API key, sent as ``Authorization: Token token=<key>``.
        timeout: Seconds to wait on each HTTP request, so a stalled connection
            cannot hang the tool indefinitely.
    """

    def __init__(self, api_key: str, timeout: float = 10.0):
        self.headers = {
            "Authorization": f"Token token={api_key}",
            "Accept": "application/vnd.pagerduty+json;version=2",
        }
        self.base = "https://api.pagerduty.com"
        self.timeout = timeout

    def _get(self, path: str, params=None) -> dict:
        """GET ``path`` under the API base URL and return the decoded JSON body.

        Raises:
            requests.HTTPError: on a 4xx/5xx answer, instead of letting an error
                payload fail later with an unrelated ``KeyError``.
        """
        response = requests.get(
            f"{self.base}{path}",
            headers=self.headers,
            params=params,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _duration_minutes(started_at: str, resolved_at: str) -> int:
        """Return the whole minutes between two ISO 8601 timestamps."""
        # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on,
        # so normalise it to an explicit UTC offset first.
        started = datetime.fromisoformat(started_at.replace("Z", "+00:00"))
        resolved = datetime.fromisoformat(resolved_at.replace("Z", "+00:00"))
        return int((resolved - started).total_seconds() / 60)

    def get_incident(self, incident_id: str) -> "IncidentData":
        """Fetch an incident with its notes and alerts and flatten them.

        Args:
            incident_id: PagerDuty incident ID (e.g. ``P1ABC23``).

        Returns:
            An ``IncidentData`` whose timestamps are the raw strings from PagerDuty.

        Raises:
            requests.HTTPError: if any of the three API calls returns 4xx/5xx.
            ValueError: if the incident has no ``resolved_at`` yet, because the
                incident window (and therefore the duration) is still open.
        """
        inc = self._get(f"/incidents/{incident_id}")["incident"]

        resolved_at = inc.get("resolved_at")
        if not resolved_at:
            raise ValueError(
                f"Incident {incident_id} is not resolved yet "
                f"(status: {inc.get('status', 'unknown')}); "
                "a postmortem needs a closed incident window"
            )

        notes = self._get(f"/incidents/{incident_id}/notes").get("notes", [])
        # The alerts listing is paginated; ask for the largest page so a noisy
        # incident is not cut down to the small default page.
        alerts = self._get(
            f"/incidents/{incident_id}/alerts", params={"limit": 100}
        ).get("alerts", [])

        return IncidentData(
            id=incident_id,
            title=inc["title"],
            # PagerDuty's "urgency" is the closest field used here for severity.
            severity=inc["urgency"],
            started_at=inc["created_at"],
            resolved_at=resolved_at,
            duration_minutes=self._duration_minutes(inc["created_at"], resolved_at),
            # `or {}` also covers keys that are present but null.
            service=(inc.get("service") or {}).get("summary", "unknown"),
            triggered_by=(inc.get("first_trigger_log_entry") or {}).get("summary", ""),
            notes=[{"time": n["created_at"], "content": n["content"]} for n in notes],
            alerts=[{"summary": a["summary"], "time": a["created_at"]} for a in alerts],
        )

Step 2 — Slack Message Fetcher

python

# slack_client.py
from slack_sdk import WebClient
from datetime import datetime, timezone
 
 
class SlackFetcher:
    """Reads the conversation that took place in a Slack channel during an incident."""

    def __init__(self, token: str):
        self.client = WebClient(token=token)

    def _display_name(self, msg: dict, cache: dict) -> str:
        """Return a readable author name for ``msg``, looking each user up once."""
        user_id = msg.get("user")
        if not user_id:
            # Bot/integration posts carry no user id, and users.info rejects an
            # empty one, so fall back to the posting name when there is one.
            return msg.get("username") or "Unknown"
        if user_id not in cache:
            info = self.client.users_info(user=user_id)
            profile = info["user"] if info["ok"] else {}
            cache[user_id] = profile.get("real_name") or profile.get("name") or "Unknown"
        return cache[user_id]

    def get_incident_messages(
        self, channel_id: str, start_ts: str, end_ts: str
    ) -> list[dict]:
        """Fetch messages from incident channel during incident window.

        Args:
            channel_id: Slack channel ID (not the channel name).
            start_ts: ISO 8601 start of the window; a trailing ``Z`` is accepted.
            end_ts: ISO 8601 end of the window; a trailing ``Z`` is accepted.

        Returns:
            Messages in chronological order, each a dict with ``user``,
            ``time`` (UTC ISO 8601) and ``text``.
        """
        start = datetime.fromisoformat(start_ts.replace("Z", "+00:00"))
        end = datetime.fromisoformat(end_ts.replace("Z", "+00:00"))

        # conversations.history returns one page at a time; follow the cursor so
        # a busy incident channel is not cut off after the first 200 messages.
        raw_messages = []
        cursor = None
        while True:
            page = self.client.conversations_history(
                channel=channel_id,
                oldest=str(start.timestamp()),
                latest=str(end.timestamp()),
                limit=200,
                cursor=cursor,
            )
            raw_messages.extend(page.get("messages", []))
            cursor = (page.get("response_metadata") or {}).get("next_cursor")
            if not cursor:
                break

        names = {}
        messages = []
        # Sort on the numeric Slack timestamp rather than on the formatted string.
        for msg in sorted(raw_messages, key=lambda m: float(m["ts"])):
            sent_at = datetime.fromtimestamp(float(msg["ts"]), tz=timezone.utc)
            messages.append({
                "user": self._display_name(msg, names),
                "time": sent_at.isoformat(),
                "text": msg.get("text", ""),
            })
        return messages

Step 3 — Metrics Context

python

# metrics_collector.py
import requests
from datetime import datetime, timedelta
 
 
class MetricsCollector:
    """Summarises a service's HTTP metrics over a time window via the Prometheus HTTP API.

    Args:
        prometheus_url: Base URL of the Prometheus server.
        timeout: Seconds to wait on each query before giving up.
    """

    def __init__(self, prometheus_url: str, timeout: float = 10.0):
        self.url = prometheus_url
        self.timeout = timeout

    @staticmethod
    def _escape_label(value: str) -> str:
        """Escape ``value`` for use inside a double-quoted PromQL label matcher."""
        return value.replace("\\", "\\\\").replace('"', '\\"')

    @staticmethod
    def _summarize(samples) -> dict:
        """Reduce ``[timestamp, "value"]`` samples to ``{"max", "avg"}``.

        Non-finite samples are skipped: a ratio query yields ``NaN`` while the
        denominator is zero, and one ``NaN`` would otherwise poison both results.
        Returns ``{}`` when no usable sample remains.
        """
        import math

        values = [float(sample[1]) for sample in samples]
        values = [v for v in values if math.isfinite(v)]
        if not values:
            return {}
        return {"max": max(values), "avg": sum(values) / len(values)}

    def _query_range(self, promql: str, start_ts: float, end_ts: float) -> dict:
        """Run one range query at 60 s resolution and summarise its first series.

        Raises:
            requests.HTTPError: if Prometheus rejects the query (4xx/5xx).
        """
        response = requests.get(
            # Tolerate a base URL configured with a trailing slash.
            f"{self.url.rstrip('/')}/api/v1/query_range",
            params={
                "query": promql,
                "start": start_ts,
                "end": end_ts,
                "step": "60",
            },
            timeout=self.timeout,
        )
        response.raise_for_status()
        result = response.json().get("data", {}).get("result", [])
        # Only the first series is read, as in the original implementation.
        return self._summarize(result[0]["values"]) if result else {}

    def get_incident_metrics(self, service: str, start: str, end: str) -> dict:
        """Get key metrics during incident window.

        Args:
            service: Value matched against the ``job`` label of the HTTP metrics.
            start: ISO 8601 start of the window; a trailing ``Z`` is accepted.
            end: ISO 8601 end of the window; a trailing ``Z`` is accepted.

        Returns:
            Dict with ``error_rate``, ``p99_latency_ms`` and ``request_rate``;
            each is ``{"max": float, "avg": float}`` or ``{}`` when no data exists.
        """
        start_ts = datetime.fromisoformat(start.replace("Z", "+00:00")).timestamp()
        end_ts = datetime.fromisoformat(end.replace("Z", "+00:00")).timestamp()
        job = self._escape_label(service)

        queries = {
            "error_rate": (
                f'sum(rate(http_requests_total{{job="{job}",code=~"5.."}}[5m])) / '
                f'sum(rate(http_requests_total{{job="{job}"}}[5m]))'
            ),
            "p99_latency_ms": (
                "histogram_quantile(0.99, "
                f'sum(rate(http_request_duration_seconds_bucket{{job="{job}"}}[5m])) by (le)) * 1000'
            ),
            "request_rate": f'sum(rate(http_requests_total{{job="{job}"}}[5m]))',
        }
        return {
            name: self._query_range(promql, start_ts, end_ts)
            for name, promql in queries.items()
        }

Step 4 — Postmortem Generator

python

# generator.py
import anthropic
import json
from pagerduty_client import PagerDutyClient, IncidentData
from slack_client import SlackFetcher
from metrics_collector import MetricsCollector
 
# Module-level Anthropic client shared by every generate_postmortem() call.
# NOTE(review): constructed with no arguments, so it presumably picks up
# ANTHROPIC_API_KEY from the environment (the Usage section exports it) - confirm.
client = anthropic.Anthropic()
 
# System prompt passed as `system=` in generate_postmortem(). It fixes the section
# layout of the generated document and asks for a blameless tone.
POSTMORTEM_PROMPT = """You are an experienced SRE writing a blameless postmortem.
Using the incident data provided, write a complete postmortem with these sections:
 
## Executive Summary
2-3 sentences: what happened, impact, duration.
 
## Timeline
Chronological events in table format: | Time | Event |
 
## Root Cause
What actually caused the incident. Use 5 Whys if appropriate.
 
## Contributing Factors
What made this worse or harder to detect/resolve.
 
## Impact
- Users/customers affected
- Services affected
- Error budget consumed (if SLO data available)
 
## Action Items
Table: | Priority | Action | Owner | Due Date |
Priority: P1 (immediate), P2 (this sprint), P3 (this quarter)
 
## Lessons Learned
What we learned that we didn't know before.
 
Write in a blameless tone — systems failed, not people. Be specific and actionable."""
 
 
def generate_postmortem(
    incident: IncidentData,
    slack_messages: list,
    metrics: dict,
    team_name: str = "on-call team",
    max_slack_messages: int = 50,
) -> str:
    """Draft a blameless postmortem for ``incident`` with Claude.

    Args:
        incident: Incident record with its notes and alerts.
        slack_messages: Chronological incident-channel messages.
        metrics: Mapping with optional ``error_rate``, ``p99_latency_ms`` and
            ``request_rate`` entries, each holding ``max`` / ``avg`` values.
        team_name: Responding team, passed to the model as context.
        max_slack_messages: Upper bound on Slack messages sent to the model;
            the earliest ones are kept.

    Returns:
        The postmortem as Markdown text.

    Raises:
        RuntimeError: if the model response contains no text block.
    """
    def dump(obj) -> str:
        # Keep non-ASCII text readable instead of \\u-escaping it.
        return json.dumps(obj, indent=2, ensure_ascii=False)

    shown = slack_messages[:max_slack_messages]
    slack_header = "SLACK MESSAGES (incident channel):"
    if len(shown) < len(slack_messages):
        # Tell the model the conversation is incomplete rather than truncating silently.
        slack_header = (
            f"SLACK MESSAGES (incident channel, first {len(shown)} "
            f"of {len(slack_messages)}):"
        )

    context = "\n".join([
        "INCIDENT DATA:",
        f"ID: {incident.id}",
        f"Title: {incident.title}",
        f"Service: {incident.service}",
        f"Urgency: {incident.severity}",
        f"Responding team: {team_name}",
        f"Duration: {incident.duration_minutes} minutes",
        f"Started: {incident.started_at}",
        f"Resolved: {incident.resolved_at}",
        f"Triggered by: {incident.triggered_by}",
        "",
        "PAGERDUTY ALERTS:",
        dump(incident.alerts),
        "",
        "PAGERDUTY NOTES:",
        dump(incident.notes),
        "",
        slack_header,
        dump(shown),
        "",
        "METRICS DURING INCIDENT:",
        f"Error rate peak: {metrics.get('error_rate', {}).get('max', 'N/A')}",
        f"P99 latency peak: {metrics.get('p99_latency_ms', {}).get('max', 'N/A')} ms",
        f"Request rate: {metrics.get('request_rate', {}).get('avg', 'N/A')} req/s",
    ])

    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=4096,
        system=POSTMORTEM_PROMPT,
        messages=[{"role": "user", "content": context}],
    )

    # Concatenate every text block instead of assuming the first block is text.
    text = "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )
    if not text:
        raise RuntimeError("Claude returned no text content for the postmortem")
    if getattr(response, "stop_reason", None) == "max_tokens":
        # Make a cut-off draft obvious to the reviewer.
        text += (
            "\n\n> **Note:** this draft was truncated at the model's output limit; "
            "the final sections may be incomplete."
        )
    return text

Step 5 — CLI

python

# main.py
import click
import os
from generator import generate_postmortem
from pagerduty_client import PagerDutyClient
from slack_client import SlackFetcher
from metrics_collector import MetricsCollector
 
 
@click.command()
@click.option("--incident-id", required=True, help="PagerDuty incident ID")
@click.option("--slack-channel", default="C01234567", help="Slack channel ID for #incidents")
@click.option("--output", default="postmortem.md", help="Output file")
def generate(incident_id, slack_channel, output):
    """Generate a postmortem for a PagerDuty incident."""
    # NOTE: click shows the docstring above as the --help text, so it stays short.
    # Reads PAGERDUTY_API_KEY and SLACK_TOKEN (required) and PROMETHEUS_URL
    # (optional, defaults to http://localhost:9090) from the environment.

    # Fail with a readable usage error rather than a KeyError traceback.
    missing = [
        name for name in ("PAGERDUTY_API_KEY", "SLACK_TOKEN")
        if not os.environ.get(name)
    ]
    if missing:
        raise click.UsageError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    pd = PagerDutyClient(os.environ["PAGERDUTY_API_KEY"])
    slack = SlackFetcher(os.environ["SLACK_TOKEN"])
    metrics = MetricsCollector(os.environ.get("PROMETHEUS_URL", "http://localhost:9090"))

    click.echo(f"Fetching incident {incident_id}...")
    incident = pd.get_incident(incident_id)

    click.echo(f"Fetching Slack messages from {incident.started_at} to {incident.resolved_at}...")
    messages = slack.get_incident_messages(slack_channel, incident.started_at, incident.resolved_at)

    click.echo("Fetching metrics...")
    metric_data = metrics.get_incident_metrics(incident.service, incident.started_at, incident.resolved_at)

    click.echo("Generating postmortem with Claude...")
    postmortem = generate_postmortem(incident, messages, metric_data)

    # Create the target directory if needed, so a path such as
    # postmortems/<name>.md does not fail after all the API calls have run.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Write UTF-8 explicitly; the platform default encoding may not cover the text.
    with open(output, "w", encoding="utf-8") as f:
        f.write(postmortem)

    click.echo(f"✅ Postmortem saved to {output}")
    click.echo(f"Duration: {incident.duration_minutes} minutes | Service: {incident.service}")


if __name__ == "__main__":
    generate()

Usage

bash

# main.py reads PAGERDUTY_API_KEY and SLACK_TOKEN from the environment.
export PAGERDUTY_API_KEY="your-key"
export SLACK_TOKEN="xoxb-your-token"
export ANTHROPIC_API_KEY="your-key"
# PROMETHEUS_URL is optional; main.py falls back to http://localhost:9090 when it is unset.
 
# Generate postmortem
python main.py --incident-id P1ABC23 --output postmortems/2026-06-09-payment-outage.md
 
# View output
cat postmortems/2026-06-09-payment-outage.md

The draft postmortem is ready in a couple of minutes instead of the hours it takes to write one by hand. The team reviews it, adjusts the root-cause attribution and action items where needed, and publishes. The AI does the synthesis; the humans validate and own the action items.

Build an AI-Powered Incident Postmortem Generator with Claude

What It Generates

Architecture

Step 1 — PagerDuty Data Fetcher

Step 2 — Slack Message Fetcher

Step 3 — Metrics Context

Step 4 — Postmortem Generator

Step 5 — CLI

Usage

Stay ahead of the curve

Related Articles

AI-Powered Kubernetes Anomaly Detection: Beyond Static Thresholds

AI-Powered Log Analysis Is Replacing Manual Debugging in DevOps (2026)

AI-Powered Log Analysis — How LLMs Are Replacing grep for DevOps Engineers

Comments