Build an AI Deployment Readiness Checker for Kubernetes
Build a Python tool that analyzes your Kubernetes manifests before deployment, detects misconfigurations, missing limits, security issues, and gives a go/no-go decision using Claude.
Most deployment failures are preventable. Missing resource limits, no liveness probes, privileged containers, wrong image tags — these are detectable before deployment. This tool catches them automatically.
What It Does
kubectl apply -f manifests/ → runs readiness check first:
🔍 Analyzing deployment: my-app
───────────────────────────────
✅ Resource limits set
✅ Liveness probe configured
⚠️ No readiness probe — service may receive traffic before app is ready
❌ Container running as root (securityContext.runAsNonRoot not set)
❌ Image uses 'latest' tag — not reproducible
SCORE: 55/100 — ❌ NOT READY TO DEPLOY
Claude Analysis:
"Two critical issues found. Running as root violates least-privilege
and can lead to container escape. The 'latest' tag means you can't
guarantee what's being deployed — pin to a specific SHA or version tag.
Fix these before deploying to production."
Setup
pip install anthropic pyyaml kubernetes rich
Step 1: Parse Kubernetes Manifests
# manifest_parser.py
import yaml
from pathlib import Path
def load_manifests(path: str) -> list[dict]:
    """Load all YAML manifests from a file or directory.

    Args:
        path: A single manifest file, or a directory that is searched
            recursively for ``*.yaml`` and ``*.yml`` files.

    Returns:
        Every non-empty mapping document found. Multi-document files
        contribute one entry per document; empty documents and documents
        that are not mappings (bare lists, strings, ...) are dropped.

    Raises:
        OSError: If ``path`` (or a matched file) cannot be opened.
        yaml.YAMLError: If a file is not valid YAML.
    """
    root = Path(path)
    if root.is_dir():
        # glob() also matches *directories* whose names end in .yaml/.yml;
        # keep regular files only, sorted so the report order is stable.
        candidates = [*root.glob("**/*.yaml"), *root.glob("**/*.yml")]
        files = sorted(f for f in candidates if f.is_file())
    else:
        files = [root]

    manifests: list[dict] = []
    for file in files:
        with open(file, encoding="utf-8") as f:
            # safe_load_all handles files holding several `---` documents.
            manifests.extend(
                doc for doc in yaml.safe_load_all(f) if doc and isinstance(doc, dict)
            )
    return manifests
def extract_deployable(manifests: list[dict]) -> list[dict]:
    """Filter manifests down to the workload kinds this tool checks.

    Keeps Deployments, StatefulSets, DaemonSets, Jobs and CronJobs, in their
    original order. Entries that are not mappings (for example a YAML
    document holding a bare list or string) are skipped rather than raising
    ``AttributeError``.

    Args:
        manifests: Parsed YAML documents.

    Returns:
        The subset of ``manifests`` whose ``kind`` is one of the workload
        kinds listed above.
    """
    kinds = {"Deployment", "StatefulSet", "DaemonSet", "Job", "CronJob"}
    return [m for m in manifests if isinstance(m, dict) and m.get("kind") in kinds]


# Step 2: Run Checks
# checker.py
from dataclasses import dataclass
@dataclass
class CheckResult:
    """Outcome of a single readiness check."""

    # What was checked: "<resource>/<container>" for container-level checks,
    # or just the resource name for resource-level checks.
    name: str
    # True when the check succeeded, False when it found a problem.
    passed: bool
    # One of: critical / warning / info
    severity: str
    # Human-readable description of the finding.
    message: str
def _pod_spec(manifest: dict) -> dict:
    """Return the pod spec embedded in a workload manifest ({} if missing)."""
    spec = manifest.get("spec") or {}
    if manifest.get("kind") == "CronJob":
        # CronJob wraps the Job spec in jobTemplate, one level above the
        # pod template.
        spec = (spec.get("jobTemplate") or {}).get("spec") or {}
    return (spec.get("template") or {}).get("spec") or {}


def _has_pinned_tag(image: str) -> bool:
    """Tell whether an image reference names a digest or a non-'latest' tag."""
    if "@" in image:
        # Digest references (repo@sha256:...) are immutable by definition.
        return True
    # Only the last path segment can carry a tag; a colon earlier in the
    # reference is a registry port (e.g. localhost:5000/app).
    last_segment = image.rsplit("/", 1)[-1]
    _, sep, tag = last_segment.partition(":")
    return bool(sep) and tag not in ("", "latest")


def check_manifest(manifest: dict) -> list[CheckResult]:
    """Run the readiness checks against one workload manifest.

    Every container (regular and init) is checked for resource limits and
    requests, a pinned image tag, and its security context. Liveness and
    readiness probes are only required where Kubernetes honours them:
    regular containers and sidecar-style init containers
    (``restartPolicy: Always``). Deployments get an extra replica check.

    Args:
        manifest: A parsed Deployment, StatefulSet, DaemonSet, Job or
            CronJob. Missing or ``null`` sections are treated as empty.

    Returns:
        One CheckResult per finding. Every failed check produces a result;
        only the resource-limit and image-tag checks also record a pass.
    """
    results = []

    def fail(target: str, severity: str, message: str) -> None:
        results.append(CheckResult(name=target, passed=False, severity=severity, message=message))

    def ok(target: str, message: str) -> None:
        results.append(CheckResult(name=target, passed=True, severity="info", message=message))

    kind = manifest.get("kind", "")
    name = (manifest.get("metadata") or {}).get("name", "unknown")
    pod_spec = _pod_spec(manifest)
    pod_sec_ctx = pod_spec.get("securityContext") or {}

    # Regular containers first, then init containers, each tagged with
    # whether it is an init container.
    containers = [(c, False) for c in pod_spec.get("containers") or []]
    containers += [(c, True) for c in pod_spec.get("initContainers") or []]

    for container, is_init in containers:
        if not isinstance(container, dict):
            continue
        target = f"{name}/{container.get('name', 'unknown')}"

        # Check 1: Resource limits and requests
        resources = container.get("resources") or {}
        limits = resources.get("limits") or {}
        requests = resources.get("requests") or {}
        if not limits.get("cpu") or not limits.get("memory"):
            fail(target, "critical", "No resource limits set — can starve other pods or cause OOM on node")
        else:
            ok(target, "Resource limits set")
        if not requests.get("cpu") or not requests.get("memory"):
            fail(target, "warning", "No resource requests — scheduler can't make accurate placement decisions")

        # Check 2: Image tag
        image = container.get("image") or ""
        if _has_pinned_tag(image):
            ok(target, "Image tag pinned")
        else:
            fail(
                target,
                "critical",
                f"Image '{image}' uses latest/no tag — not reproducible, may deploy unexpected version",
            )

        # Checks 3 & 4: Probes. Plain init containers run to completion and
        # do not support probes, so requiring them would be a false positive.
        if not is_init or container.get("restartPolicy") == "Always":
            if not container.get("livenessProbe"):
                fail(target, "warning", "No liveness probe — Kubernetes won't restart hung containers")
            if not container.get("readinessProbe"):
                fail(target, "warning", "No readiness probe — service may route traffic before app is ready")

        # Check 5: Security context. runAsNonRoot set on the pod applies to
        # every container unless the container overrides it.
        sec_ctx = container.get("securityContext") or {}
        run_as_non_root = sec_ctx.get("runAsNonRoot")
        if run_as_non_root is None:
            run_as_non_root = pod_sec_ctx.get("runAsNonRoot")
        if not run_as_non_root:
            fail(target, "critical", "Container may run as root — set securityContext.runAsNonRoot: true")
        if sec_ctx.get("privileged"):
            fail(
                target,
                "critical",
                "Container is privileged — highest security risk, avoid unless absolutely necessary",
            )
        if not sec_ctx.get("readOnlyRootFilesystem"):
            fail(target, "warning", "Root filesystem is writable — set readOnlyRootFilesystem: true")

    # Check 6: Replicas
    if kind == "Deployment":
        replicas = (manifest.get("spec") or {}).get("replicas")
        if replicas is None:
            # An omitted (or null) replica count means a single replica.
            replicas = 1
        if replicas < 2:
            fail(name, "warning", f"Only {replicas} replica — no high availability")
    return results
def calculate_score(
    results: "list[CheckResult]",
    *,
    critical_penalty: int = 20,
    warning_penalty: int = 5,
) -> int:
    """Score out of 100.

    Starts at 100 and subtracts a penalty for every failed check; passed
    checks and failed "info" checks do not affect the score.

    Args:
        results: Check results to score. An empty list scores 100.
        critical_penalty: Points removed per failed critical check.
        warning_penalty: Points removed per failed warning check.

    Returns:
        The score, clamped so it never drops below 0.
    """
    failed = [r for r in results if not r.passed]
    critical = sum(1 for r in failed if r.severity == "critical")
    warnings = sum(1 for r in failed if r.severity == "warning")
    score = 100 - critical * critical_penalty - warnings * warning_penalty
    return max(0, score)


# Step 3: Claude Analysis
# analyzer.py
import anthropic
import json
import os
# Module-level Anthropic client, created at import time; the API key is
# taken from the ANTHROPIC_API_KEY environment variable.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
def analyze_issues(manifest_name: str, issues: list, score: int) -> str:
    """Ask Claude for a short explanation of the failed checks.

    Args:
        manifest_name: Name of the resource; quoted in the prompt.
        issues: Check results (objects with ``passed``, ``severity``,
            ``name`` and ``message``). Entries that passed are ignored.
        score: Readiness score reported in the prompt.

    Returns:
        Claude's explanation, or a fixed "all checks passed" message when
        nothing in ``issues`` failed (no API call is made in that case).
    """
    # Filter first so a list holding only passed results never reaches the
    # API with an empty "Issues found" section.
    failed = [r for r in issues if not r.passed]
    if not failed:
        return "All checks passed. Manifest looks production-ready."

    issues_text = "\n".join(
        f"- [{r.severity.upper()}] {r.name}: {r.message}" for r in failed
    )
    prompt = (
        f"Kubernetes deployment readiness check for '{manifest_name}' scored {score}/100.\n"
        "Issues found:\n"
        f"{issues_text}\n"
        "In 3-4 sentences: explain the most critical issues and what could go wrong "
        "in production if they're not fixed. Be specific and actionable."
    )
    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=400,
        messages=[{"role": "user", "content": prompt}],
    )
    # The reply is a list of content blocks; join the text ones instead of
    # assuming the first block exists and is text.
    return "".join(
        block.text for block in response.content if getattr(block, "type", None) == "text"
    )


# Step 4: Main CLI Tool
# main.py
import sys
import argparse
from rich.console import Console
from rich.table import Table
from rich import print as rprint
from manifest_parser import load_manifests, extract_deployable
from checker import check_manifest, calculate_score
from analyzer import analyze_issues
# Shared rich console used for all terminal output of the CLI.
console = Console()
def main():
    """Command-line entry point: check every workload under a path.

    Parses the CLI arguments, loads the manifests, prints a table of failed
    checks and a score for each workload, and optionally asks Claude to
    explain the failures.

    Exits with status 0 when every workload scores at least ``--min-score``
    and 1 otherwise, so the tool can gate a CI/CD pipeline. Returns normally
    (without calling ``sys.exit``) when no checkable workloads are found.
    """
    parser = argparse.ArgumentParser(description="Kubernetes Deployment Readiness Checker")
    parser.add_argument("path", help="Path to manifest file or directory")
    parser.add_argument("--min-score", type=int, default=70, help="Minimum score to pass (default: 70)")
    parser.add_argument("--no-ai", action="store_true", help="Skip Claude analysis")
    args = parser.parse_args()

    manifests = load_manifests(args.path)
    deployable = extract_deployable(manifests)
    if not deployable:
        console.print("[yellow]No deployable resources found[/yellow]")
        return

    all_passed = True
    for manifest in deployable:
        name = manifest.get("metadata", {}).get("name", "unknown")
        kind = manifest.get("kind")
        console.print(f"\n[bold]🔍 Analyzing {kind}: {name}[/bold]")
        console.print("─" * 50)

        results = check_manifest(manifest)
        score = calculate_score(results)
        issues = [r for r in results if not r.passed]

        # Display results table (failed checks only)
        table = Table(show_header=True)
        table.add_column("Status", width=6)
        table.add_column("Severity", width=10)
        table.add_column("Issue")
        for r in issues:
            status = "❌" if r.severity == "critical" else "⚠️ "
            table.add_row(status, r.severity.upper(), r.message)
        console.print(table)

        # Score
        color = "green" if score >= 80 else ("yellow" if score >= 60 else "red")
        verdict = "✅ READY" if score >= args.min_score else "❌ NOT READY"
        console.print(f"\n[{color}]SCORE: {score}/100 — {verdict}[/{color}]")
        if score < args.min_score:
            all_passed = False

        # Claude analysis
        if not args.no_ai and issues:
            console.print("\n[bold]Claude Analysis:[/bold]")
            try:
                analysis = analyze_issues(name, issues, score)
            except Exception as exc:
                # The analysis is advisory: an API or network failure must
                # not discard the verdicts or change the exit code.
                console.print(f"Claude analysis unavailable: {exc}", style="yellow", markup=False)
            else:
                # Model output is free text; print it verbatim so square
                # brackets are not interpreted as rich markup.
                console.print(analysis, markup=False)

    # Exit code for CI/CD integration
    sys.exit(0 if all_passed else 1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


# Use in CI/CD Pipeline
# .github/workflows/deploy.yml
- name: Check deployment readiness
  # Installs the checker's dependencies, then runs it against the
  # production manifests; a non-zero exit status fails this step.
  run: |
    pip install anthropic pyyaml rich
    python readiness_checker/main.py ./k8s/production/ --min-score 80
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# Job fails if score < 80, blocking the deployment
Catch misconfigurations before they reach production. Add this to your CI pipeline and your team will never deploy a container running as root or missing resource limits again.
Get your Anthropic API key — Claude Haiku makes the AI analysis fast and cheap (< $0.01 per check).
Today I Fixed
Short real fixes from production — posted daily
Stay ahead of the curve
Get the latest DevOps, Kubernetes, AWS, and AI/ML guides delivered straight to your inbox. No spam — just practical engineering content.
Related Articles
Build an AI-Powered DevOps Chatbot with Streamlit on Kubernetes
Build a DevOps assistant chatbot that answers infrastructure questions, generates kubectl commands, and explains errors — deployed as a Streamlit app on Kubernetes.
Build LLM-Powered Runbook Automation with Haystack and Kubernetes
Turn your static runbooks into an AI system that answers 'what do I do when X happens' with step-by-step instructions retrieved from your actual documentation.
Build a Natural Language kubectl — Ask Questions to Your Cluster
Build a CLI tool that lets you describe what you want in plain English and generates the correct kubectl command — powered by Claude API.