Build an AI DevOps Assistant with MCP and Claude API
Use the Model Context Protocol (MCP) with Claude API to build a DevOps assistant that can read Kubernetes state, run Terraform commands, and query metrics via natural language.
MCP (Model Context Protocol) is an open standard that lets Claude connect to external data sources and tools. Instead of building custom tool integrations, you write an MCP server and Claude connects to it automatically. Here's how to build a DevOps MCP server.
What is MCP?
MCP lets you expose capabilities to Claude through a standardized interface:
- Resources — data Claude can read (cluster state, config files, dashboards)
- Tools — actions Claude can execute (run kubectl, apply terraform, query prometheus)
- Prompts — reusable templates for common tasks
Claude Desktop, the Claude API, and Claude Code can all use MCP servers.
Setup
# MCP SDK
pip install mcp anthropic
# DevOps tools
pip install kubernetes prometheus-api-client
Build the DevOps MCP Server
#!/usr/bin/env python3
"""
DevOps MCP Server - exposes Kubernetes and monitoring tools to Claude
"""
import asyncio
import json
import shlex
import subprocess
from typing import Any

from kubernetes import client, config
from mcp.server import NotificationOptions, Server
from mcp.server.models import InitializationOptions
from mcp.server.stdio import stdio_server
from mcp.types import (
    INTERNAL_ERROR,
    INVALID_PARAMS,
    EmbeddedResource,
    Resource,
    TextContent,
    Tool,
)
# Initialize the Kubernetes client configuration: try the in-cluster loader
# first and fall back to the local kubeconfig when it reports that no usable
# in-cluster configuration exists. Only the loader's own ConfigException
# triggers the fallback, so unrelated failures are not silently masked.
try:
    config.load_incluster_config()
except config.ConfigException:
    config.load_kube_config()

# API handles shared by the resource and tool handlers in this module.
k8s_core = client.CoreV1Api()
k8s_apps = client.AppsV1Api()

# The MCP server instance that the handler decorators register against.
server = Server("devops-assistant")
# ── RESOURCES ──────────────────────────────────────────────
@server.list_resources()
async def list_resources() -> list[Resource]:
    """Advertise the JSON resources this server can serve.

    Returns:
        One Resource entry per supported ``k8s://`` URI, each declared with
        the ``application/json`` MIME type.
    """
    # (uri, display name, description) for every published resource.
    catalog = (
        (
            "k8s://cluster/overview",
            "Kubernetes Cluster Overview",
            "Current state of all namespaces and key resources",
        ),
        (
            "k8s://nodes",
            "Kubernetes Nodes",
            "Node status and resource capacity",
        ),
    )
    return [
        Resource(
            uri=resource_uri,
            name=title,
            description=summary,
            mimeType="application/json",
        )
        for resource_uri, title, summary in catalog
    ]
@server.read_resource()
async def read_resource(uri: str) -> str:
    """Return the JSON document behind one of the ``k8s://`` resource URIs.

    Args:
        uri: Resource URI; ``k8s://cluster/overview`` and ``k8s://nodes`` are
            supported.

    Returns:
        A JSON string (2-space indent). The overview holds every namespace
        name plus running/total pod counts for the first ten namespaces; the
        node document holds one entry per node with name, readiness, CPU and
        memory capacity, and up to five labels.

    Raises:
        ValueError: If ``uri`` is not one of the supported URIs.
    """
    # NOTE(review): the MCP SDK appears to pass a pydantic URL object rather
    # than a plain str, and such an object never compares equal to a string
    # literal. Normalising through str() makes both forms work; the trailing
    # slash is dropped so "k8s://nodes/" is accepted as well.
    target = str(uri).rstrip("/")
    max_summarised = 10  # cap the per-namespace pod listing to bound output size

    if target == "k8s://cluster/overview":
        ns_names = [ns.metadata.name for ns in k8s_core.list_namespace().items]
        summary = {}
        for ns in ns_names[:max_summarised]:
            try:
                pods = k8s_core.list_namespaced_pod(ns).items
            except Exception as exc:
                # Best effort: one unreadable namespace must not fail the whole
                # overview, but record why it is missing instead of hiding it.
                summary[ns] = {"error": str(exc)}
                continue
            summary[ns] = {
                "pods_running": sum(1 for pod in pods if pod.status.phase == "Running"),
                "pods_total": len(pods),
            }
        return json.dumps({"namespaces": ns_names, "summary": summary}, indent=2)

    if target == "k8s://nodes":
        node_info = []
        for node in k8s_core.list_node().items:
            conditions = {c.type: c.status for c in (node.status.conditions or [])}
            capacity = node.status.capacity or {}
            labels = node.metadata.labels or {}
            node_info.append({
                "name": node.metadata.name,
                # Condition statuses are compared as the string "True".
                "ready": conditions.get("Ready") == "True",
                "cpu": capacity.get("cpu"),
                "memory": capacity.get("memory"),
                # Only the first five labels are kept to limit output size.
                "labels": dict(list(labels.items())[:5]),
            })
        return json.dumps(node_info, indent=2)

    raise ValueError(f"Unknown resource: {uri}")
# ── TOOLS ──────────────────────────────────────────────────
@server.list_tools()
async def list_tools() -> list[Tool]:
    """Describe every tool this server offers, with its JSON input schema.

    Returns:
        Tool definitions for kubectl_get, kubectl_describe, kubectl_logs,
        terraform_plan and run_safe_command, in that order.
    """

    def text(description=None):
        # A string-typed property, optionally carrying a human-readable hint.
        spec = {"type": "string"}
        if description is not None:
            spec["description"] = description
        return spec

    def tool(name, description, properties, required):
        # Every tool takes one JSON object; only its fields and required keys vary.
        schema = {"type": "object", "properties": properties, "required": required}
        return Tool(name=name, description=description, inputSchema=schema)

    kubectl_get = tool(
        "kubectl_get",
        "Run kubectl get to list Kubernetes resources",
        {
            "resource": text("Resource type (pods, deployments, services, etc.)"),
            "namespace": text("Namespace (default: all namespaces)"),
            "label_selector": text("Label selector like 'app=myapp'"),
        },
        ["resource"],
    )
    kubectl_describe = tool(
        "kubectl_describe",
        "Describe a specific Kubernetes resource",
        {
            "resource": text(),
            "name": text(),
            "namespace": text(),
        },
        ["resource", "name"],
    )
    kubectl_logs = tool(
        "kubectl_logs",
        "Get logs from a pod",
        {
            "pod_name": text(),
            "namespace": {"type": "string", "default": "default"},
            "tail": {"type": "integer", "default": 50},
            "container": text(),
        },
        ["pod_name"],
    )
    terraform_plan = tool(
        "terraform_plan",
        "Run terraform plan in a directory and return the output",
        {
            "directory": text("Path to terraform directory"),
            "var_file": text("Optional var file path"),
        },
        ["directory"],
    )
    run_safe_command = tool(
        "run_safe_command",
        "Run a read-only shell command (no writes, no destructive ops)",
        {
            "command": text("Command to run (read-only operations only)"),
        },
        ["command"],
    )
    return [
        kubectl_get,
        kubectl_describe,
        kubectl_logs,
        terraform_plan,
        run_safe_command,
    ]
# Substrings that make run_safe_command refuse a command outright; they are
# matched case-insensitively against the raw command text.
_BLOCKED_SUBSTRINGS = ("rm ", "delete", "drop ", "truncate", "format", "> /", "dd if")

# Commands are executed without a shell, so these operators would be handed to
# the program as literal arguments. Reject them with a clear message instead.
_SHELL_OPERATORS = frozenset({"|", "||", "&", "&&", ";", ">", ">>", "<", "<<"})

# Executables run_safe_command may start. The value is the set of accepted
# first sub-commands for multi-purpose CLIs, or None when any arguments are
# acceptable. A substring blocklist alone is trivially bypassed (mv, chmod,
# "kubectl apply", ...), so anything not listed here is refused.
_SAFE_EXECUTABLES = {
    "cat": None,
    "ls": None,
    "head": None,
    "tail": None,
    "grep": None,
    "wc": None,
    "df": None,
    "du": None,
    "ps": None,
    "uptime": None,
    "whoami": None,
    "uname": None,
    "pwd": None,
    "id": None,
    "free": None,
    "echo": None,
    "kubectl": frozenset({
        "get", "describe", "logs", "top", "version",
        "explain", "api-resources", "api-versions",
    }),
    "terraform": frozenset({"version", "validate", "show", "output"}),
}


def _positional(arguments: dict, key: str) -> str:
    """Return arguments[key] as text, refusing empty or flag-like values.

    Raises KeyError when the key is absent and ValueError when the value is
    empty or starts with "-", which the CLI would otherwise parse as an option.
    """
    value = str(arguments[key])
    if not value or value.startswith("-"):
        raise ValueError(f"invalid value for '{key}': {value!r}")
    return value


def _parse_safe_command(command: str) -> list[str]:
    """Split a command string into argv, raising ValueError unless it is allowed."""
    lowered = command.lower()
    if any(fragment in lowered for fragment in _BLOCKED_SUBSTRINGS):
        raise ValueError("Destructive commands are not allowed")
    argv = shlex.split(command)
    if not argv:
        raise ValueError("Empty command")
    if _SHELL_OPERATORS.intersection(argv):
        raise ValueError("Shell operators (pipes, redirects, chaining) are not supported")
    program = argv[0]
    if program not in _SAFE_EXECUTABLES:
        raise ValueError(f"'{program}' is not on the read-only allow-list")
    subcommands = _SAFE_EXECUTABLES[program]
    if subcommands is not None and (len(argv) < 2 or argv[1] not in subcommands):
        allowed = ", ".join(sorted(subcommands))
        raise ValueError(f"'{program}' only accepts these sub-commands: {allowed}")
    return argv


@server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    """Execute one of the advertised tools and return its output as text.

    Args:
        name: Tool name: kubectl_get, kubectl_describe, kubectl_logs,
            terraform_plan or run_safe_command.
        arguments: Tool arguments, keyed as declared in the tool's input schema.

    Returns:
        A single-element list holding the (truncated) command output, or a
        message starting with "Error:" / "Unknown tool:" / "Command timed out".
        Failures are reported as text rather than raised.
    """
    timeout = 30           # seconds; terraform_plan raises this to 120
    cwd = None             # working directory; only terraform_plan sets it
    limit = 5000           # maximum number of characters returned
    merge_streams = False  # True: stdout + stderr; False: stdout, else stderr

    try:
        if name == "kubectl_get":
            argv = ["kubectl", "get", _positional(arguments, "resource"), "-o", "wide"]
            if arguments.get("namespace"):
                argv += ["-n", arguments["namespace"]]
            else:
                argv.append("-A")  # no namespace given: list across all namespaces
            if arguments.get("label_selector"):
                argv += ["-l", arguments["label_selector"]]
        elif name == "kubectl_describe":
            argv = [
                "kubectl", "describe",
                _positional(arguments, "resource"),
                _positional(arguments, "name"),
            ]
            if arguments.get("namespace"):
                argv += ["-n", arguments["namespace"]]
            limit = 8000
        elif name == "kubectl_logs":
            argv = ["kubectl", "logs", _positional(arguments, "pod_name")]
            if arguments.get("namespace"):
                argv += ["-n", arguments["namespace"]]
            if arguments.get("container"):
                argv += ["-c", arguments["container"]]
            argv += ["--tail", str(arguments.get("tail", 50))]
        elif name == "terraform_plan":
            cwd = arguments["directory"]
            # -input=false makes terraform fail instead of prompting for
            # missing variables, which nothing could answer here.
            argv = ["terraform", "plan", "-no-color", "-input=false"]
            if arguments.get("var_file"):
                argv.append(f"-var-file={arguments['var_file']}")
            timeout, limit, merge_streams = 120, 8000, True
        elif name == "run_safe_command":
            argv = _parse_safe_command(arguments["command"])
            merge_streams = True
        else:
            return [TextContent(type="text", text=f"Unknown tool: {name}")]

        # Run in a worker thread so a slow command does not block the event
        # loop. stdin is detached because the server's own stdin carries the
        # MCP stdio transport and must never be read by a child process.
        result = await asyncio.to_thread(
            subprocess.run,
            argv,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
        )
    except subprocess.TimeoutExpired as exc:
        # Report the timeout that actually applied (30 s or 120 s).
        return [TextContent(type="text", text=f"Command timed out after {exc.timeout:g} seconds")]
    except KeyError as exc:
        return [TextContent(type="text", text=f"Error: missing required argument {exc}")]
    except Exception as exc:
        return [TextContent(type="text", text=f"Error: {exc}")]

    if merge_streams:
        output = result.stdout + result.stderr
    else:
        output = result.stdout or result.stderr
    return [TextContent(type="text", text=output[:limit])]
async def main():
    """Run the devops-assistant MCP server over the stdio transport."""
    async with stdio_server() as (read_stream, write_stream):
        await server.run(
            read_stream,
            write_stream,
            InitializationOptions(
                server_name="devops-assistant",
                server_version="1.0.0",
                capabilities=server.get_capabilities(
                    # Must be a NotificationOptions instance: the SDK reads
                    # its *_changed attributes for every registered list
                    # handler, so passing None fails with AttributeError.
                    notification_options=NotificationOptions(),
                    experimental_capabilities={},
                ),
            ),
        )
if __name__ == "__main__":
    asyncio.run(main())
Configure Claude Desktop to Use Your Server
// ~/Library/Application Support/Claude/claude_desktop_config.json
{
"mcpServers": {
"devops-assistant": {
"command": "python",
"args": ["/path/to/devops_mcp_server.py"],
"env": {
"KUBECONFIG": "/path/to/.kube/config"
}
}
}
}
Now in Claude Desktop, you can ask:
- "What pods are failing in the production namespace?"
- "Show me the logs from the nginx pod"
- "Run terraform plan in /infra/prod and tell me if there are any risky changes"
Claude will automatically call your MCP tools and synthesize the results.
Use via Claude API (Programmatic)
import anthropic
import json
client = anthropic.Anthropic()
# Tool definitions passed to the Messages API. Only kubectl_get is declared
# here; its input schema takes a required "resource" and an optional "namespace".
_kubectl_get_input_schema = {
    "type": "object",
    "properties": {
        "resource": {"type": "string"},
        "namespace": {"type": "string"},
    },
    "required": ["resource"],
}

tools = [
    {
        "name": "kubectl_get",
        "description": "List Kubernetes resources",
        "input_schema": _kubectl_get_input_schema,
    },
]
def run_devops_assistant(question: str, *, max_turns: int = 10) -> str:
    """Answer a question, executing tool calls until the model stops asking.

    Relies on the module-level ``client`` and ``tools`` and on an
    ``execute_tool(name, input)`` callable. NOTE(review): ``execute_tool`` is
    not defined in this snippet and must be supplied by the surrounding code;
    its return value is sent back as the tool result content.

    Args:
        question: The user's natural-language question.
        max_turns: Maximum number of model round-trips before giving up, so a
            model that keeps requesting tools cannot loop forever.

    Returns:
        The concatenated text blocks of the model's final response (an empty
        string if that response contains no text).

    Raises:
        RuntimeError: If the model is still requesting tools after
            ``max_turns`` round-trips.
    """
    messages = [{"role": "user", "content": question}]

    for _ in range(max_turns):
        response = client.messages.create(
            model="claude-opus-4-8",
            max_tokens=2000,
            tools=tools,
            messages=messages,
        )

        if response.stop_reason != "tool_use":
            # The first block is not guaranteed to be text, so collect every
            # text block instead of indexing content[0].
            return "".join(
                block.text for block in response.content if block.type == "text"
            )

        # One response may contain several tool_use blocks; each one needs its
        # own tool_result, all returned together in the next user message.
        tool_results = [
            {
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": execute_tool(block.name, block.input),
            }
            for block in response.content
            if block.type == "tool_use"
        ]
        messages.append({"role": "assistant", "content": response.content})
        messages.append({"role": "user", "content": tool_results})

    raise RuntimeError(f"No final answer after {max_turns} tool-use turns")
result = run_devops_assistant("Are there any pods in CrashLoopBackOff state?")
print(result)
MCP makes it trivially easy to extend Claude with your own infrastructure tools. Once the server is written, any MCP-compatible client — Claude Desktop, Claude Code, or the API — can use it.
Resources: MCP docs | Anthropic MCP guide
Today I Fixed
Short real fixes from production — posted daily
Stay ahead of the curve
Get the latest DevOps, Kubernetes, AWS, and AI/ML guides delivered straight to your inbox. No spam — just practical engineering content.
Related Articles
AI-Driven Capacity Planning for Kubernetes Clusters (2026)
How to use AI and machine learning for Kubernetes capacity planning. Covers predictive autoscaling, cost optimization, tools like StormForge and Kubecost, and building custom ML models for resource forecasting.
AI-Powered Kubernetes Anomaly Detection: Beyond Static Thresholds
Static alerts miss 40% of real incidents. Learn how AI and ML-based anomaly detection — using tools like Prometheus + ML, Dynatrace, and custom LLM runbooks — catches what thresholds can't.
Build an AI-Powered Terraform Drift Detection System
Terraform drift happens silently. Here's how to build an automated drift detector using Terraform plan + Claude API that alerts your team and explains exactly what changed.