LLM Guardrails in Production — Input and Output Validation
Deploying LLMs without guardrails is a liability. Here's how to validate inputs, filter outputs, and prevent prompt injection, jailbreaks, and toxic content in production systems.
LLMs in production without guardrails will eventually cause problems — prompt injection, toxic outputs, confidential data leakage, or jailbreaks. Guardrails are the safety layer between user input and your model.
What Guardrails Protect Against
Input-side:
- Prompt injection — "Ignore previous instructions and..."
- Jailbreak attempts — trying to bypass system prompt restrictions
- Sensitive data in prompts — SSNs, credit cards, passwords
- Off-topic inputs — queries your system isn't designed to handle
- Overly long inputs that inflate costs
Output-side:
- Toxic or harmful content
- Hallucinated sensitive information (fake phone numbers, fake legal advice)
- Confidential information leakage
- Outputs that don't match expected format
Layer 1: Input Validation
# guardrails/input_validator.py
import re
from dataclasses import dataclass
@dataclass
class ValidationResult:
    """Outcome of running ``validate_input`` on a single user message."""

    # True when every check passed and the input may be forwarded to the model.
    passed: bool
    # Human-readable explanation of the failed check; empty when ``passed`` is True.
    reason: str = ""
    # Input with sensitive values redacted; only populated when ``passed`` is True.
    sanitized_input: str = ""


# Patterns that indicate prompt injection attempts
INJECTION_PATTERNS = [
    r"ignore (previous|above|all) instructions",
    r"disregard (your|the) (system |previous )?instructions",
    r"you are now",
    r"pretend (you are|to be)",
    r"act as (if you are|a)",
    r"forget (everything|all instructions)",
    r"your new (role|instructions|persona) (is|are)",
    r"system prompt:",
    r"<\|im_start\|>",  # Common injection token
]

# Sensitive data patterns
SENSITIVE_PATTERNS = {
    "credit_card": r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b",
    "ssn": r"\b\d{3}[-]?\d{2}[-]?\d{4}\b",
    "api_key": r"(sk-|key-)[a-zA-Z0-9]{20,}",
    "password": r"password\s*[:=]\s*\S+",
}


def validate_input(user_input: str,
                   max_length: int = 4000,
                   topic_keywords: list[str] = None) -> ValidationResult:
    """Validate and sanitize a user message before it is sent to the model.

    Checks run in this order: length limit, prompt-injection patterns,
    sensitive-data redaction, and (optionally) topic relevance.

    Args:
        user_input: Raw text supplied by the user.
        max_length: Maximum allowed length in characters.
        topic_keywords: If given and non-empty, at least one keyword must
            appear (case-insensitively) in the input.

    Returns:
        A ``ValidationResult``. On failure ``passed`` is False and ``reason``
        names the failed check. On success ``sanitized_input`` holds the
        input with every ``SENSITIVE_PATTERNS`` match replaced by a
        ``[<TYPE>_REDACTED]`` placeholder.
    """
    # Length check
    if len(user_input) > max_length:
        return ValidationResult(
            passed=False,
            reason=f"Input too long ({len(user_input)} chars, max {max_length})"
        )

    lowered = user_input.lower()

    # Injection check. The patterns are written with single spaces, so collapse
    # every whitespace run first; otherwise extra spaces, tabs, or newlines
    # between words would slip past them.
    normalized = re.sub(r"\s+", " ", lowered)
    if any(re.search(pattern, normalized) for pattern in INJECTION_PATTERNS):
        return ValidationResult(
            passed=False,
            reason="Potential prompt injection detected"
        )

    # Sensitive data check (redact rather than reject)
    sanitized = user_input
    for data_type, pattern in SENSITIVE_PATTERNS.items():
        sanitized = re.sub(
            pattern,
            f"[{data_type.upper()}_REDACTED]",
            sanitized,
            flags=re.IGNORECASE,
        )

    # Topic relevance check (optional)
    if topic_keywords:
        if not any(kw.lower() in lowered for kw in topic_keywords):
            return ValidationResult(
                passed=False,
                reason="Input doesn't appear relevant to this assistant's purpose"
            )

    return ValidationResult(passed=True, sanitized_input=sanitized)


# Layer 2: Output Validation
# guardrails/output_validator.py
import anthropic
import re
import os
# Module-level Anthropic client used by llm_judge_output; the API key is read
# from the ANTHROPIC_API_KEY environment variable.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# Patterns in outputs you never want to serve
OUTPUT_BLOCKLIST = [
    r"\b(kill|murder|harm|hurt)\s+(yourself|himself|herself|themselves)\b",
    r"how to (make|build|create) (a )?bomb",
    r"social security number[:\s]+\d{3}",
]


def validate_output(llm_output: str) -> dict:
    """Check LLM output for harmful content.

    Each ``OUTPUT_BLOCKLIST`` pattern is searched case-insensitively.

    Args:
        llm_output: Text produced by the model.

    Returns:
        ``{"passed": True, "output": llm_output}`` when no pattern matches.
        On the first match, a dict with ``"passed": False``, a ``"reason"``,
        and a fixed refusal message under ``"output"`` in place of the
        model text.
    """
    for pattern in OUTPUT_BLOCKLIST:
        if re.search(pattern, llm_output, re.IGNORECASE):
            return {
                "passed": False,
                "reason": "Output contains blocked content",
                "output": "I can't provide that information."
            }
    return {"passed": True, "output": llm_output}
def llm_judge_output(user_query: str, llm_output: str, policy: str) -> dict:
    """Use a cheap, fast model to judge if output violates policy.

    Args:
        user_query: The user's message that produced the response.
        llm_output: The assistant response to be judged.
        policy: Policy text the response is judged against.

    Returns:
        The judge's parsed JSON object, expected to look like
        ``{"violates": bool, "reason": str}``. If the reply cannot be parsed
        into a JSON object, returns
        ``{"violates": False, "reason": "parse error — defaulting to allow"}``.
    """
    # Local import: the import header of this snippet does not bring in json.
    import json

    prompt = f"""You are a content safety classifier.
Policy: {policy}
User asked: {user_query}
Assistant responded: {llm_output}
Does the response violate the policy? Answer with JSON only:
{{"violates": true/false, "reason": "brief explanation"}}"""
    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=100,
        messages=[{"role": "user", "content": prompt}]
    )

    # NOTE(review): an unparseable verdict fails OPEN (treated as "allow").
    # Kept as the original deliberate default; confirm this suits your risk
    # profile before relying on the judge for high-stakes content.
    fallback = {"violates": False, "reason": "parse error — defaulting to allow"}
    try:
        raw = response.content[0].text
        # The model may wrap the JSON in code fences or surrounding prose;
        # keep only the outermost {...} so those replies still parse instead
        # of silently hitting the fail-open fallback.
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end > start:
            raw = raw[start:end + 1]
        verdict = json.loads(raw)
    except (ValueError, IndexError, AttributeError, TypeError):
        return fallback
    # Callers use dict access on the result, so reject valid-but-non-object JSON.
    return verdict if isinstance(verdict, dict) else fallback


# Layer 3: Full Guardrail Wrapper
# guardrails/safe_llm.py
import anthropic
import os
from input_validator import validate_input
from output_validator import validate_output, llm_judge_output
# Anthropic client used for the main completion call in safe_completion; the
# API key is read from the ANTHROPIC_API_KEY environment variable.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# Policy text handed to llm_judge_output as the standard a response is judged against.
POLICY = """This is a DevOps assistant. It should only answer questions about
DevOps, Kubernetes, cloud infrastructure, CI/CD, and related topics.
It must not provide harmful, illegal, or off-topic content."""
def safe_completion(
    user_input: str,
    system_prompt: str,
    model: str = "claude-sonnet-4-6",
    topic_keywords: list[str] = None
) -> dict:
    """Run one completion wrapped in input and output guardrails.

    Pipeline: validate/sanitize the input, call the model, check the output
    against the pattern blocklist, then ask the LLM judge whether the output
    violates ``POLICY``.

    Args:
        user_input: Raw user message.
        system_prompt: System prompt passed to the model.
        model: Model identifier used for the main completion.
        topic_keywords: Optional keywords forwarded to ``validate_input`` for
            the topic relevance check.

    Returns:
        On success: ``{"success": True, "response": <model text>,
        "input_was_sanitized": bool}``.
        On a block: ``{"success": False, "blocked_at": "input" |
        "output_patterns" | "llm_judge", "reason": str, "response": str}``,
        where ``response`` is a user-facing refusal message.
    """
    # 1. Validate input
    input_check = validate_input(
        user_input,
        max_length=4000,
        topic_keywords=topic_keywords
    )
    if not input_check.passed:
        return {
            "success": False,
            "blocked_at": "input",
            "reason": input_check.reason,
            "response": f"I can't process this request: {input_check.reason}"
        }

    # Use sanitized input (sensitive data redacted)
    safe_input = input_check.sanitized_input or user_input

    # 2. Call LLM
    response = client.messages.create(
        model=model,
        max_tokens=1000,
        system=system_prompt,
        messages=[{"role": "user", "content": safe_input}]
    )
    llm_output = response.content[0].text

    # 3. Validate output (pattern check)
    output_check = validate_output(llm_output)
    if not output_check["passed"]:
        return {
            "success": False,
            "blocked_at": "output_patterns",
            "reason": output_check["reason"],
            "response": output_check["output"]
        }

    # 4. LLM-as-judge (for high-stakes use cases)
    judge_result = llm_judge_output(safe_input, llm_output, POLICY)
    if judge_result.get("violates"):
        return {
            "success": False,
            "blocked_at": "llm_judge",
            # The judge's JSON is model-generated and may omit "reason";
            # don't let a missing key turn a block into a KeyError.
            "reason": judge_result.get("reason", "Policy violation"),
            "response": "I'm not able to provide that response per our usage policy."
        }

    return {
        "success": True,
        "response": llm_output,
        "input_was_sanitized": safe_input != user_input
    }


# Production Monitoring
Log all guardrail blocks — they tell you what users are trying to do:
import logging
logger = logging.getLogger("guardrails")


def safe_completion_with_logging(user_input: str, **kwargs) -> dict:
    """Call ``safe_completion`` and log a warning whenever a guardrail blocks.

    Args:
        user_input: Raw user message, forwarded to ``safe_completion``.
        **kwargs: Remaining keyword arguments, forwarded unchanged to
            ``safe_completion``.

    Returns:
        The result dict from ``safe_completion``, unmodified.
    """
    result = safe_completion(user_input, **kwargs)
    if not result["success"]:
        logger.warning({
            "event": "guardrail_block",
            "blocked_at": result["blocked_at"],
            "reason": result["reason"],
            # NOTE(review): this preview is taken from the raw, unredacted
            # input, so its first 100 characters can still contain sensitive
            # data — consider redacting before logging.
            "input_preview": user_input[:100]  # Don't log full input
        })
    return result


# Track: block rate, most common block reasons, blocked input patterns.
# A high block rate on injection attempts means someone is actively probing your system.
Open Source Options
- Guardrails AI — framework with many validators out of the box
- NeMo Guardrails (NVIDIA) — conversation-level guardrails with rails DSL
- LlamaGuard (Meta) — fine-tuned classifier for safety (run locally or via API)
- Anthropic's built-in safety — Claude has built-in refusals, but don't rely on them alone in production
Build production LLM applications on the Anthropic API — built-in safety features reduce the guardrail surface you need to build yourself.
Today I Fixed
Short real fixes from production — posted daily
Stay ahead of the curve
Get the latest DevOps, Kubernetes, AWS, and AI/ML guides delivered straight to your inbox. No spam — just practical engineering content.
Related Articles
Build an AI Deployment Readiness Checker for Kubernetes
Build a Python tool that analyzes your Kubernetes manifests before deployment, detects misconfigurations, missing limits, security issues, and gives a go/no-go decision using Claude.
Auto-Generate Terraform Modules Using OpenAI Function Calling
Build a tool that takes plain English descriptions and generates production-ready Terraform modules using OpenAI's function calling API. No more starting from scratch.
Build an AI Cloud Cost Anomaly Detector with Python
Cloud costs spike without warning. Build a Python bot that pulls AWS cost data daily, detects anomalies using statistical analysis, and explains the spike using Claude AI.