and latency metrics. This satisfies compliance requirements and enables post-incident forensics.
5. Graceful Degradation: If a safety layer fails or times out, the system defaults to a safe state (reject or quarantine) rather than allowing unvalidated output to propagate.
Step-by-Step Implementation
1. Define Safety Policies and Schemas
import { z } from 'zod';
// Input contract: the only request-body shape the guardrail accepts.
// Shared base for fields that must be present and non-empty.
const nonEmptyString = z.string().min(1);

/**
 * Schema for an inbound request body.
 *
 * - `prompt`: 1 to 2000 characters.
 * - `contextId`: must be a UUID string.
 * - `userId`: any non-empty string.
 * - `metadata`: optional string-keyed bag of arbitrary values.
 */
const UserInputSchema = z.object({
  prompt: nonEmptyString.max(2000),
  contextId: z.string().uuid(),
  userId: nonEmptyString,
  metadata: z.record(z.string(), z.unknown()).optional(),
});
// Output contract: structural check applied to model output before any
// semantic validation.
// Closed set of labels a model response may carry.
const OUTPUT_TAGS = ['informational', 'actionable', 'warning', 'error'] as const;

/**
 * Schema for the object produced by the model handler.
 *
 * - `response`: free-form text.
 * - `citations`: optional list of URL strings.
 * - `confidence`: number in the closed range [0, 1].
 * - `tags`: list drawn from {@link OUTPUT_TAGS}.
 */
const ModelOutputSchema = z.object({
  response: z.string(),
  citations: z.string().url().array().optional(),
  confidence: z.number().gte(0).lte(1),
  tags: z.enum(OUTPUT_TAGS).array(),
});
// Safety policy configuration
/**
 * Tunable settings that drive the guardrail for one deployment or tier.
 */
type SafetyPolicy = {
  /**
   * Optional identifier for this policy revision (for example a semver string
   * or a date stamp) so that a deployment can tell which rule set was in
   * force. Optional so that existing policy objects remain valid.
   */
  version?: string;
  /**
   * Upper bound on prompt length (presumably in characters — confirm).
   * NOTE(review): UserInputSchema hardcodes its own 2000-character cap; keep
   * the two values consistent.
   */
  maxPromptLength: number;
  /** Regular expressions tested against the prompt; a match rejects the request. */
  blockedPatterns: RegExp[];
  /**
   * Hostnames treated as trusted.
   * NOTE(review): presumably meant for checking citation URLs — confirm
   * where this is enforced.
   */
  allowedDomains: string[];
  /** Time budget, in milliseconds, given to the semantic check before it is aborted. */
  latencyBudgetMs: number;
  /** When true, the semantic (judge) layer runs after pattern filtering. */
  enableSemanticCheck: boolean;
};
2. Build the Guardrail Middleware
import { NextFunction, Request, Response } from 'express';
/** One verdict recorded by a single safety layer. */
type SafetyDecision = {
  /** Name of the layer that produced the verdict (e.g. 'input-schema'). */
  layer: string;
  decision: 'allow' | 'block' | 'quarantine';
  /** Time the layer took, in milliseconds. */
  latencyMs: number;
  /** Explanation; the middleware in this file sets it on block decisions. */
  reason?: string;
};

/**
 * Per-request state threaded through the guardrail middleware: the request
 * id, the active policy, the validated input, the validated output once it
 * exists, and the ordered list of layer verdicts.
 */
export interface GuardrailContext {
  requestId: string;
  policy: SafetyPolicy;
  input: z.infer<typeof UserInputSchema>;
  output?: z.infer<typeof ModelOutputSchema>;
  safetyDecisions: SafetyDecision[];
}
/**
 * Pre-execution guardrail for an LLM endpoint.
 *
 * Runs the request body through, in order: the input schema, the policy's
 * prompt-length cap, the policy's blocked patterns and, when
 * `policy.enableSemanticCheck` is set, the semantic judge. A request that
 * clears every layer has its context attached as `req.guardrailContext` and
 * `next()` is called. Any other outcome ends the request here with an error
 * response, and the decisions gathered so far are written to the audit log.
 *
 * Rejection responses:
 * - 400: body fails `UserInputSchema`, or prompt exceeds `policy.maxPromptLength`
 * - 403: a blocked pattern matched, or the judge reported `high` risk
 * - 408: the judge did not answer within `policy.latencyBudgetMs`
 * - 500: any unexpected error (fail closed)
 *
 * @param req    Incoming request; `req.body` is the untrusted payload.
 * @param res    Response used to reject the request.
 * @param next   Called only when every enabled layer allows the request.
 * @param policy Safety policy applied to this request.
 */
export async function llmSafetyGuardrail(
  req: Request,
  res: Response,
  next: NextFunction,
  policy: SafetyPolicy
): Promise<void> {
  // A repeated header arrives as an array; use its first value.
  const headerId = req.headers['x-request-id'];
  const context: GuardrailContext = {
    requestId: (Array.isArray(headerId) ? headerId[0] : headerId) || crypto.randomUUID(),
    policy,
    // Placeholder until the body has been validated below.
    input: {} as GuardrailContext['input'],
    safetyDecisions: []
  };

  // Record a blocking decision, write the audit record, and answer the client.
  const reject = (
    status: number,
    error: string,
    decision: GuardrailContext['safetyDecisions'][number]
  ): void => {
    context.safetyDecisions.push(decision);
    logSafetyAudit(context);
    res.status(status).json({ error, requestId: context.requestId });
  };

  try {
    // Layer 1: Structural & Pattern Validation
    const startInput = Date.now();
    const parsed = UserInputSchema.safeParse(req.body);
    if (!parsed.success) {
      reject(400, 'Invalid input format', {
        layer: 'input-schema',
        decision: 'block',
        latencyMs: Date.now() - startInput,
        reason: 'Invalid payload structure'
      });
      return;
    }
    context.input = parsed.data;

    // The schema carries a fixed cap; the policy may demand a tighter one.
    if (context.input.prompt.length > policy.maxPromptLength) {
      reject(400, 'Invalid input format', {
        layer: 'input-schema',
        decision: 'block',
        latencyMs: Date.now() - startInput,
        reason: 'Prompt exceeds policy maxPromptLength'
      });
      return;
    }

    for (const pattern of policy.blockedPatterns) {
      // Patterns are reused across requests; a `g`/`y` regex resumes from its
      // previous lastIndex and could otherwise miss a match.
      pattern.lastIndex = 0;
      if (pattern.test(context.input.prompt)) {
        reject(403, 'Policy violation', {
          layer: 'pattern-filter',
          decision: 'block',
          latencyMs: Date.now() - startInput,
          reason: 'Blocked pattern detected'
        });
        return;
      }
    }
    context.safetyDecisions.push({ layer: 'pattern-filter', decision: 'allow', latencyMs: Date.now() - startInput });

    // Layer 2: Async Semantic Validation (LLM-as-Judge)
    if (policy.enableSemanticCheck) {
      const startSemantic = Date.now();
      const semanticResult = await evaluatePromptIntent(context.input.prompt, policy.latencyBudgetMs);
      const semanticLatency = Date.now() - startSemantic;
      if (semanticResult.latencyExceeded) {
        reject(408, 'Safety check timeout', {
          layer: 'semantic-judge',
          decision: 'block',
          latencyMs: semanticLatency,
          reason: 'Latency budget exceeded'
        });
        return;
      }
      if (semanticResult.riskLevel === 'high') {
        reject(403, 'Policy violation', {
          layer: 'semantic-judge',
          decision: 'block',
          latencyMs: semanticLatency,
          reason: 'High-risk intent detected'
        });
        return;
      }
      context.safetyDecisions.push({ layer: 'semantic-judge', decision: 'allow', latencyMs: semanticLatency });
    }

    // Attach context and proceed to model execution
    (req as any).guardrailContext = context;
    next();
  } catch (error) {
    // Fail-safe: block on unexpected errors, but keep the cause for operators.
    console.error(
      `Safety pipeline failure for request ${context.requestId}:`,
      error instanceof Error ? error.message : String(error)
    );
    context.safetyDecisions.push({ layer: 'system', decision: 'block', latencyMs: 0, reason: 'Internal safety failure' });
    // A second write after the response has started would itself throw.
    if (!res.headersSent) {
      res.status(500).json({ error: 'Safety pipeline failure', requestId: context.requestId });
    }
    logSafetyAudit(context);
  }
}
// Stub for semantic evaluation (replace with actual LLM-as-judge or classifier)
/**
 * Asks the safety-judge service to classify a prompt, giving up after
 * `budgetMs` milliseconds.
 *
 * @param prompt   Prompt text to classify.
 * @param budgetMs Time budget in milliseconds; the request is aborted once it elapses.
 * @returns `{ riskLevel, latencyExceeded: false }` with the judge's `risk_level`
 *          on success, or `{ riskLevel: 'unknown', latencyExceeded: true }` when
 *          the request was aborted.
 * @throws Error when the judge answers with a non-2xx status or a body without
 *         a string `risk_level`; other fetch/parse failures are rethrown as-is.
 *         Throwing, rather than returning an undefined risk level, keeps a
 *         broken judge from being read as "not high risk".
 */
async function evaluatePromptIntent(
  prompt: string,
  budgetMs: number
): Promise<{ riskLevel: string; latencyExceeded: boolean }> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), budgetMs);
  try {
    // In production: call a lightweight classifier or small judge model
    const response = await fetch('https://api.internal/safety-judge', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt, budget: budgetMs }),
      signal: controller.signal
    });
    if (!response.ok) {
      throw new Error(`Safety judge responded with HTTP ${response.status}`);
    }

    const payload: unknown = await response.json();
    const riskLevel =
      typeof payload === 'object' && payload !== null
        ? (payload as { risk_level?: unknown }).risk_level
        : undefined;
    if (typeof riskLevel !== 'string' || riskLevel.length === 0) {
      throw new Error('Safety judge response is missing a risk_level');
    }
    return { riskLevel, latencyExceeded: false };
  } catch (e: unknown) {
    // An abort surfaces as an error named 'AbortError'.
    const aborted =
      typeof e === 'object' && e !== null && (e as { name?: unknown }).name === 'AbortError';
    if (aborted) return { riskLevel: 'unknown', latencyExceeded: true };
    throw e;
  } finally {
    // Always release the timer so it cannot fire late or keep the process alive.
    clearTimeout(timeout);
  }
}
3. Output Validation & Audit Logging
/**
 * Post-execution middleware: validates the model output against
 * `ModelOutputSchema`, records the verdict, and writes the audit record.
 *
 * Reads the raw output from `res.locals.modelOutput`. On success the parsed
 * output is stored on the context and exposed as `res.locals.safeOutput`,
 * then `next()` is called. On a schema violation the request ends with 502.
 * If no guardrail context is attached to the request, it ends with 500
 * rather than letting unvalidated output through.
 *
 * @param req  Request expected to carry `guardrailContext`.
 * @param res  Response; `res.locals.modelOutput` is the value under test.
 * @param next Called only when the output passes validation.
 */
export function validateAndLogOutput(req: Request, res: Response, next: NextFunction): void {
  const ctx = (req as any).guardrailContext as GuardrailContext | undefined;
  if (!ctx) {
    // Without a context the input-side checks cannot be shown to have run: fail closed.
    res.status(500).json({ error: 'Safety pipeline failure' });
    return;
  }

  const output = res.locals.modelOutput; // Set by downstream model handler
  const startOutput = Date.now();
  const parsed = ModelOutputSchema.safeParse(output);
  const outputLatency = Date.now() - startOutput;

  if (!parsed.success) {
    ctx.safetyDecisions.push({ layer: 'output-schema', decision: 'block', latencyMs: outputLatency, reason: 'Output violates schema' });
    logSafetyAudit(ctx);
    res.status(502).json({ error: 'Model output validation failed', requestId: ctx.requestId });
    return;
  }

  ctx.output = parsed.data;
  ctx.safetyDecisions.push({ layer: 'output-schema', decision: 'allow', latencyMs: outputLatency });
  // Attach sanitized output for response
  res.locals.safeOutput = ctx.output;
  logSafetyAudit(ctx);
  next();
}
/**
 * Writes one structured `safety_audit` record for a request to stdout.
 *
 * The record carries the request id, the policy version (the policy's
 * `version` field when it has one, otherwise 'unversioned'), the policy's
 * latency budget, every layer decision, and the sum of the layers' latencies.
 *
 * @param ctx Guardrail context whose decisions are being reported.
 */
function logSafetyAudit(ctx: GuardrailContext): void {
  // The version is optional on a policy, so read it defensively.
  const { version } = ctx.policy as SafetyPolicy & { version?: string };
  const totalLatencyMs = ctx.safetyDecisions.reduce((acc, d) => acc + d.latencyMs, 0);

  // Production: ship to structured logging pipeline (OpenTelemetry, CloudWatch, Datadog)
  console.log(JSON.stringify({
    event: 'safety_audit',
    requestId: ctx.requestId,
    policyVersion: version ?? 'unversioned',
    latencyBudgetMs: ctx.policy.latencyBudgetMs,
    decisions: ctx.safetyDecisions,
    totalLatencyMs
  }));
}
4. Integration Example
import express from 'express';
// Wires the guardrails around a mocked model call on POST /chat.
const app = express();
app.use(express.json());

// Policy applied to every /chat request in this example.
const policy: SafetyPolicy = {
  maxPromptLength: 2000,
  blockedPatterns: [/sudo\s+rm\s+-rf/i, /DROP\s+TABLE/i, /exec\(/i],
  allowedDomains: ['api.internal', 'docs.company.com'],
  latencyBudgetMs: 150,
  enableSemanticCheck: true
};

// Step 1: input-side checks, bound to the policy above.
const guardInput: express.RequestHandler = (req, res, next) =>
  llmSafetyGuardrail(req, res, next, policy);

// Step 2: stand-in for the real model call; publishes its result on res.locals.
const runModel: express.RequestHandler = async (_req, res, next) => {
  // Simulate LLM call
  res.locals.modelOutput = {
    response: 'Here is the requested information.',
    citations: ['https://api.internal/doc/1'],
    confidence: 0.92,
    tags: ['informational']
  };
  next();
};

// Step 4: reply with the validated output and the request id.
const sendSafeOutput: express.RequestHandler = (req, res) => {
  const { requestId } = (req as any).guardrailContext;
  res.json({ output: res.locals.safeOutput, requestId });
};

// Step 3 (validateAndLogOutput) sits between the model call and the reply.
app.post('/chat', guardInput, runModel, validateAndLogOutput, sendSafeOutput);
Pitfall Guide
- **Relying Solely on Regex/Keyword Filters:** Regex cannot understand context, semantics, or adversarial obfuscation. Attackers use homoglyphs, token splitting, and multi-turn context to bypass pattern matches. Always pair fast filters with semantic or schema validation.
- **Ignoring Latency Budgets for Safety Layers:** Safety checks that run synchronously without timeouts cause request queues to back up. Implement strict AbortController timeouts, circuit breakers, and fallback policies. A 150ms budget for semantic evaluation is standard for interactive APIs.
- **Hardcoding Safety Rules Instead of Dynamic Evaluation:** Static rules degrade as language and attack vectors evolve. Use configurable policy engines, versioned rule sets, and LLM-as-judge systems that can be updated without redeploying application code.
- **Skipping Output Schema Validation:** LLMs frequently return malformed JSON, unstructured text, or unexpected fields. Without schema enforcement, downstream systems crash or process invalid data. Zod or JSON Schema validation must be mandatory before any business logic.
- **Missing Audit Trails and Compliance Mapping:** Regulators require proof of safety controls. Log every safety decision, policy version, model version, and latency metric. Map controls to NIST AI RMF or EU AI Act requirements. Without auditability, you cannot prove compliance or debug incidents.
- **Over-Constraining Outputs:** Aggressive filtering breaks legitimate use cases. Medical, legal, and technical domains require nuanced safety boundaries. Implement tiered policies: strict for public-facing, permissive for internal RAG, and domain-specific for regulated workflows.
- **Assuming Fine-Tuned or RLHF Models Are Inherently Safe:** Alignment is probabilistic, not absolute. Fine-tuning reduces but does not eliminate jailbreak success rates. Production systems must enforce guardrails at the application layer regardless of model training methodology.
Best Practices from Production:
- Threat-model your LLM integration like you would a database or payment processor.
- Run continuous red-teaming campaigns using automated prompt mutation tools.
- Implement canary deployments for safety policy updates.
- Separate safety infrastructure from core business logic to enable independent scaling and testing.
- Monitor false positive rates and adjust thresholds based on user feedback loops.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Public-facing chatbot | Multi-layer with strict semantic judge | High adversarial exposure requires defense-in-depth | Moderate (judge model calls + logging) |
| Internal RAG system | Schema validation + pattern filtering | Lower threat surface, latency sensitivity critical | Low |
| Financial/Healthcare compliance | Multi-layer + immutable audit + policy versioning | Regulatory mandates require full traceability and strict controls | High (audit infrastructure + judge models) |
| High-throughput API gateway | Async non-blocking + circuit breakers + fast filters | Throughput demands prevent synchronous blocking | Low-Moderate (infrastructure scaling) |
Configuration Template
// safety.config.ts

// Input layer: schema name, blocked patterns, prompt cap, and time budget (ms).
const inputLayer = {
  schema: 'UserInputSchema',
  blockedPatterns: [/sudo\s+rm\s+-rf/i, /DROP\s+TABLE/i, /exec\(/i],
  maxPromptLength: 2000,
  timeoutMs: 50
};

// Semantic layer: judge endpoint, time budget (ms), blocking threshold, and
// the action taken when the judge cannot be consulted.
const semanticLayer = {
  enabled: true,
  endpoint: 'https://api.internal/safety-judge',
  timeoutMs: 150,
  riskThreshold: 'high',
  fallback: 'block'
};

// Output layer: schema name, citation domain allow-list, and time budget (ms).
const outputLayer = {
  schema: 'ModelOutputSchema',
  allowedCitationDomains: ['api.internal', 'docs.company.com'],
  timeoutMs: 30
};

// Audit sink and what each record should include.
const auditSettings = {
  enabled: true,
  destination: 'cloudwatch',
  retentionDays: 365,
  includeLatency: true,
  includePolicyVersion: true
};

// Per-tier switches.
const policyTiers = {
  public: { semanticEnabled: true, strictMode: true },
  internal: { semanticEnabled: false, strictMode: false },
  regulated: { semanticEnabled: true, strictMode: true, auditRequired: true }
};

/** Versioned safety configuration assembled from the sections above. */
export const SafetyConfig = {
  version: '1.0.0',
  layers: {
    input: inputLayer,
    semantic: semanticLayer,
    output: outputLayer
  },
  audit: auditSettings,
  policies: policyTiers
};
Quick Start Guide
- Install dependencies: `npm install zod express`
- Copy the guardrail middleware into your project and import `llmSafetyGuardrail` and `validateAndLogOutput`.
- Define your safety policy matching the `SafetyPolicy` type, adjusting blocked patterns, latency budgets, and semantic evaluation settings to your threat model.
- Attach middleware to your route before and after the LLM call: pre-execution for input validation, post-execution for output verification and audit logging.
- Deploy with structured logging enabled and verify safety decisions appear in your observability pipeline. Run a quick red-team test using adversarial prompts to confirm blocking behavior.