ii', 'copyright', 'hate_speech', 'self_harm'
]);
export const ModerationAction = z.enum(['allow', 'flag', 'block', 'human_review']);
export const ModerationResult = z.object({
categories: z.array(ModerationCategory),
severity: z.number().min(0).max(1),
action: ModerationAction,
confidence: z.number().min(0).max(1),
reasoning: z.string(),
policy_version: z.string()
});
export type ModerationResult = z.infer<typeof ModerationResult>;
### Step 2: Implement Fast Pre-Filter
The fast filter handles high-volume, low-complexity submissions. It combines keyword matching, regex patterns, and embedding similarity against known violation vectors.
```typescript
export class FastFilter {
private keywordSets: Map<string, Set<string>>;
private embeddingModel: any; // e.g., sentence-transformers via API
constructor(config: FilterConfig) {
this.keywordSets = new Map(Object.entries(config.keywordSets));
this.embeddingModel = config.embeddingModel;
}
async evaluate(content: string): Promise<FilterResult> {
const matches: string[] = [];
for (const [category, keywords] of this.keywordSets) {
if (Array.from(keywords).some(kw => content.toLowerCase().includes(kw))) {
matches.push(category);
}
}
const embedding = await this.embeddingModel.embed(content);
const similarity = await this.checkViolationVectors(embedding);
return {
flagged: matches.length > 0 || similarity > 0.85,
categories: matches,
confidence: matches.length > 0 ? 0.95 : similarity,
bypassLLM: matches.length > 0 // deterministic blocks skip LLM
};
}
}
Step 3: LLM Evaluation with Structured Output
Frontier models must return strictly typed JSON. Prompt templates are versioned and injected with policy context. Rate limiting and circuit breakers prevent cascade failures.
import { createOpenAI } from '@ai-sdk/openai';
import { generateObject } from 'ai';
const openai = createOpenAI({ apiKey: process.env.OPENAI_API_KEY });
export class LLMModerator {
async evaluate(content: string, policyVersion: string): Promise<ModerationResult> {
const prompt = `
Evaluate the following content against policy v${policyVersion}.
Return strictly valid JSON matching the schema.
Content: "${content.slice(0, 4000)}"
`;
const result = await generateObject({
model: openai('gpt-4o-mini'),
schema: ModerationResult,
prompt,
temperature: 0.1,
maxTokens: 500
});
return result.object;
}
}
Step 4: Pipeline Orchestration & Confidence Routing
The orchestrator routes content through the fast filter, conditionally invokes the LLM, applies threshold logic, and logs all decisions for audit.
export class ModerationPipeline {
constructor(
private fastFilter: FastFilter,
private llm: LLMModerator,
private config: PipelineConfig
) {}
async moderate(content: string, context?: string): Promise<ModerationResult> {
const filterResult = await this.fastFilter.evaluate(content);
if (filterResult.bypassLLM && filterResult.confidence > 0.9) {
return this.mapFilterToResult(filterResult);
}
const llmResult = await this.llm.evaluate(content + (context ? ` Context: ${context}` : ''), this.config.policyVersion);
// Apply confidence thresholding
if (llmResult.confidence < this.config.lowConfidenceThreshold) {
llmResult.action = 'human_review';
}
// Log for audit & feedback loop
await this.auditLogger.log({ content, filterResult, llmResult, timestamp: Date.now() });
return llmResult;
}
private mapFilterToResult(filter: FilterResult): ModerationResult {
return {
categories: filter.categories as any[],
severity: filter.confidence,
action: filter.confidence > 0.9 ? 'block' : 'flag',
confidence: filter.confidence,
reasoning: 'Fast filter deterministic match',
policy_version: this.config.policyVersion
};
}
}
Architecture Decisions & Rationale
- Async Queue for Non-Real-Time: Batch UGC (forum posts, comments) routes through BullMQ or AWS SQS to decouple ingestion from evaluation, enabling horizontal scaling and retry logic.
- Structured Outputs Over Free-Form Prompts: Zod validation +
generateObject eliminates parsing failures and ensures schema compliance, reducing downstream errors by ~90%.
- Confidence Thresholding: Hard binary decisions cause churn. Routing low-confidence predictions to human review or deterministic fallbacks maintains compliance while controlling false positives.
- Policy Versioning: Every moderation decision embeds
policy_version. This enables rollback, A/B testing of rule changes, and precise audit trails for compliance reporting.
- Circuit Breaker Pattern: If LLM latency exceeds SLA or error rates spike, the pipeline falls back to the fast filter with elevated sensitivity, preventing platform degradation.
Pitfall Guide
1. Treating Moderation as a Single-Label Classification Problem
Mistake: Using one model to output a single category or binary allow/block decision.
Why it fails: Content frequently violates multiple policies simultaneously. Single-label models force artificial prioritization, causing policy gaps and inconsistent enforcement.
Best Practice: Implement multi-label classification with independent confidence scores per category. Route decisions using a weighted severity matrix rather than a single threshold.
2. Hardcoding Confidence Thresholds Without Calibration
Mistake: Setting confidence < 0.7 β human_review based on intuition.
Why it fails: Model confidence is poorly calibrated by default. A 0.7 score might mean 50% accuracy on toxic content but 90% on spam.
Best Practice: Run Platt scaling or isotonic regression on a held-out validation set. Map raw scores to calibrated probabilities. Adjust thresholds per category based on ROC curves and business risk tolerance.
3. Ignoring Context Window Truncation & Sliding Windows
Mistake: Passing full conversation threads or long-form posts directly to the LLM without chunking.
Why it fails: Context overflow causes silent truncation, missing critical violations in later messages. Token limits also inflate costs unpredictably.
Best Practice: Implement sliding window summarization. Preserve the last N messages verbatim, summarize earlier context, and flag if truncation occurs. Maintain message ordering metadata for policy evaluation.
4. Neglecting Adversarial Evasion Techniques
Mistake: Assuming users submit content in standard formatting.
Why it fails: Malicious actors use homoglyphs, zero-width characters, image-text mismatch, and prompt injection to bypass filters. Keyword filters fail on obfuscation; LLMs can be jailbroken.
Best Practice: Normalize text (Unicode NFC, strip zero-width chars, homoglyph mapping). Implement multimodal alignment checks. Regularly red-team with adversarial datasets. Update fast filter vectors monthly.
5. Deploying Without Continuous Evaluation Pipelines
Mistake: Launching moderation and assuming static performance.
Why it fails: Policy drift, linguistic evolution, and model updates degrade accuracy within weeks. False positives compound silently until user trust erodes.
Best Practice: Maintain a gold-standard evaluation set (5k+ labeled items). Run nightly batch evaluations against production traffic. Track precision, recall, calibration error, and latency. Alert on >3% metric degradation.
6. Treating All Content Equally
Mistake: Applying identical moderation logic to new accounts, trusted creators, and enterprise partners.
Why it fails: Risk profiles differ drastically. Over-moderating trusted users causes churn; under-moderating high-risk accounts triggers compliance violations.
Best Practice: Implement trust-score routing. New/unverified accounts use stricter thresholds and higher LLM routing probability. Trusted users bypass fast filters for low-severity categories. Adjust thresholds dynamically based on account history.
7. Skipping Audit Trails & Compliance Logging
Mistake: Logging only final decisions without intermediate states, confidence scores, or policy versions.
Why it fails: Regulatory audits (GDPR, DSA, COPPA) require explainability. Without full decision graphs, platforms cannot justify actions or rollback policy changes.
Best Practice: Store immutable audit logs containing raw input, filter results, LLM prompts, confidence scores, policy version, and final action. Retain for compliance windows. Provide user-facing appeal endpoints linked to audit IDs.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|
| Real-time chat/messaging | Fast filter β Low-latency LLM (gpt-4o-mini) β Strict thresholds | Sub-300ms SLA required; false positives cause immediate churn | +40% vs batch, but prevents support escalation costs |
| High-volume UGC (forums, comments) | Fast filter β Async queue β Frontier LLM for edge cases | Throughput prioritized; latency tolerant; cost optimization critical | -65% vs full LLM routing; scales linearly with volume |
| Enterprise/Compliance-heavy | Hybrid pipeline + Human-in-the-loop + Full audit trail | Regulatory requirements demand explainability and zero-tolerance for misses | +120% due to human review, but avoids legal/compliance penalties |
| Low-trust/new user onboarding | Strict fast filter + High LLM routing + Lower confidence thresholds | Higher risk of spam/abuse; early detection prevents platform degradation | +25% inference cost, offset by reduced moderation queue volume |
Configuration Template
moderation:
policy_version: "v2.4.1"
thresholds:
toxicity:
block: 0.85
flag: 0.65
human_review: 0.45
spam:
block: 0.90
flag: 0.70
human_review: 0.50
pii:
block: 0.80
flag: 0.60
human_review: 0.40
routing:
fast_filter_bypass_confidence: 0.92
llm_timeout_ms: 450
circuit_breaker_error_threshold: 0.15
async_queue_concurrency: 50
trust_scores:
new_account_multiplier: 1.3
verified_creator_multiplier: 0.7
enterprise_partner_multiplier: 0.5
audit:
retention_days: 365
log_intermediate_states: true
enable_appeal_endpoint: true
evaluation:
nightly_batch_size: 5000
alert_degradation_pct: 3.0
calibration_method: "platt_scaling"
Quick Start Guide
- Initialize dependencies:
npm install ai zod @ai-sdk/openai bullmq and configure environment variables for API keys and queue endpoints.
- Deploy policy config: Copy the YAML template, adjust thresholds per your compliance requirements, and load into your configuration manager. Version every change.
- Run pipeline locally: Instantiate
FastFilter, LLMModerator, and ModerationPipeline. Pass sample content through moderate() and validate Zod schema compliance and confidence routing.
- Enable async routing: Configure BullMQ/SQS for batch workloads. Set up the circuit breaker and fallback logic. Verify latency stays under SLA during traffic spikes.
- Activate evaluation pipeline: Schedule nightly batch runs against your gold-standard dataset. Monitor precision, recall, and calibration error. Adjust thresholds based on ROC curves before production rollout.