putTokens: number;
costPer1kOutputTokens: number;
estimatedP50LatencyMs: number;
status: 'active' | 'degraded' | 'offline';
}
/**
 * Static catalogue of routable models, keyed by model id.
 *
 * Each key must equal the entry's own `id`; the router looks models up and
 * reports them by that id. Costs are per 1,000 tokens and latency is an
 * estimated median in milliseconds (currency is not stated in this file —
 * presumably USD; confirm against the provider price sheets).
 */
export const MODEL_REGISTRY: Record<string, ModelCapability> = {
  'claude-3-5-sonnet': {
    id: 'claude-3-5-sonnet',
    provider: 'anthropic',
    maxContextTokens: 200000,
    supportsFunctionCalling: true,
    // Claude 3.5 Sonnet accepts image input; leaving this false would make
    // the router skip it for every request that sets `requiresVision`.
    supportsVision: true,
    costPer1kInputTokens: 0.003,
    costPer1kOutputTokens: 0.015,
    estimatedP50LatencyMs: 380,
    status: 'active',
  },
  'gpt-4o-mini': {
    id: 'gpt-4o-mini',
    provider: 'openai',
    maxContextTokens: 128000,
    supportsFunctionCalling: true,
    supportsVision: true,
    costPer1kInputTokens: 0.00015,
    costPer1kOutputTokens: 0.0006,
    estimatedP50LatencyMs: 120,
    status: 'active',
  },
  'llama-3-8b-instruct': {
    id: 'llama-3-8b-instruct',
    provider: 'meta',
    maxContextTokens: 8192,
    supportsFunctionCalling: false,
    supportsVision: false,
    costPer1kInputTokens: 0.00005,
    costPer1kOutputTokens: 0.00008,
    estimatedP50LatencyMs: 90,
    status: 'active',
  },
};
### Step 2: Build the Request Classifier & Router
The router evaluates task requirements against model capabilities, scores candidates by cost-latency tradeoff, and enforces fallback chains.
```typescript
/**
 * Requirements and budgets a caller supplies to `MultiModelRouter.evaluate`.
 *
 * The capability fields act as hard filters; the cost and latency budgets
 * only lower a model's score and never exclude it.
 */
export interface RoutingRequest {
/** Prompt text for the request. */
prompt: string;
/** When true, only models whose `supportsVision` is true are considered. */
requiresVision?: boolean;
/** When true, only models whose `supportsFunctionCalling` is true are considered. */
requiresFunctionCalling?: boolean;
/**
 * Context size the request needs, in tokens. Models with a smaller
 * `maxContextTokens` are excluded. Also used as the token volume when the
 * router computes `estimatedCost`.
 */
maxContextTokens: number;
/**
 * Cost budget compared against a model's `costPer1kInputTokens`. A model
 * over budget is penalised in scoring but stays eligible.
 */
maxCostPer1kTokens?: number;
/**
 * Latency budget in milliseconds, compared against a model's
 * `estimatedP50LatencyMs`. A model over budget is penalised in scoring but
 * stays eligible.
 */
maxLatencyMs?: number;
/**
 * NOTE(review): `MultiModelRouter.evaluate` as shown here does not read this
 * field. Presumably a caller-preferred list of model ids — confirm intent.
 */
fallbackOrder?: string[];
}
/**
 * Result of `MultiModelRouter.evaluate`: the chosen model, the ordered
 * fallbacks, and estimates derived from the chosen model.
 */
export interface RoutingDecision {
/**
 * Highest-scoring candidate. When no model satisfies the request the router
 * stores `null` here behind a cast, so check `rejectionReason` before use.
 */
selectedModel: ModelCapability;
/** Remaining candidates in descending score order; empty on rejection. */
fallbackChain: ModelCapability[];
/**
 * Selected model's `costPer1kInputTokens` multiplied by
 * `request.maxContextTokens / 1000`; `0` on rejection.
 */
estimatedCost: number;
/** Selected model's `estimatedP50LatencyMs`; `0` on rejection. */
estimatedLatencyMs: number;
/** Set only when no candidate model satisfied the request. */
rejectionReason?: string;
}
/**
 * Chooses a model from `MODEL_REGISTRY` for a request and tracks per-model
 * failures with a simple circuit breaker.
 *
 * Selection is capability-first: inactive models, models whose circuit is
 * open, and models missing a required capability are removed before the
 * remaining candidates are ranked by cost and latency.
 *
 * Breaker state lives on the instance, so failures must be reported to the
 * same router instance that makes routing decisions.
 */
export class MultiModelRouter {
  /** Failure count and time of the latest failure (ms since epoch), keyed by model id. */
  private readonly circuitBreakers = new Map<string, { failures: number; lastFailure: number }>();
  private readonly failureThreshold: number;
  private readonly cooldownMs: number;

  /**
   * @param options.failureThreshold Recorded failures needed to open a model's circuit (default 3).
   * @param options.cooldownMs How long after the latest failure the circuit stays open, in ms (default 60000).
   */
  constructor(options: { failureThreshold?: number; cooldownMs?: number } = {}) {
    this.failureThreshold = options.failureThreshold ?? 3;
    this.cooldownMs = options.cooldownMs ?? 60000;
  }

  /** True while the model has reached the failure threshold and its latest failure is within the cooldown. */
  private isCircuitOpen(modelId: string): boolean {
    const state = this.circuitBreakers.get(modelId);
    if (!state) return false;
    return state.failures >= this.failureThreshold && Date.now() - state.lastFailure < this.cooldownMs;
  }

  /**
   * Records one failed call against a model.
   *
   * Public because the execution pipeline, which lives outside this class,
   * is what observes failures.
   */
  public recordFailure(modelId: string): void {
    const state = this.circuitBreakers.get(modelId) ?? { failures: 0, lastFailure: 0 };
    state.failures++;
    state.lastFailure = Date.now();
    this.circuitBreakers.set(modelId, state);
  }

  /** Clears the failure history of a model, closing its circuit immediately. */
  public recordSuccess(modelId: string): void {
    this.circuitBreakers.delete(modelId);
  }

  /**
   * Selects the best model for a request and the ordered fallbacks.
   *
   * @param request Capability requirements and optional cost/latency budgets.
   * @returns The decision. When no model qualifies, `rejectionReason` is set,
   *   `fallbackChain` is empty and `selectedModel` is `null`.
   */
  public evaluate(request: RoutingRequest): RoutingDecision {
    const candidates = Object.values(MODEL_REGISTRY)
      .filter(m => m.status === 'active' && !this.isCircuitOpen(m.id))
      .filter(m => m.maxContextTokens >= request.maxContextTokens)
      .filter(m => !request.requiresVision || m.supportsVision)
      .filter(m => !request.requiresFunctionCalling || m.supportsFunctionCalling);

    if (candidates.length === 0) {
      return {
        // RoutingDecision declares selectedModel as non-nullable; the cast is
        // kept so existing callers that test it for null keep working.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        selectedModel: null as any,
        fallbackChain: [],
        estimatedCost: 0,
        estimatedLatencyMs: 0,
        rejectionReason: 'No capable models available for request constraints',
      };
    }

    // `map` produces a fresh array, so the in-place sort touches nothing shared.
    const ranked = candidates
      .map(m => ({ model: m, score: this.calculateScore(m, request) }))
      .sort((a, b) => b.score - a.score);

    const selected = ranked[0].model;
    return {
      selectedModel: selected,
      fallbackChain: ranked.slice(1).map(entry => entry.model),
      estimatedCost: selected.costPer1kInputTokens * (request.maxContextTokens / 1000),
      estimatedLatencyMs: selected.estimatedP50LatencyMs,
    };
  }

  /**
   * Scores a candidate; higher is better.
   *
   * Starts at 100, subtracts 50 if the model is over the cost budget and 30
   * if it is over the latency budget, then subtracts a continuous penalty
   * proportional to input cost and latency.
   */
  private calculateScore(model: ModelCapability, request: RoutingRequest): number {
    let score = 100;
    // Compare against undefined rather than truthiness so that an explicit
    // budget of 0 is still enforced.
    if (request.maxCostPer1kTokens !== undefined && model.costPer1kInputTokens > request.maxCostPer1kTokens) {
      score -= 50;
    }
    if (request.maxLatencyMs !== undefined && model.estimatedP50LatencyMs > request.maxLatencyMs) {
      score -= 30;
    }
    score -= model.costPer1kInputTokens * 1000;
    score -= model.estimatedP50LatencyMs / 10;
    return score;
  }
}
```
### Step 3: Implement Execution & Fallback Pipeline
Routing decisions must be paired with resilient execution. The pipeline attempts the primary model, captures failures, and routes to fallback candidates with exponential backoff.
```typescript
/**
 * Runs a prompt against the selected model and, on failure, against each
 * fallback in order, waiting with exponential backoff between attempts.
 *
 * @param decision Routing decision whose `selectedModel` is tried first and
 *   whose `fallbackChain` is tried in order afterwards.
 * @param prompt Sent as a single user message.
 * @param clientFactory Returns a client for a provider name. The client must
 *   expose `chat.completions.create(...)` and resolve to an object with
 *   `choices[0].message.content` and, optionally, `usage.total_tokens`.
 * @param options.router Router that receives `recordFailure` for every failed
 *   model. Pass the instance that produced `decision` so its circuit breakers
 *   see the failures; when omitted a throwaway router is used and the
 *   failures are not visible to any later routing decision.
 * @param options.baseDelayMs Backoff before the first retry, doubled for each
 *   subsequent retry (default 1000).
 * @param options.maxDelayMs Upper bound for a single backoff wait (default 5000).
 * @returns The response text, the id of the model that produced it, and the
 *   reported total token count (0 when the response carries no usage data).
 * @throws Error if the decision has no selected model, or if every candidate fails.
 */
export async function executeWithFallback(
  decision: RoutingDecision,
  prompt: string,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- provider SDK clients are untyped at this boundary
  clientFactory: (provider: string) => any,
  options: { router?: MultiModelRouter; baseDelayMs?: number; maxDelayMs?: number } = {}
): Promise<{ text: string; modelUsed: string; tokensUsed: number }> {
  // A rejected routing decision carries a null selectedModel; fail with the
  // router's own reason instead of a TypeError from dereferencing null.
  if (!decision.selectedModel) {
    throw new Error(decision.rejectionReason ?? 'Routing decision contains no selected model');
  }

  const router = options.router ?? new MultiModelRouter();
  const baseDelayMs = options.baseDelayMs ?? 1000;
  const maxDelayMs = options.maxDelayMs ?? 5000;
  const allModels = [decision.selectedModel, ...decision.fallbackChain];

  for (let i = 0; i < allModels.length; i++) {
    const model = allModels[i];
    try {
      const client = clientFactory(model.provider);
      const response = await client.chat.completions.create({
        model: model.id,
        messages: [{ role: 'user', content: prompt }],
        max_tokens: Math.min(model.maxContextTokens, 2048),
      });
      return {
        text: response.choices[0].message.content,
        modelUsed: model.id,
        tokensUsed: response.usage?.total_tokens ?? 0,
      };
    } catch (error: unknown) {
      router.recordFailure(model.id);
      if (i === allModels.length - 1) {
        const message = error instanceof Error ? error.message : String(error);
        throw new Error(`All routing candidates failed. Last error: ${message}`);
      }
      // Back off before the next candidate: base, 2x base, 4x base, ... capped.
      const delay = Math.min(baseDelayMs * Math.pow(2, i), maxDelayMs);
      await new Promise<void>(resolve => setTimeout(resolve, delay));
    }
  }
  // Unreachable (the loop returns or throws on its last iteration); kept so
  // every code path visibly ends in a return or throw.
  throw new Error('Routing execution exhausted without result');
}
```
### Architecture Decisions & Rationale
- Capability-First Matching: Routing decisions prioritize context window, vision, and tool-use requirements before cost. Cost optimization without capability validation causes silent quality degradation.
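- Per-Request Routing with Isolated Health Tracking: Routing decisions are recomputed from the registry on every request; the only state the router keeps is a per-model circuit breaker record. Each breaker is isolated to its model and stops blocking once its cooldown has elapsed, preventing cascade failures during provider outages. For this to work, the execution pipeline must report failures to the same router instance that makes the routing decisions.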
- Deterministic Fallback Chains: Fallback order is computed at routing time, not hardcoded. This ensures fallbacks respect the same capability constraints as the primary selection.
- Token-Aware Cost Estimation: Cost scoring uses input token volume rather than flat rates, aligning routing decisions with actual workload size.
- Provider Abstraction via Client Factory: Execution decouples routing logic from SDK implementation, enabling runtime provider swapping without code changes.
## Pitfall Guide
1. Static Routing Without Context Awareness
Hardcoding model selection based on endpoint paths or user tiers ignores token volume and task complexity. A 500-token classification task and a 50k-token summarization task routed identically will either waste compute or truncate context.
Best Practice: Route based on extracted request features: token count, modality, tool requirements, and latency budget. Recompute routing per request, not per session.
2. Ignoring Context Window & Token Limits
Routing decisions that don't validate maxContextTokens against prompt + response budget cause silent truncation or API rejections. Models silently drop tokens or throw 400 errors.
Best Practice: Implement token counting at ingestion. Reject or chunk requests exceeding the highest-capability model's window before routing.
3. Unstructured Fallback Chains
Chaining models without capability validation creates cascading failures. If the primary model fails due to context overflow, falling back to a smaller model guarantees the same failure.
Best Practice: Fallback chains must be capability-sorted. Always fall back to a model of equal or higher capability; never downgrade for error recovery.
4. Missing Telemetry & Cost Attribution
Without per-request routing logs, teams cannot identify misalignment patterns. Cost reports show aggregate spend but hide routing inefficiencies.
Best Practice: Emit structured events: routing_decision, model_execution, fallback_triggered, cost_attribution. Tag every log with request_id, selected_model, fallback_chain, tokens_in, tokens_out.
5. Vendor API Contract Drift
Provider SDKs change response shapes, rate limit headers, and error codes. Routers that parse raw provider responses directly are brittle and break during minor version updates.
Best Practice: Abstract provider responses into a normalized ModelResponse interface. Validate against schema on ingestion. Pin SDK versions and run contract tests in CI.
6. Over-Optimizing for Cost at the Expense of Safety
Routing sensitive or regulated prompts to cheaper, unverified models violates compliance boundaries. Cost-driven routing without safety filtering exposes data leakage risks.
Best Practice: Tag models with compliance flags (pii_safe, gdpr_compliant, audit_logged). Enforce safety routing rules before cost scoring. Route regulated workloads to explicitly approved models only.
7. Synchronous Routing Blocking the Critical Path
Evaluating routing logic synchronously on the request path adds 5–15ms latency. Under high concurrency, this compounds into P95 degradation.
Best Practice: Precompute routing decisions for known patterns. Cache capability evaluations. Use async routing with promise racing only when dynamic scoring is required.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume classification/extraction | Lightweight model router with cost-first scoring | Low capability requirements; cost dominates unit economics | -60% to -80% vs frontier models |
| Long-context summarization (>32k tokens) | Capability-first routing to extended-window models | Context window is hard constraint; fallback must preserve capacity | +15% vs fixed model, but prevents truncation losses |
| Real-time chat with tool use | Function-calling-aware router with latency budget | Tool execution requires low P50 latency and structured output support | Neutral cost; +40% reliability |
| Regulated/PII workloads | Safety-tagged routing with compliance filtering | Legal and audit requirements override cost optimization | +10–20% premium for approved models |
| Multi-provider redundancy | Circuit-broken fallback chains with health tracking | Prevents single-vendor outages from halting production | +5% overhead for health checks; -90% incident cost |
### Configuration Template
{
"routing": {
"version": "2.1",
"evaluation": {
"mode": "capability_first",
"maxCandidates": 5,
"scoreWeights": {
"cost": 0.3,
"latency": 0.4,
"capabilityMatch": 0.3
}
},
"fallback": {
"enabled": true,
"maxRetries": 2,
"backoffStrategy": "exponential",
"circuitBreaker": {
"failureThreshold": 3,
"cooldownSeconds": 60,
"halfOpenRequests": 1
}
},
"telemetry": {
"enabled": true,
"attributes": ["request_id", "selected_model", "fallback_chain", "tokens_in", "tokens_out", "cost_cents"],
"exportTarget": "otlp_http"
},
"compliance": {
"piiSafeModels": ["gpt-4o-mini", "claude-3-5-sonnet"],
"blockUnsafeRouting": true,
"auditLogging": true
}
}
}
### Quick Start Guide
- Install dependencies: `npm install @anthropic-ai/sdk openai zod`
- Create registry & router: Copy the `MODEL_REGISTRY` constant and the `MultiModelRouter` class into `src/routing/router.ts`. Expose `evaluate()` as the public entrypoint.
- Define request shape: Create a `RoutingRequest` interface matching your application's prompt structure. Extract token count, modality flags, and latency budget at ingestion.
- Wire execution pipeline: Implement `executeWithFallback()` using your provider SDKs. Replace `clientFactory` with actual initialization logic. Attach telemetry hooks to log routing decisions and execution results.
- Deploy & validate: Run load tests with mixed token volumes. Verify that routing scores shift correctly under cost/latency constraints. Confirm fallback chains trigger only on genuine failures, not capability mismatches.