Tokens: number;
maxOutputTokens: number;
costPerInputToken: number;
costPerOutputToken: number;
}
/**
 * Per-request usage record stored by TokenBudgetMiddleware in its `usage` map.
 * Created in `validateAndTrack` and updated in place by `recordOutput`.
 */
interface UsageTelemetry {
/** Caller-supplied request identifier; also the key of this record in the usage map. */
requestId: string;
/** Estimated token count of the prompt (character-based estimate, not tokenizer output). */
inputTokens: number;
/** Estimated token count of the output; initialised to 0 until `recordOutput` is called. */
outputTokens: number;
/** Token counts multiplied by the budget's per-token costs; currency is whatever the budget uses. */
estimatedCost: number;
/** Model identifier passed to `validateAndTrack` by the caller. */
model: string;
/** `Date.now()` value captured when the record was created. */
timestamp: number;
}
/**
 * Tracks estimated token usage and cost per request, and rejects prompts whose
 * estimated input size exceeds the configured budget.
 *
 * Token counts are character-based estimates (see `estimateTokens`), not real
 * tokenizer output. Usage records are kept in an in-memory map for the lifetime
 * of the instance; nothing in this class evicts them.
 */
export class TokenBudgetMiddleware {
  private readonly usage: Map<string, UsageTelemetry> = new Map();

  constructor(private budget: TokenBudget) {}

  /**
   * Estimates the prompt's token count, enforces the input budget, and opens a
   * usage record for the request.
   *
   * @param requestId - Key under which the usage record is stored; an existing
   *   record with the same id is overwritten.
   * @param prompt - Prompt text whose size is estimated.
   * @param model - Model identifier stored on the record.
   * @throws Error when the estimated input tokens exceed `budget.maxInputTokens`.
   */
  async validateAndTrack(
    requestId: string,
    prompt: string,
    model: string
  ): Promise<void> {
    const inputTokens = this.estimateTokens(prompt);
    if (inputTokens > this.budget.maxInputTokens) {
      throw new Error(
        `Input token limit exceeded: ${inputTokens}/${this.budget.maxInputTokens}`
      );
    }

    this.usage.set(requestId, {
      requestId,
      inputTokens,
      outputTokens: 0,
      estimatedCost: inputTokens * this.budget.costPerInputToken,
      model,
      timestamp: Date.now(),
    });
  }

  /**
   * Records the output size for a previously validated request and refreshes
   * its estimated cost.
   *
   * @param requestId - Id previously passed to `validateAndTrack`.
   * @param output - Generated text whose size is estimated.
   * @throws Error when no usage record exists for `requestId`.
   */
  async recordOutput(requestId: string, output: string): Promise<void> {
    const record = this.usage.get(requestId);
    if (!record) throw new Error(`Unknown request: ${requestId}`);

    record.outputTokens = this.estimateTokens(output);
    // Recompute from both token counts instead of accumulating, so calling this
    // more than once for the same request (e.g. after a retry) cannot
    // double-count the output cost.
    record.estimatedCost =
      record.inputTokens * this.budget.costPerInputToken +
      record.outputTokens * this.budget.costPerOutputToken;
  }

  /** Rough estimate: one token per four characters, rounded up. */
  private estimateTokens(text: string): number {
    // Production: use tiktoken or model-specific tokenizer
    return Math.ceil(text.length / 4);
  }
}
### Step 2: Semantic Prompt Caching
Exact-match caching fails under natural language variation. Semantic caching uses embedding similarity to return cached responses when queries fall within a defined threshold. This eliminates redundant API calls for paraphrased or structurally identical requests.
```typescript
import { createHash } from 'crypto';
/** One cached response held in SemanticCache's in-memory store. */
interface CacheEntry {
/** Vector returned by `generateEmbedding` for the prompt that produced this entry. */
embedding: number[];
/** Value returned by the `compute` callback when the entry was created. */
response: string;
/** Lifetime of the entry; compared against `Date.now() - createdAt`, so it is in milliseconds. */
ttl: number;
/** `Date.now()` value captured when the entry was stored. */
createdAt: number;
}
/**
 * In-memory response cache with two lookup stages:
 *
 * 1. exact match on a SHA-256 hash of the trimmed, lower-cased prompt;
 * 2. nearest stored embedding whose cosine similarity to the prompt's
 *    embedding is at least `similarityThreshold`.
 *
 * Expired entries are evicted when they are encountered. The similarity stage
 * is a linear scan over the store, which is adequate for a small in-process
 * cache only.
 *
 * NOTE: `generateEmbedding` is a random placeholder. Until it is replaced with
 * a real embedding provider the similarity stage carries no meaning, and a
 * threshold well below the default may produce spurious hits.
 */
export class SemanticCache {
  private store: Map<string, CacheEntry> = new Map();
  private similarityThreshold: number;
  private ttlMs: number;

  /**
   * @param similarityThreshold - Minimum cosine similarity for a semantic hit.
   * @param ttlMs - Lifetime of new entries in milliseconds (default: 1 hour).
   */
  constructor(similarityThreshold = 0.92, ttlMs = 3600000) {
    this.similarityThreshold = similarityThreshold;
    this.ttlMs = ttlMs;
  }

  /**
   * Returns a cached response for `prompt` when one is available, otherwise
   * runs `compute`, stores its result, and returns it.
   *
   * @param prompt - Prompt text used for both the exact and the semantic lookup.
   * @param compute - Producer invoked only on a cache miss.
   * @returns The cached or freshly computed response.
   */
  async getOrCompute(
    prompt: string,
    compute: () => Promise<string>
  ): Promise<string> {
    const key = this.hashPrompt(prompt);

    const exact = this.store.get(key);
    if (exact) {
      if (!this.isExpired(exact, Date.now())) return exact.response;
      this.store.delete(key);
    }

    // The embedding is needed for storage on a miss anyway, so computing it
    // before the lookup adds no extra provider call compared with storing only.
    const embedding = await this.generateEmbedding(prompt);
    const similar = this.findSimilar(embedding);
    if (similar) return similar.response;

    const response = await compute();
    this.store.set(key, {
      embedding,
      response,
      ttl: this.ttlMs,
      createdAt: Date.now(),
    });
    return response;
  }

  /** True when the entry has outlived its TTL at time `now`. */
  private isExpired(entry: CacheEntry, now: number): boolean {
    return now - entry.createdAt >= entry.ttl;
  }

  /** Finds the live entry most similar to `embedding`, if any meets the threshold. */
  private findSimilar(embedding: number[]): CacheEntry | undefined {
    const now = Date.now();
    let best: CacheEntry | undefined;
    let bestScore = this.similarityThreshold;

    for (const [key, entry] of this.store) {
      if (this.isExpired(entry, now)) {
        // Deleting the current key while iterating a Map is safe.
        this.store.delete(key);
        continue;
      }
      const score = this.cosineSimilarity(embedding, entry.embedding);
      if (score >= bestScore) {
        best = entry;
        bestScore = score;
      }
    }
    return best;
  }

  /** Cosine similarity of two vectors; 0 for empty, mismatched, or zero-length vectors. */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length === 0 || a.length !== b.length) return 0;

    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      const x = a[i];
      const y = b[i];
      dot += x * y;
      normA += x * x;
      normB += y * y;
    }
    if (normA === 0 || normB === 0) return 0;
    return dot / Math.sqrt(normA * normB);
  }

  /** Exact-match key: SHA-256 hex digest of the trimmed, lower-cased prompt. */
  private hashPrompt(prompt: string): string {
    return createHash('sha256').update(prompt.trim().toLowerCase()).digest('hex');
  }

  /** Placeholder embedding: 1536 random values. Replace before relying on similarity. */
  private async generateEmbedding(text: string): Promise<number[]> {
    // Production: integrate with OpenAI/Anthropic/embedding provider
    // Return normalized vector for similarity comparison
    return new Array(1536).fill(0).map(() => Math.random());
  }
}
```
### Step 3: Dynamic Model Routing
Route requests based on complexity scoring. Use a lightweight classifier or heuristic to assign queries to appropriate model tiers. Reserve high-capability models for ambiguous, multi-step, or safety-critical prompts.
/** Describes one model tier that DynamicRouter can select. */
interface ModelTier {
/** Model identifier returned by `resolveModel` when this tier is chosen. */
id: string;
/** Highest complexity score this tier accepts; a prompt is routed here when its score is <= this value. */
maxComplexity: number;
/** NOTE(review): not read by the router code in this file; presumably cost relative to the most expensive tier — confirm. */
costMultiplier: number;
/** NOTE(review): not read by the router code in this file; unit looks like milliseconds — confirm. */
latencyTarget: number;
}
/**
 * Picks a model tier for a prompt from a keyword- and length-based complexity
 * score in the range 1..10.
 */
export class DynamicRouter {
  // Ordered from cheapest to most capable; the first tier whose maxComplexity
  // covers the score wins.
  private readonly tiers: readonly ModelTier[] = [
    { id: 'fast-7b', maxComplexity: 3, costMultiplier: 0.15, latencyTarget: 200 },
    { id: 'balanced-32b', maxComplexity: 6, costMultiplier: 0.45, latencyTarget: 600 },
    { id: 'premium-4o', maxComplexity: 10, costMultiplier: 1.0, latencyTarget: 1200 },
  ];

  /**
   * Resolves the model id to use for `prompt`.
   *
   * @param prompt - Prompt text to score.
   * @returns The id of the first tier that accepts the score, or of the last
   *   tier when none does.
   */
  async resolveModel(prompt: string): Promise<string> {
    const complexity = this.scoreComplexity(prompt);
    const tier =
      this.tiers.find((t) => complexity <= t.maxComplexity) ??
      this.tiers[this.tiers.length - 1];
    return tier.id;
  }

  /**
   * Heuristic complexity score: starts at 1 and adds points for length, code or
   * JSON content, analytical verbs, and sensitive domains; capped at 10.
   */
  private scoreComplexity(prompt: string): number {
    // Match keywords case-insensitively: imperative verbs usually open a
    // sentence ("Analyze ...", "Compare ...") and would otherwise be missed,
    // sending demanding prompts to the cheapest tier.
    const text = prompt.toLowerCase();
    const mentionsAny = (terms: readonly string[]): boolean =>
      terms.some((term) => text.includes(term));

    let score = 1;
    // Heuristic scoring: length, special tokens, instruction density
    if (prompt.length > 500) score += 2;
    if (mentionsAny(['```', 'json'])) score += 2;
    if (mentionsAny(['analyze', 'compare', 'derive'])) score += 3;
    if (mentionsAny(['safety', 'legal', 'financial'])) score += 2;
    return Math.min(score, 10);
  }
}
### Step 4: Unified Orchestrator
Combine layers into a single interface. The orchestrator validates budgets, checks cache, routes to the appropriate model, and records usage.
/**
 * Single entry point that chains model routing, input-budget validation,
 * cache lookup, and output accounting around one provider call.
 */
export class LLMOrchestrator {
  constructor(
    private tokenBudget: TokenBudgetMiddleware,
    private cache: SemanticCache,
    private router: DynamicRouter,
    private apiClient: any // Abstracted model provider client
  ) {}

  /**
   * Runs one prompt through the pipeline.
   *
   * The usage record is opened before the cache is consulted, so a cache hit
   * leaves a record with input usage only; output is recorded only when the
   * provider is actually called.
   *
   * @param requestId - Identifier used for usage tracking.
   * @param prompt - Prompt text to route, validate, and answer.
   * @returns The cached or freshly generated response.
   */
  async execute(requestId: string, prompt: string): Promise<string> {
    const selectedModel = await this.router.resolveModel(prompt);
    await this.tokenBudget.validateAndTrack(requestId, prompt, selectedModel);

    // Invoked by the cache only on a miss.
    const generateAndRecord = async () => {
      const completion = await this.apiClient.generate(selectedModel, prompt);
      await this.tokenBudget.recordOutput(requestId, completion);
      return completion;
    };

    return await this.cache.getOrCompute(prompt, generateAndRecord);
  }
}
Architecture Rationale: Decoupling routing from caching prevents cache pollution with low-quality responses. Token budgeting acts as a guardrail before model invocation, avoiding wasted spend on oversized prompts. The orchestrator holds no state of its own — usage records and cached responses live in the injected budget middleware and cache — so it can scale horizontally once those two components are backed by shared storage, which also keeps telemetry consistent across instances. Streaming can be supported by piping the API client response through the same validation layer without blocking token accounting.
Pitfall Guide
-
Exact-Match Caching Over Semantic Caching
Natural language varies. Exact string matching misses 60–70% of reusable queries. Semantic caching with embedding similarity or prefix hashing captures paraphrased requests without re-inference.
-
Ignoring Output Token Costs
Teams optimize input prompts but leave output generation unbounded. Verbose responses, uncontrolled temperature, and missing max_tokens limits double spend. Always enforce output caps and use structured output formats to constrain generation.
-
Static Routing Thresholds
Hardcoded complexity scores break under distribution shift. User intent evolves. Implement adaptive routing that logs routing decisions, tracks quality metrics per tier, and retrains or adjusts thresholds monthly.
-
Unbounded Retry Loops
Network timeouts, rate limits, and model errors trigger exponential retry chains. Each retry consumes tokens and multiplies cost. Implement exponential backoff with jitter, circuit breakers, and fallback to cached or degraded responses.
-
Context Window Fragmentation
Padding conversations with stale messages, metadata, or unused history inflates input tokens. Use sliding windows, relevance scoring, or automatic summarization to compress history before injection.
-
Optimizing Cost Without Quality Guardrails
Aggressive routing to cheaper models degrades accuracy silently. Implement automated evaluation pipelines (LLM-as-judge, golden dataset scoring, or human sampling) to track quality retention alongside cost metrics.
-
Missing Token-Level Observability
Without per-request token accounting, cost attribution is impossible. Integrate structured logging with request IDs, model tiers, cache hits, and estimated costs. Export to metrics backends for alerting and budget tracking.
Best Practices from Production:
- Treat token accounting as a first-class observability signal, not an afterthought.
- Cache at the semantic layer, not the HTTP layer.
- Route dynamically, but validate routing decisions against quality benchmarks.
- Enforce hard budgets with circuit breakers that degrade gracefully.
- Monitor cost-per-quality, not just cost-per-query.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-volume repetitive queries (FAQ, support) | Semantic caching + fast-tier routing | 60–80% cache hit rate eliminates redundant inference | -65% to -80% |
| Complex reasoning or safety-critical tasks | Premium model routing + output constraints | Quality degradation risk outweighs savings | +10% to +15% (justified) |
| Conversational agents with long history | Sliding window + automatic summarization | Reduces input token volume by 40–60% | -35% to -50% |
| Budget-constrained MVP or internal tool | Multi-tier routing + strict output limits | Balances capability and spend without caching overhead | -40% to -55% |
| Regulated/audited workloads | Full observability + deterministic routing | Traceability required; caching may conflict with compliance | Neutral to -15% |
Configuration Template
/**
 * Example configuration covering the token budget, cache, routing, context
 * management, and observability settings described in this guide.
 *
 * NOTE(review): several fields here (e.g. `hardLimitEnabled`, `storage`,
 * `maxEntries`, `contextManagement`, `observability`) are not read by the
 * classes shown in this document — confirm which component consumes them.
 */
export const llmOptimizationConfig = {
// Budget limits and per-token prices; field names match those read by TokenBudgetMiddleware.
tokenBudget: {
maxInputTokens: 8000,
maxOutputTokens: 1024,
// 0.0000025 per token = 2.50 per million tokens.
costPerInputToken: 0.0000025,
// 0.00001 per token = 10 per million tokens.
costPerOutputToken: 0.00001,
hardLimitEnabled: true,
},
semanticCache: {
enabled: true,
// Same value as SemanticCache's default similarity threshold.
similarityThreshold: 0.92,
// 3,600,000 ms = 1 hour.
ttlMs: 3600000,
storage: 'redis', // or 'memory' for dev
maxEntries: 50000,
},
routing: {
enabled: true,
// Same ids and thresholds as DynamicRouter's built-in tiers, without latencyTarget.
tiers: [
{ id: 'fast-7b', maxComplexity: 3, costMultiplier: 0.15 },
{ id: 'balanced-32b', maxComplexity: 6, costMultiplier: 0.45 },
{ id: 'premium-4o', maxComplexity: 10, costMultiplier: 1.0 },
],
fallbackToCacheOnFailure: true,
},
// Conversation-history trimming settings.
contextManagement: {
maxHistoryMessages: 20,
enableAutoSummarization: true,
summarizeAfterTokens: 4000,
stripMetadata: true,
},
observability: {
emitTokenMetrics: true,
costAlertThreshold: 500, // dollars per day
qualitySamplingRate: 0.05, // 5% of requests evaluated
},
};
Quick Start Guide
- Install dependencies: `npm install redis ioredis tiktoken` (or use your preferred embedding/tokenizer library)
- Initialize the orchestrator: Import `LLMOrchestrator`, `TokenBudgetMiddleware`, `SemanticCache`, and `DynamicRouter`. Pass your provider client and configuration object.
- Replace direct API calls: Swap `client.generate(model, prompt)` with `orchestrator.execute(requestId, prompt)`. Ensure `requestId` is propagated from your request context.
- Enable telemetry: Wire the token budget middleware to your metrics backend. Set alerts for daily cost thresholds and cache hit rates below 40%.
- Validate routing: Run a golden dataset through the orchestrator. Compare quality scores and latency against baseline. Adjust complexity thresholds if quality drops below 95%.