tter } from 'events';
/** Settings that control entry lifetime, lock lifetime, and metrics reporting for the cache manager. */
interface CacheConfig {
/** Default time-to-live for cached entries, in seconds; used when a write supplies no override. */
ttlSeconds: number;
/** Lifetime of the per-key stampede-protection lock, expressed in milliseconds. */
mutexTimeoutMs: number;
/** Optional emitter that receives cache events; when omitted, no metrics are emitted. */
metricsEmitter?: EventEmitter;
}
/** Envelope that is JSON-serialized and stored in Redis for each cached value. */
interface CacheEntry<T> {
/** The cached payload itself. */
value: T;
/** Write-time marker: the creation timestamp (`Date.now()`) rendered in base 36. */
version: string;
/** Epoch milliseconds at which the entry is considered expired (write time + TTL). */
expiresAt: number;
}
/**
 * Cache-aside manager backed by Redis with per-key stampede protection.
 *
 * Values are stored as JSON-encoded {@link CacheEntry} envelopes under the
 * `app:v1:` key namespace. On a miss, a short-lived Redis lock (`SET ... NX`)
 * ensures that only one caller runs the fallback for a given key at a time;
 * other callers poll until the value appears or the lock becomes available.
 */
export class ProductionCacheManager {
  /** Namespace and schema version prepended to every cache key. */
  private static readonly CACHE_KEY_PREFIX = 'app:v1:';

  /** Pause between polls while another caller holds the lock for a key. */
  private static readonly LOCK_POLL_INTERVAL_MS = 50;

  /** Lua script that deletes the lock only if it still carries the caller's token. */
  private static readonly RELEASE_LOCK_SCRIPT =
    'if redis.call("get", KEYS[1]) == ARGV[1] then return redis.call("del", KEYS[1]) else return 0 end';

  private readonly redis: Redis;
  private readonly config: CacheConfig;
  private mutexKeyPrefix = 'lock:';

  /**
   * @param redisClient Redis client used for all cache and lock commands.
   * @param config TTL, lock timeout, and optional metrics emitter.
   */
  constructor(redisClient: Redis, config: CacheConfig) {
    this.redis = redisClient;
    this.config = config;
  }

  /**
   * Returns the cached value for `key`, loading it through `fallback` on a miss.
   *
   * @param key Logical key; the `app:v1:` prefix is added internally.
   * @param fallback Loader invoked by the single caller that wins the lock.
   * @returns The cached value, or the freshly loaded one.
   * @throws Whatever `fallback` or the Redis client throws. The lock is
   *   released (best-effort) before the error propagates.
   */
  async get<T>(key: string, fallback: () => Promise<T>): Promise<T> {
    const cacheKey = this.buildCacheKey(key);
    const mutexKey = `${this.mutexKeyPrefix}${cacheKey}`;
    // PX needs a positive integer; using milliseconds keeps sub-second timeouts exact.
    const lockTtlMs = Math.max(1, Math.ceil(this.config.mutexTimeoutMs));

    // Loop instead of recursing so long waits do not build a chain of nested promises.
    for (;;) {
      const cached = await this.readEntry<T>(cacheKey);
      if (cached !== undefined) {
        this.emitMetric('cache.hit', key);
        return cached.value;
      }

      // Cache miss: protect against stampede. A unique token marks this
      // acquisition so that only its owner can release the lock later.
      const token = `${Date.now().toString(36)}-${Math.random().toString(36).slice(2)}`;
      const acquired = await this.redis.set(mutexKey, token, 'PX', lockTtlMs, 'NX');
      if (acquired !== 'OK') {
        // Someone else is loading the value; wait briefly, then look again.
        await new Promise<void>((resolve) =>
          setTimeout(resolve, ProductionCacheManager.LOCK_POLL_INTERVAL_MS),
        );
        continue;
      }

      try {
        // Re-check after acquiring: the previous holder may have just populated the key.
        const recheck = await this.readEntry<T>(cacheKey);
        if (recheck !== undefined) {
          this.emitMetric('cache.hit', key);
          return recheck.value;
        }

        const value = await fallback();
        await this.writeEntry(cacheKey, value, this.config.ttlSeconds);
        this.emitMetric('cache.miss', key);
        return value;
      } finally {
        await this.releaseLock(key, mutexKey, token);
      }
    }
  }

  /**
   * Removes the cached entry for `key` together with its lock key.
   *
   * @param key Logical key; the `app:v1:` prefix is added internally.
   */
  async invalidate(key: string): Promise<void> {
    const cacheKey = this.buildCacheKey(key);
    await this.redis.del(cacheKey);
    await this.redis.del(`${this.mutexKeyPrefix}${cacheKey}`);
    this.emitMetric('cache.invalidate', key);
  }

  /**
   * Writes `value` to the cache unconditionally.
   *
   * @param key Logical key; the `app:v1:` prefix is added internally.
   * @param value Value to store; it must be JSON-serializable.
   * @param ttlOverride TTL in seconds; defaults to `config.ttlSeconds`.
   */
  async set<T>(key: string, value: T, ttlOverride?: number): Promise<void> {
    await this.writeEntry(
      this.buildCacheKey(key),
      value,
      ttlOverride ?? this.config.ttlSeconds,
    );
    this.emitMetric('cache.set', key);
  }

  /** Emits a metric event when an emitter is configured; otherwise does nothing. */
  private emitMetric(type: string, key: string): void {
    if (this.config.metricsEmitter) {
      this.config.metricsEmitter.emit(type, { key, timestamp: Date.now() });
    }
  }

  /** Maps a logical key to the namespaced Redis key. */
  private buildCacheKey(key: string): string {
    return `${ProductionCacheManager.CACHE_KEY_PREFIX}${key}`;
  }

  /**
   * Reads and decodes the entry stored under `cacheKey`.
   * Returns `undefined` when the key is absent or its payload is not a JSON
   * object, so a corrupt entry behaves like a miss and is overwritten by the
   * next successful load instead of failing every read until it expires.
   */
  private async readEntry<T>(cacheKey: string): Promise<CacheEntry<T> | undefined> {
    const raw = await this.redis.get(cacheKey);
    if (!raw) {
      return undefined;
    }
    try {
      const parsed: unknown = JSON.parse(raw);
      if (typeof parsed !== 'object' || parsed === null) {
        return undefined;
      }
      // The payload shape is trusted beyond this point: entries are only written by writeEntry.
      return parsed as CacheEntry<T>;
    } catch {
      return undefined;
    }
  }

  /** Wraps `value` in a CacheEntry envelope and stores it with the given TTL (seconds). */
  private async writeEntry<T>(cacheKey: string, value: T, ttlSeconds: number): Promise<void> {
    const now = Date.now();
    const entry: CacheEntry<T> = {
      value,
      version: now.toString(36),
      expiresAt: now + ttlSeconds * 1000,
    };
    await this.redis.set(cacheKey, JSON.stringify(entry), 'EX', ttlSeconds);
  }

  /**
   * Releases the lock only if it still belongs to this acquisition.
   * If the lock expired during a slow fallback and another caller took it,
   * that caller's lock is left in place. Failures are reported as a metric
   * rather than thrown: the lock expires on its own, and throwing from the
   * `finally` block would discard the fetched value or mask the fallback's error.
   */
  private async releaseLock(key: string, mutexKey: string, token: string): Promise<void> {
    try {
      await this.redis.eval(ProductionCacheManager.RELEASE_LOCK_SCRIPT, 1, mutexKey, token);
    } catch {
      this.emitMetric('cache.lock_release_error', key);
    }
  }
}
### Architecture Decisions and Rationale
**Cache-Aside with Explicit Invalidation:** The application controls when data enters and leaves the cache. This prevents silent staleness and allows precise invalidation triggers tied to domain events. TTL serves as a safety net, not the primary invalidation mechanism.
**Distributed Mutex for Stampede Protection:** Concurrent cache misses on hot keys cause database amplification. The `NX` + `EX` pattern ensures only one request executes the fallback function. Recheck after lock acquisition prevents redundant fetches if another node populated the cache during lock wait.
**Key Versioning (`app:v1:`):** Embedding version prefixes in keys enables zero-downtime cache schema migrations. When serialization format or data structure changes, increment the version prefix. Old keys expire naturally while new keys populate independently.
**Metrics Emission:** Hit/miss ratios and invalidation events are critical for capacity planning and pattern validation. Without instrumentation, caching becomes a black box that masks degradation until user-facing latency spikes occur.
**Separation of Cache and Session Clients:** Production systems should never share Redis connections between caching, rate limiting, and session storage. Connection pool exhaustion in one domain cascades to all others. Dedicated clients with isolated network buffers prevent cross-domain failure propagation.
## Pitfall Guide
### 1. Cache Stampede (Thundering Herd)
**Mistake:** Relying solely on TTL expiration without concurrency control. When a hot key expires, hundreds of requests simultaneously hit the database.
**Fix:** Implement distributed mutex locking as shown in the core solution. Limit fallback execution to one request per key. Use exponential backoff for retrying waiting requests. Monitor `cache.miss` burst rates to detect stampede conditions early.
### 2. TTL-Only Invalidation
**Mistake:** Assuming TTL guarantees data freshness. TTL creates predictable staleness windows that violate consistency requirements during updates.
**Fix:** Pair TTL with explicit invalidation. Invalidate cache entries synchronously after successful database writes. Use domain events to trigger invalidation across microservices. Reserve TTL strictly for disaster recovery and memory management.
### 3. Memory Fragmentation from Large Values
**Mistake:** Storing unbounded JSON objects or binary blobs directly in Redis. Large values increase serialization overhead, cause memory fragmentation, and degrade eviction performance.
**Fix:** Compress payloads exceeding 1KB using `zlib` or `lz4`. Split large objects into logical sub-keys using Redis hashes (`HSET`). Monitor `mem_fragmentation_ratio` (reported by `INFO memory`) and trigger alerts when fragmentation exceeds 1.5. Implement value size limits at the application layer.
### 4. Key Namespace Collisions
**Mistake:** Using flat key structures like `user:123` or `config:global`. Cross-service collisions cause silent data corruption and cache poisoning.
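**Fix:** Enforce hierarchical key naming: `{service}:{domain}:{entity}:{id}:{field}` (the braces here are placeholders, not literal characters). In Redis Cluster, wrap the shared part of related keys in a hash tag (for example `orders:{user:123}:cart`) when they must land in the same slot for multi-key operations. Validate key patterns in CI/CD pipelines using regex linting.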
### 5. Silent Cache Failures
**Mistake:** Allowing cache client timeouts to degrade application response times. When Redis becomes unreachable, every awaited `GET` stalls its request until the client gives up, so cache latency becomes request latency.
**Fix:** Implement circuit breaking with short timeouts (50-100ms). Fail open to the database when cache is unavailable. Use `ioredis` retry strategy with exponential backoff and maximum attempt limits. Log cache failures separately from business logic errors.
### 6. Missing Hit/Miss Metrics
**Mistake:** Deploying caching without observability. Teams cannot validate pattern effectiveness or detect hit rate decay.
**Fix:** Instrument every cache operation. Track `hit_rate`, `miss_rate`, `avg_latency`, and `invalidation_count`. Export metrics to Prometheus/Grafana. Set alerts when hit rate drops below 60% for read-heavy endpoints or when miss latency exceeds 200ms.
### 7. Inconsistent Serialization
**Mistake:** Mixing serialization formats across services or updating object structures without cache versioning. Causes `JSON.parse` failures and silent data corruption.
**Fix:** Standardize on JSON with explicit schema validation. Use TypeScript interfaces with runtime type guards. Increment cache key version prefixes on schema changes. Implement backward-compatible deserialization during migration windows.
## Production Bundle
### Action Checklist
- [ ] Audit current cache patterns: Identify endpoints using Cache-Aside, Write-Through, or TTL-only strategies and map them to consistency requirements.
- [ ] Implement stampede protection: Add distributed mutex locking to all cache `get` operations on high-traffic keys (>100 req/s).
- [ ] Enforce explicit invalidation: Replace TTL-dependent freshness with synchronous cache deletion after successful writes.
- [ ] Standardize key naming: Apply hierarchical namespace prefixes and validate patterns in automated tests.
- [ ] Isolate Redis connections: Deploy dedicated client pools for caching, sessions, and rate limiting with separate connection limits.
- [ ] Instrument metrics: Track hit/miss ratios, invalidation counts, and cache latency; alert on degradation thresholds.
- [ ] Test failure modes: Simulate Redis unavailability, network partitions, and stampede conditions in staging environments.
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|----------|---------------------|-----|-------------|
| Read-heavy dashboard with 5-minute data tolerance | Cache-Aside + TTL | Minimizes database load; staleness window aligns with business requirements | Low (cache RAM + minor invalidation overhead) |
| Financial transactions requiring strong consistency | Write-Through | Guarantees cache and database state alignment; eliminates stale read risk | High (doubles write latency and DB load) |
| High-volume event ingestion with async processing | Write-Behind | Batches writes to reduce database pressure; accepts bounded data loss risk | Medium (requires durable write queue and recovery logic) |
| Multi-tenant SaaS with frequent schema updates | Cache-Aside + Versioned Keys | Enables zero-downtime migrations; explicit invalidation prevents cross-tenant data leakage | Low-Medium (versioning increases key count but prevents corruption) |
### Configuration Template
```typescript
// redis-cache.config.ts
import Redis from 'ioredis';
import { ProductionCacheManager } from './ProductionCacheManager';
import { EventEmitter } from 'events';
// Bus on which the cache manager publishes its metric events.
const metricsBus = new EventEmitter();

// Resolve connection settings first so the client options below read declaratively.
const cacheHost = process.env.REDIS_CACHE_HOST || '127.0.0.1';
const cachePort = Number(process.env.REDIS_CACHE_PORT) || 6379;

// Dedicated Redis client for cache traffic: short timeouts and capped retries.
export const cacheClient = new Redis({
  host: cacheHost,
  port: cachePort,
  password: process.env.REDIS_CACHE_PASSWORD,
  maxRetriesPerRequest: 3,
  // Linear backoff of 50 ms per attempt, capped at 2 s.
  retryStrategy: (attempt) => Math.min(attempt * 50, 2000),
  connectTimeout: 1000,
  commandTimeout: 100,
  enableReadyCheck: true,
  keyPrefix: 'cache:v1:',
  family: 4
});

// Connection lifecycle logging.
cacheClient
  .on('error', (error) => {
    console.error('Cache client error:', error.message);
  })
  .on('connect', () => {
    console.log('Cache client connected');
  });

// Production cache manager instance wired to the cache client and metrics bus.
export const cacheManager = new ProductionCacheManager(cacheClient, {
  ttlSeconds: 300,
  mutexTimeoutMs: 500,
  metricsEmitter: metricsBus
});

// Metrics collection hooks: replace the console calls with an exporter
// (Prometheus/Grafana/CloudWatch) in a real deployment.
metricsBus
  .on('cache.hit', (payload) => {
    console.log(`[METRIC] cache.hit key=${payload.key} ts=${payload.timestamp}`);
  })
  .on('cache.miss', (payload) => {
    console.log(`[METRIC] cache.miss key=${payload.key} ts=${payload.timestamp}`);
  })
  .on('cache.invalidate', (payload) => {
    console.log(`[METRIC] cache.invalidate key=${payload.key} ts=${payload.timestamp}`);
  });
```
### Quick Start Guide
- Install dependencies: `npm install ioredis` (add `@types/ioredis` only for ioredis v4; v5 ships its own type declarations).
- Create isolated Redis client: Configure a dedicated connection pool with short timeouts, retry limits, and key prefixes as shown in the configuration template.
- Wrap data fetches: Replace direct database calls with `cacheManager.get('resource:id', () => db.query(...))` to enable automatic cache-aside behavior and stampede protection.
- Add invalidation triggers: Call `cacheManager.invalidate('resource:id')` immediately after successful write operations to maintain consistency.
- Verify metrics: Monitor `cache.hit` and `cache.miss` events in your observability platform. Adjust TTL and mutex timeout values based on traffic patterns and latency requirements.