import { NatsConnection, connect, headers } from 'nats';
// Module-level NATS connection, opened once at import time via top-level await.
// Everything in this module publishes through this single connection.
const nats = await connect({ servers: 'nats://event-broker:4222' });
/**
 * Singleton that publishes application events over the module-level NATS
 * connection.
 *
 * The constructor is private; obtain the shared instance through
 * `EventPublisher.getInstance()`.
 */
export class EventPublisher {
  private static instance: EventPublisher;

  private constructor() {}

  /** Returns the shared publisher, creating it lazily on first call. */
  static getInstance(): EventPublisher {
    if (!EventPublisher.instance) EventPublisher.instance = new EventPublisher();
    return EventPublisher.instance;
  }

  /**
   * Publishes one event on the subject `events.<type>`.
   *
   * The message body is the JSON serialization of the event with two extra
   * fields added: `timestamp` (milliseconds from `Date.now()`) and
   * `version` (`'v1'`). The idempotency key is sent both inside the body and
   * as the `Idempotency-Key` message header.
   *
   * @param event - Event to publish; `type` selects the subject suffix.
   * @throws Whatever `JSON.stringify` or the NATS client throws (for example a
   *   payload that cannot be serialized).
   */
  async publish(event: { type: string; payload: unknown; idempotencyKey: string }): Promise<void> {
    // Must be a template literal: the subject is derived from the event type.
    const subject = `events.${event.type}`;
    const data = JSON.stringify({
      ...event,
      timestamp: Date.now(),
      version: 'v1',
    });

    // `headers()` is a factory exported by the `nats` package; the connection
    // object itself does not expose one.
    const msgHeaders = headers();
    msgHeaders.set('Idempotency-Key', event.idempotencyKey);

    await nats.publish(subject, data, { headers: msgHeaders });
  }
}
**Architecture Decision:** Use NATS over Kafka for startup scale. Kafka requires ZooKeeper/KRaft, partition management, and heavier operational overhead. NATS JetStream provides durability, at-least-once delivery, and consumer groups with a fraction of the complexity. Reserve Kafka for >10M events/day or strict replay requirements.
### Step 2: Database Scaling via Read Replicas and Connection Routing
Primary databases fail under concurrent read/write contention. Implement read/write splitting with a connection router that directs queries based on type. Use connection pooling with strict limits to prevent connection exhaustion.
```typescript
// db-router.ts
import { Pool } from 'pg';
// Timeout settings shared by both pools (milliseconds, per the option names).
const sharedPoolTimeouts = {
  idleTimeoutMillis: 30000,
  connectionTimeoutMillis: 2000,
};

// Pool for the primary host, capped at 20 connections.
const writePool = new Pool({
  host: 'db-primary.internal',
  max: 20,
  ...sharedPoolTimeouts,
});

// Pool for the replica host, capped at 50 connections.
const readPool = new Pool({
  host: 'db-replica.internal',
  max: 50,
  ...sharedPoolTimeouts,
});
/**
 * Runs a SQL statement on a pooled connection and logs how long it took.
 *
 * @param sql - Statement text passed straight to the driver.
 * @param params - Optional positional parameters for the statement.
 * @param isWrite - When true the statement goes to `writePool`; otherwise
 *   (the default) it goes to `readPool`.
 * @returns The driver's result object for the statement.
 */
export async function query(sql: string, params?: any[], isWrite = false) {
  const conn = await (isWrite ? writePool : readPool).connect();
  const label = isWrite ? 'WRITE' : 'READ';
  try {
    // Timing starts after the connection is acquired, so pool wait time is
    // not part of the reported duration.
    const startedAt = Date.now();
    const result = await conn.query(sql, params);
    // Emit metric to OpenTelemetry
    console.log(`Query ${label}: ${Date.now() - startedAt}ms`);
    return result;
  } finally {
    // Always hand the connection back, even when the query throws.
    conn.release();
  }
}
```
**Architecture Decision:** Do not shard prematurely. Sharding introduces distributed transaction complexity, cross-shard queries, and rebalancing overhead. Start with read replicas, query optimization, and connection pooling. Shard only when a single table exceeds 2–5 TB or write throughput saturates a single primary node. Use logical partitioning (e.g., tenant_id, region) when sharding becomes necessary.
### Step 3: Multi-Layer Caching with Stampede Prevention
Caching reduces database load but introduces consistency and concurrency risks. Implement a cache-aside pattern with TTL, cache locking, and request coalescing to prevent thundering herds.
// cache-layer.ts
import Redis from 'ioredis';
import { EventEmitter } from 'events';
// Redis client used by this module for cache reads and writes.
const redis = new Redis('redis://cache.internal:6379');
// One emitter per cache key that currently has a fetch in progress; callers
// that arrive while a key is present here wait on that emitter instead of
// starting their own fetch.
const emitters = new Map<string, EventEmitter>();
/**
 * Cache-aside read with in-process request coalescing.
 *
 * Returns the cached value for `key` when Redis has one. On a miss, the first
 * caller runs `fetchFn`, stores the JSON-serialized result in Redis with an
 * expiry of `ttl` seconds, and returns it. Callers that arrive for the same
 * key while that fetch is running do not call `fetchFn`; they wait for the
 * first caller's outcome.
 *
 * Coalescing is per process only: it relies on the module-level `emitters`
 * map, so separate processes can still fetch the same key concurrently.
 *
 * @param key - Redis key to read and populate.
 * @param ttl - Expiry in seconds for the stored value (Redis `EX`).
 * @param fetchFn - Produces the value on a cache miss.
 * @returns The cached or freshly fetched value.
 * @throws Rethrows any error from `fetchFn` or from the Redis write. The same
 *   error is delivered to every coalesced waiter.
 */
export async function getCachedOrFetch<T>(
  key: string,
  ttl: number,
  fetchFn: () => Promise<T>
): Promise<T> {
  const cached = await redis.get(key);
  if (cached) return JSON.parse(cached);

  // Prevent cache stampede: coalesce concurrent fetches
  const inFlight = emitters.get(key);
  if (inFlight) {
    return new Promise<T>((resolve, reject) => {
      inFlight.once('result', resolve);
      // Without a failure path, waiters would hang forever if the leader's
      // fetch rejected.
      inFlight.once('failure', reject);
    });
  }

  const emitter = new EventEmitter();
  // A hot key can attract far more than the default 10 listeners; lift the
  // limit so Node does not emit MaxListenersExceededWarning under load.
  emitter.setMaxListeners(0);
  emitters.set(key, emitter);

  try {
    const data = await fetchFn();
    await redis.set(key, JSON.stringify(data), 'EX', ttl);
    emitter.emit('result', data);
    return data;
  } catch (err: unknown) {
    // Use a custom event name: emitting 'error' on an EventEmitter with no
    // listeners would itself throw.
    emitter.emit('failure', err);
    throw err;
  } finally {
    emitters.delete(key);
  }
}
Architecture Decision: Cache hot reads, not writes. Use CDN for static assets, application-level cache for computed results, and Redis for session/data caching. Always set explicit TTLs. Implement cache invalidation via event pub/sub rather than manual deletion to maintain consistency across distributed instances.
### Step 4: Observability and Metric-Driven Auto-Scaling
Auto-scaling based on CPU/memory is insufficient for modern applications. Scale based on business and latency metrics: request queue depth, database connection utilization, and p95 latency. Integrate OpenTelemetry for distributed tracing and Prometheus for metric aggregation.
# autoscaling-policy.yaml (Kubernetes HPA example)
# Scales the api-service Deployment between 3 and 50 replicas using a
# per-pod latency metric and average CPU utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api-deployment
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api-service
  minReplicas: 3
  maxReplicas: 50
  metrics:
    - type: Pods
      pods:
        metric:
          name: http_request_duration_p95
        target:
          type: AverageValue
          # Kubernetes quantities have no "ms" suffix; "250m" means 0.250.
          # NOTE(review): assumes the metric is exported in seconds, so this is
          # 250 milliseconds. If it is exported in milliseconds use "250".
          averageValue: "250m"
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 65
Architecture Decision: Never auto-scale below 3 replicas. Two replicas create split-brain risks during rolling updates. Set cooldown periods (300s minimum) to prevent thrashing. Correlate scaling events with deployment timestamps to distinguish traffic spikes from code regressions.
Pitfall Guide
-
Premature Database Sharding
Sharding before reaching connection or storage limits introduces cross-shard joins, distributed transactions, and operational complexity. Wait until a single primary node hits 80% sustained write utilization or table size exceeds 3TB. Use read replicas and query optimization first.
-
Unbounded Connection Pools
Default connection pool settings often allow hundreds of connections per instance. At 1M users, 10 instances × 100 connections = 1,000 concurrent DB connections. Most databases throttle or crash beyond 500–800 active connections. Cap pools, monitor active/idle ratios, and implement queueing when pools are exhausted.
-
Synchronous Service Chains
Service A calls Service B calls Service C. A 200ms latency in C becomes 600ms in A. At scale, timeout propagation causes cascading failures. Replace synchronous calls with events or async polling. Implement circuit breakers with fallback responses for non-critical paths.
-
Cache Stampedes and Thundering Herds
When a popular cache key expires, thousands of requests hit the database simultaneously. Use request coalescing (as shown in the cache-layer example), lock-based fetching, or probabilistic early expiration to stagger refreshes.
-
Missing Idempotency Keys
Retries, network partitions, and broker redeliveries cause duplicate processing. Without idempotency keys, users get charged twice, notifications duplicate, and state corrupts. Generate UUIDs at the edge, store processed keys in Redis with TTL, and skip duplicates before business logic execution.
-
Skipping Progressive Load Testing
Testing at 10k requests/sec and deploying to production guarantees failure at 1M users. Implement synthetic load testing at 2x expected peak. Test failure modes: kill replicas, simulate broker downtime, inject network latency. Validate degradation paths, not just happy paths.
-
Treating Consistency as Absolute
Strong consistency across distributed services kills throughput. Accept eventual consistency for non-critical data (analytics, notifications, feed generation). Use sagas or outbox patterns for transactional consistency where required. Document consistency guarantees per data domain.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Read-heavy workload (>80% reads) | Read replicas + Redis cache + CDN | Offloads primary DB, absorbs repeated queries, reduces egress costs | -40% infra cost |
| Write-heavy workload (>60% writes) | Event-driven pipeline + batch inserts + connection pooling | Decouples write latency, reduces DB round-trips, prevents connection exhaustion | +15% broker cost, -30% DB cost |
| Multi-tenant SaaS | Logical partitioning + tenant-aware routing + row-level security | Isolates data, simplifies compliance, avoids cross-tenant query overhead | +20% storage, -50% support incidents |
| Real-time features (chat, notifications) | WebSocket gateway + Redis Pub/Sub + fallback to polling | Low latency delivery, scales independently from REST API | +10% memory, +25% network |
Configuration Template
// scaling-config.ts
/**
 * Central tuning values for database pools, caching, the event broker,
 * observability and rate limiting.
 *
 * Host names and URLs are read from `process.env` once, when this module is
 * first evaluated; every other value is a literal.
 */
export const ScalingConfig = (() => {
  // Both pools share the same idle timeout; only host and size differ.
  const poolSettings = (host: string | undefined, max: number) => ({
    host,
    max,
    idleTimeout: 30000,
  });

  const database = {
    primary: poolSettings(process.env.DB_PRIMARY_HOST, 20),
    replica: poolSettings(process.env.DB_REPLICA_HOST, 50),
    connectionTimeout: 2000,
    queryTimeout: 5000,
  };

  const cache = {
    redis: { url: process.env.REDIS_URL, maxRetriesPerRequest: 3, enableReadyCheck: false },
    ttl: { hot: 300, warm: 1800, cold: 3600 },
    stampedePrevention: true,
  };

  const eventBroker = {
    url: process.env.NATS_URL,
    jetStream: { maxMsgs: 1000000, maxBytes: '10GB', storage: 'file' },
    consumerGroup: 'api-workers',
    ackWait: 30,
    maxDeliver: 5,
  };

  const observability = {
    otel: { serviceName: 'api-service', exportInterval: 5000, samplingRate: 0.1 },
    metrics: { p95LatencyThreshold: 250, cpuTarget: 65, cooldownSeconds: 300 },
  };

  const rateLimiting = {
    windowMs: 60000,
    maxRequests: 120,
    skipIpHeader: 'x-forwarded-for',
    // Key on the user id when one is present and truthy, otherwise on the
    // request IP.
    keyGenerator: (req: any) => req.user?.id || req.ip,
  };

  return { database, cache, eventBroker, observability, rateLimiting };
})();
Quick Start Guide
- Deploy the event broker and cache layer: Run `docker compose up -d nats redis` to spin up NATS JetStream and Redis. Verify connectivity with `nats server check jetstream` and `redis-cli ping`.
- Configure database routing: Point your application to primary and replica hosts. Apply the `db-router.ts` pattern and enforce `isWrite` flags on all mutations.
- Enable observability and auto-scaling: Instrument OpenTelemetry, export metrics to Prometheus, and apply the HPA YAML to your Kubernetes cluster. Validate scaling triggers with a synthetic load test.
- Run progressive load validation: Use `k6` or `autocannon` to simulate 2x expected peak traffic. Monitor p95 latency, connection pool utilization, and event consumer lag. Adjust pool sizes and TTLs based on observed bottlenecks before production rollout.