reateHash } from 'crypto';
/** Static definition of a single experiment, as passed to the assignment service constructor. */
interface ExperimentConfig {
/** Experiment identifier; the assignment service indexes its configs by this value. */
id: string;
/** Names of the variants a user can be assigned to. */
variants: string[];
/** Fraction of users admitted into the experiment; compared against a hash value normalized to the 0..1 range. */
trafficAllocation: number; // 0.0 to 1.0
/** String prefixed to the user id and experiment id when building the bucketing hash input. */
salt: string;
}
/**
 * Deterministic, server-side experiment assignment.
 *
 * A SHA-256 digest of `salt:userId:experimentId` supplies two independent
 * 32-bit values: the first decides whether the user is admitted to the
 * experiment (traffic gating), the second picks the variant. Using separate
 * values matters: if the gated value were reused for variant selection, every
 * admitted user would fall in the low end of the range and the later variants
 * would receive little or no traffic whenever `trafficAllocation < 1`.
 */
export class ExperimentAssignmentService {
  /** Number of distinct values in a 32-bit hash slice; dividing by it maps a slice to [0, 1). */
  private static readonly HASH_SPACE = 0x100000000;

  private readonly experiments: Map<string, ExperimentConfig>;

  /**
   * @param configs Experiment definitions; when two share an id, the later one wins.
   */
  constructor(configs: ExperimentConfig[]) {
    this.experiments = new Map(
      configs.map((config): [string, ExperimentConfig] => [config.id, config])
    );
  }

  /**
   * Resolves the variant for a user in an experiment.
   *
   * @param userId Stable identifier of the user being bucketed.
   * @param experimentId Identifier of a configured experiment.
   * @returns The variant name, or `null` when the experiment is unknown, has no
   *   variants, or the user falls outside the allocated traffic.
   */
  assign(userId: string, experimentId: string): string | null {
    const config = this.experiments.get(experimentId);
    if (!config || config.variants.length === 0) return null;

    const digest = createHash('sha256')
      .update(`${config.salt}:${userId}:${experimentId}`)
      .digest('hex');

    // Traffic gating: first 32 bits, normalized to [0, 1). The negated `<`
    // excludes everyone for an allocation of 0 (or NaN) and admits everyone
    // for an allocation of 1.
    const gate = parseInt(digest.slice(0, 8), 16) / ExperimentAssignmentService.HASH_SPACE;
    if (!(gate < config.trafficAllocation)) return null;

    // Variant selection: next 32 bits, independent of the gating value so the
    // admitted users are spread uniformly over all variants.
    const pick = parseInt(digest.slice(8, 16), 16) / ExperimentAssignmentService.HASH_SPACE;
    const variantIndex = Math.floor(pick * config.variants.length);
    return config.variants[variantIndex] ?? null;
  }
}
**Architecture rationale:** Server-side assignment eliminates client-side race conditions and ensures backend services receive consistent variant metadata. The SHA-256 hash provides a uniform distribution and is stable across processes and restarts, so the same user always lands in the same bucket. Traffic gating happens before variant selection, so the set of users admitted to the experiment does not change when variants are added or removed (the variant an admitted user receives can still change).
### Step 2: Idempotent Event Tracking Middleware
Events must carry experiment context, survive retries, and avoid double-counting. A middleware layer attaches metadata and batches payloads for downstream ingestion.
```typescript
import { Request, Response, NextFunction } from 'express';
/** One metric observation, tagged with the experiment context it was emitted under. */
interface TrackedEvent {
/** Unique id generated for each emitted event; intended as the deduplication key at ingestion. */
eventId: string;
/** User id taken from the request's tracking context. */
userId: string;
/** Emission time as returned by `Date.now()` (milliseconds since the Unix epoch). */
timestamp: number;
/** Experiment id taken from the request's tracking context. */
experimentId: string;
/** Variant name taken from the request's tracking context. */
variant: string;
/** Name of the metric being recorded. */
metric: string;
/** Numeric value of the observation; the emitter defaults it to 1. */
value: number;
}
/** Experiment context that `track` attaches to a request and `emit` reads back. */
type TrackingContext = Pick<TrackedEvent, 'userId' | 'experimentId' | 'variant'>;

/**
 * Express middleware plus in-memory batcher for experiment events.
 *
 * `track` copies the experiment headers onto the request, `emit` buffers one
 * event per call, and the buffer is flushed either when it reaches
 * `BATCH_SIZE` or every `FLUSH_INTERVAL_MS`. A failed flush never rejects:
 * the events are put back at the front of the buffer and retried on the next
 * timer tick.
 */
export class EventTrackingMiddleware {
  private batch: TrackedEvent[] = [];
  private readonly BATCH_SIZE = 100;
  private readonly FLUSH_INTERVAL_MS = 5000;
  /** Upper bound on buffered events while the pipeline is failing; oldest events are dropped beyond it. */
  private readonly MAX_BUFFERED_EVENTS = 10000;
  private readonly flushTimer: ReturnType<typeof setInterval>;
  /** Set after a failed flush so size-triggered flushes do not hammer a failing pipeline. */
  private lastFlushFailed = false;

  constructor() {
    this.flushTimer = setInterval(() => void this.flush(), this.FLUSH_INTERVAL_MS);
    // The periodic flush alone should not keep the process alive.
    this.flushTimer.unref();
  }

  /**
   * Express middleware: attaches `{ userId, experimentId, variant }` to the
   * request when all three `x-user-id`, `x-experiment-id` and `x-variant`
   * headers are present, then always calls `next()`.
   *
   * Does not use `this`, so it can be passed to `app.use` unbound.
   */
  track(req: Request, res: Response, next: NextFunction) {
    const userId = EventTrackingMiddleware.headerValue(req.headers['x-user-id']);
    const experimentId = EventTrackingMiddleware.headerValue(req.headers['x-experiment-id']);
    const variant = EventTrackingMiddleware.headerValue(req.headers['x-variant']);

    if (userId && experimentId && variant) {
      (req as Request & { trackingContext?: TrackingContext }).trackingContext = {
        userId,
        experimentId,
        variant,
      };
    }
    next();
  }

  /**
   * Buffers one event for the experiment context attached to `req`.
   * Requests without a tracking context are ignored.
   *
   * @param req Request previously passed through `track`.
   * @param metric Metric name to record.
   * @param value Metric value; defaults to 1.
   */
  emit(req: Request, metric: string, value: number = 1) {
    const ctx = (req as Request & { trackingContext?: TrackingContext }).trackingContext;
    if (!ctx) return;

    this.batch.push({
      eventId: crypto.randomUUID(),
      userId: ctx.userId,
      timestamp: Date.now(),
      experimentId: ctx.experimentId,
      variant: ctx.variant,
      metric,
      value,
    });

    // After a failure, leave retries to the timer instead of retrying on every emit.
    if (this.batch.length >= this.BATCH_SIZE && !this.lastFlushFailed) {
      void this.flush();
    }
  }

  /** Stops the periodic flush and makes one final attempt to deliver buffered events. */
  async stop(): Promise<void> {
    clearInterval(this.flushTimer);
    await this.flush();
  }

  /**
   * Sends the buffered events downstream. Never rejects: on failure the events
   * are re-queued ahead of anything emitted in the meantime.
   */
  private async flush(): Promise<void> {
    if (this.batch.length === 0) return;

    const payload = this.batch;
    this.batch = [];

    try {
      // Send to Kafka/EventBridge/Custom Pipeline
      await this.ingest(payload);
      this.lastFlushFailed = false;
    } catch (err: unknown) {
      this.lastFlushFailed = true;
      console.error(`Failed to ingest ${payload.length} events; re-queueing`, err);

      const pending = payload.concat(this.batch);
      const overflow = pending.length - this.MAX_BUFFERED_EVENTS;
      if (overflow > 0) {
        console.error(`Event buffer full; dropping ${overflow} oldest events`);
      }
      this.batch = overflow > 0 ? pending.slice(overflow) : pending;
    }
  }

  private async ingest(events: TrackedEvent[]) {
    // Idempotent ingestion logic with deduplication key = eventId
    console.log(`Ingesting ${events.length} events`);
  }

  /** Returns the first value of a header that may arrive as a string or an array of strings. */
  private static headerValue(raw: string | string[] | undefined): string | undefined {
    return Array.isArray(raw) ? raw[0] : raw;
  }
}
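```
**Architecture rationale:** Batching reduces I/O overhead and network round trips. Each event carries a unique `eventId` so the ingestion layer can deduplicate batches that are re-delivered after a pipeline retry; because the ID is generated at emit time, a client that retries the same request still produces a new event, so propagate a client-supplied idempotency key if those retries must also be collapsed. Context attachment via headers ensures backend services can emit events without coupling to the tracking layer.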
### Step 3: Statistical Evaluation Engine
Production evaluation requires correction for multiple comparisons, sequential testing controls, and robust variance estimation. The following implements a Frequentist approach with Bonferroni correction and a sequential testing guard.
/** Aggregated observations of one metric for one variant, as consumed by the evaluator. */
interface MetricResult {
/** Variant name; the evaluator treats the first entry of its input array as the control. */
variant: string;
/** Mean of the metric for this variant. */
mean: number;
/** Variance of the metric; the evaluator divides it by `count` to obtain the variance of the mean. Presumably the sample variance — confirm with the producer of these aggregates. */
variance: number;
/** Number of observations behind `mean` and `variance`. */
count: number;
}
/**
 * Frequentist evaluation of experiment variants against a control.
 *
 * Each non-control variant is compared with the control (the first entry of
 * `results`) using Welch's unequal-variance t-test. The significance level is
 * Bonferroni-corrected for both the planned number of peeks and the number of
 * variant-vs-control comparisons.
 */
export class ExperimentEvaluator {
  /**
   * @param results Per-variant aggregates; `results[0]` is the control.
   * @param alpha Overall two-sided significance level.
   * @param maxPeeks Number of planned evaluations of this experiment.
   * @param direction Whether a higher (`'increase'`) or lower (`'decrease'`)
   *   mean counts as an improvement over the control.
   * @returns `pValue` is the smallest p-value over all comparisons and
   *   `significant` tells whether it beats the corrected alpha (in either
   *   direction). `winner` is the variant that improves on the control with
   *   the smallest p-value, provided that p-value is significant; a variant
   *   that is significantly worse than the control is never a winner.
   * @throws Error when fewer than two variants are supplied or `maxPeeks < 1`.
   */
  evaluate(
    results: MetricResult[],
    alpha: number = 0.05,
    maxPeeks: number = 5,
    direction: 'increase' | 'decrease' = 'increase'
  ): { winner: string | null; pValue: number; significant: boolean } {
    if (results.length < 2) throw new Error('Minimum 2 variants required');
    if (!(maxPeeks >= 1)) throw new Error('maxPeeks must be at least 1');

    const [control, ...treatments] = results;
    if (control === undefined) throw new Error('Minimum 2 variants required');

    // Bonferroni correction over every test this experiment can run:
    // one per peek, per variant-vs-control comparison.
    const correctedAlpha = alpha / (maxPeeks * treatments.length);

    let minPValue = 1;
    let winner: string | null = null;
    let winnerPValue = 1;

    for (const treatment of treatments) {
      const pValue = this.welchPValue(control, treatment);
      // Comparisons without a usable standard error carry no evidence.
      if (pValue === null) continue;

      if (pValue < minPValue) minPValue = pValue;

      const delta = treatment.mean - control.mean;
      const improves = direction === 'increase' ? delta > 0 : delta < 0;
      if (improves && pValue < winnerPValue) {
        winnerPValue = pValue;
        winner = treatment.variant;
      }
    }

    return {
      winner: winnerPValue < correctedAlpha ? winner : null,
      pValue: minPValue,
      significant: minPValue < correctedAlpha
    };
  }

  /**
   * Two-sided p-value of Welch's t-test between two variants, or `null` when
   * the test is undefined (fewer than two observations, or a standard error
   * that is zero, negative or not finite).
   */
  private welchPValue(control: MetricResult, treatment: MetricResult): number | null {
    if (!(control.count >= 2) || !(treatment.count >= 2)) return null;

    const controlTerm = control.variance / control.count;
    const treatmentTerm = treatment.variance / treatment.count;
    const standardError = Math.sqrt(controlTerm + treatmentTerm);
    if (!(standardError > 0) || !Number.isFinite(standardError)) return null;

    const tStat = (treatment.mean - control.mean) / standardError;

    // Welch–Satterthwaite degrees of freedom, matching the unpooled standard error.
    const df =
      (controlTerm + treatmentTerm) ** 2 /
      (controlTerm ** 2 / (control.count - 1) + treatmentTerm ** 2 / (treatment.count - 1));

    const pValue = this.tToPValue(tStat, df);
    return Number.isFinite(pValue) ? pValue : null;
  }

  /**
   * Two-sided p-value for a t statistic with `df` degrees of freedom.
   *
   * Maps t to an approximately standard-normal deviate
   * (Abramowitz & Stegun 26.7.8), which accounts for the heavier tails at
   * small `df` and converges to the plain normal approximation as `df` grows.
   */
  private tToPValue(t: number, df: number): number {
    const absT = Math.abs(t);
    const z = (absT * (1 - 1 / (4 * df))) / Math.sqrt(1 + (absT * absT) / (2 * df));
    const pValue = 2 * (1 - this.normalCDF(z));
    return Math.min(1, Math.max(0, pValue));
  }

  /** Standard normal CDF via the Abramowitz & Stegun 7.1.26 approximation of erf. */
  private normalCDF(x: number): number {
    const a1 = 0.254829592;
    const a2 = -0.284496736;
    const a3 = 1.421413741;
    const a4 = -1.453152027;
    const a5 = 1.061405429;
    const p = 0.3275911;
    const sign = x < 0 ? -1 : 1;
    // erf is evaluated at |x| / sqrt(2); the sign is re-applied below.
    x = Math.abs(x) / Math.sqrt(2);
    const t = 1.0 / (1.0 + p * x);
    const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
    return 0.5 * (1.0 + sign * y);
  }
}
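**Architecture rationale:** Sequential testing without correction invalidates p-values. The `maxPeeks` parameter enforces a predefined evaluation schedule. Dividing α by the number of planned peeks is a Bonferroni-style bound: conservative but production-safe. For higher sensitivity, replace it with an alpha-spending function (e.g. O'Brien–Fleming) for repeated looks, and use Benjamini–Hochberg FDR control when the multiplicity comes from many metrics or variants rather than from peeking. The engine outputs decision-ready signals, not raw statistics, preventing product teams from misinterpreting intermediate data.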
Pitfall Guide
- **Peeking & Early Stopping**
  - Root cause: Evaluating results before reaching the planned sample size violates the fixed-sample assumption of Frequentist tests. Each additional unplanned look inflates the cumulative false-positive rate above the nominal α.
  - Mitigation: Predefine evaluation checkpoints. Use sequential testing methods (SPRT, Bayesian with stopping boundaries) or enforce a strict peek limit with alpha spending functions.
- **Sample Ratio Mismatch (SRM)**
  - Root cause: Assignment logic, tracking gaps, or client-side filtering cause observed variant ratios to diverge from expected allocation.
  - Mitigation: Run SRM tests before evaluating metrics. If the χ² test fails (p < 0.01), halt evaluation. Audit assignment service logs, SDK initialization order, and ad-blocker interference.
- **Testing Too Many Variants Simultaneously**
  - Root cause: Splitting traffic across 4+ variants dilutes power per variant, so the total sample size required grows with every added variant, and grows further once multiple-comparison corrections are applied.
  - Mitigation: Limit to 2–3 variants per experiment. Use multi-armed bandit algorithms only for exploration-exploitation tradeoffs, not hypothesis testing. Run sequential experiments instead.
- **Ignoring Network Effects & Interference**
  - Root cause: User interactions violate the Stable Unit Treatment Value Assumption (SUTVA). Social features, marketplaces, or shared resources cause contamination between variants.
  - Mitigation: Cluster randomization at account/tenant level. Use graph-based assignment or switchback testing for time-series interference. Model spillover effects in evaluation.
- **Undefined Primary Metric & Post-Hoc Hypothesis Generation**
  - Root cause: Teams measure everything, then cherry-pick significant metrics. With enough metrics, some will appear significant by chance alone.
  - Mitigation: Define one primary metric, secondary metrics, and guardrail metrics before launch. Pre-register analysis plans. Treat post-hoc findings as hypotheses for future tests, not conclusions.
- **Overlooking Segment Heterogeneity**
  - Root cause: Aggregated results mask divergent behavior across user cohorts, devices, or geographies.
  - Mitigation: Predefine segment splits. Use stratified randomization or post-stratification weighting. Avoid data-dredging segments after test completion.
- **Inadequate Infrastructure for Rollback**
  - Root cause: Winning variants are deployed permanently without guardrails, causing regression when underlying assumptions shift.
  - Mitigation: Tie experiment state to deployment pipelines. Implement automatic rollback on guardrail metric degradation. Maintain variant toggle APIs for immediate reversion.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Frontend UI layout change | Client-side SDK with server fallback | Low latency, immediate feedback, easy A/B toggle | Low engineering, moderate data pipeline |
| Backend ranking algorithm | Server-side assignment + cluster randomization | Prevents interference, ensures consistent state across services | High engineering, high data pipeline |
| Pricing / monetization experiment | Tenant-level assignment with guardrail monitoring | Avoids cross-user contamination, protects revenue stability | Medium engineering, high monitoring overhead |
| Infrastructure migration (DB/cache) | Canary deployment with metric diffing | Not a user-facing hypothesis; requires gradual traffic shift | Low experimentation overhead, high infra cost |
| Multi-variant feature exploration | Multi-armed bandit with exploration decay | Optimizes for learning speed over statistical rigor | Medium engineering, low statistical overhead |
Configuration Template
{
"experimentId": "exp-checkout-flow-v3",
"name": "Checkout Flow Optimization",
"status": "running",
"trafficAllocation": 0.25,
"variants": ["control", "variant-a", "variant-b"],
"assignment": {
"strategy": "consistent_hash",
"hashKey": "userId",
"salt": "prod-experiment-2024-q3"
},
"metrics": {
"primary": {
"name": "checkout_completion_rate",
"type": "conversion",
"direction": "increase"
},
"secondary": [
{"name": "avg_session_duration", "type": "continuous"},
{"name": "cart_abandonment_rate", "type": "conversion"}
],
"guardrails": [
{"name": "error_rate", "threshold": 0.02, "action": "rollback"}
]
},
"evaluation": {
"method": "frequentist_t_test",
"alpha": 0.05,
"correction": "bonferroni",
"maxPeeks": 5,
"minSampleSize": 15000,
"schedule": "daily_0900_utc"
},
"targeting": {
"include": ["region:US", "accountType:free"],
"exclude": ["internal:true", "bot:true"]
},
"rollback": {
"enabled": true,
"conditions": ["guardrail_breach", "srm_detected"],
"autoDeploy": false
}
}
Quick Start Guide
- **Initialize Assignment Service:** Instantiate `ExperimentAssignmentService` with your experiment configs. Mount it as middleware to attach `x-variant` headers to authenticated requests.
- **Wire Tracking Middleware:** Deploy `EventTrackingMiddleware`. Replace the `ingest` stub with your event pipeline (Kafka, Kinesis, or custom HTTP endpoint). Ensure deduplication on `eventId`.
- **Configure Evaluation Schedule:** Set up a cron job or workflow orchestrator to pull aggregated metrics from your warehouse, run `ExperimentEvaluator.evaluate()`, and publish results to your dashboard. Enforce `maxPeeks` in your CI/CD pipeline.
- Validate Before Launch: Run a 24-hour dry run with 1% traffic. Verify SRM passes (χ² p > 0.05), events contain correct variant metadata, and guardrail metrics remain stable. Scale traffic to target allocation only after validation succeeds.