lowing implementation demonstrates a production-grade assignment service with SRM detection and sticky session management.
import crypto from 'crypto';
/**
 * Label of an experiment arm, as returned by `ExperimentService.getVariant`
 * and accepted by `ExperimentService.setOverride`.
 *
 * `'control'` doubles as the fallback returned for unknown experiments.
 * NOTE(review): `'override'` is not part of the default config's variant
 * list — presumably reserved for manually forced assignments; confirm.
 */
export type Variant = 'control' | 'treatment' | 'override';
/**
 * Static definition of one experiment as stored by `ExperimentService`.
 *
 * `variants` and `weights` are parallel arrays: `weights[i]` is the traffic
 * share of `variants[i]`.
 */
export type ExperimentConfig = {
// Experiment identifier; the default config registers it under the same key.
id: string;
// Arms of the experiment, in the order used for cumulative bucketing.
variants: Variant[];
// Traffic share per arm on a 0-100 scale (the service divides by 100).
weights: number[]; // e.g., [50, 50] for 50/50 split
// Not read by the service code in this file — presumably consumed by the
// analysis pipeline; confirm against callers.
requiredSampleSize: number;
// checkSRM reports the experiment healthy only while chi-square stays
// strictly below this value.
srmThreshold: number; // Chi-square threshold for SRM alert
};
/**
 * In-memory experiment assignment service.
 *
 * Provides deterministic, hash-based variant assignment (`getVariant`),
 * per-user manual overrides (`setOverride`) and a chi-square Sample Ratio
 * Mismatch check (`checkSRM`). Experiment configs are held in a private map
 * that is populated by `loadDefaultConfigs()` at construction time.
 */
export class ExperimentService {
  private readonly configs: Map<string, ExperimentConfig> = new Map();
  private readonly userOverrides: Map<string, Map<string, Variant>> = new Map();

  constructor() {
    // In production, load configs from a remote config service
    this.loadDefaultConfigs();
  }

  /**
   * Deterministically assigns a user to a variant of an experiment.
   *
   * The string `${experimentId}:${userId}` is hashed (see `murmur3`) and
   * reduced to a bucket in [0, 9999]; the bucket is then matched against the
   * cumulative variant weights. Hashing the combination of experiment ID and
   * user ID ensures:
   * 1. Consistency: the same user always gets the same variant.
   * 2. Orthogonality: assignment in one experiment is independent of others.
   *
   * @param userId Stable user (or anonymous) identifier.
   * @param experimentId Key of a registered experiment config.
   * @returns The override set via `setOverride` if one exists for a known
   *   experiment; otherwise the hashed assignment. Unknown experiments return
   *   `'control'`. If the bucket falls outside the configured weights (e.g.
   *   weights sum to less than 100) the first variant is returned.
   */
  getVariant(userId: string, experimentId: string): Variant {
    const config = this.configs.get(experimentId);
    if (!config) return 'control';

    // Admin/developer overrides win over hashed assignment.
    const override = this.userOverrides.get(userId)?.get(experimentId);
    if (override !== undefined) return override;

    // The signed 32-bit hash is folded with Math.abs before the modulo. This
    // exact arithmetic must stay stable: changing it would reshuffle users
    // that are already assigned.
    const bucket = Math.abs(this.murmur3(`${experimentId}:${userId}`)) % 10000;

    let cumulativeWeight = 0;
    for (const [index, variant] of config.variants.entries()) {
      // Weights are on a 0-100 scale; a missing weight counts as 0.
      cumulativeWeight += (config.weights[index] ?? 0) * 100;
      if (bucket < cumulativeWeight) return variant;
    }

    // Fallback when no cumulative range covers the bucket.
    return config.variants[0] ?? 'control';
  }

  /**
   * Sample Ratio Mismatch detection via a chi-square goodness-of-fit test.
   *
   * SRM indicates a flaw in the assignment logic, caching, or traffic routing;
   * running analysis with SRM invalidates statistical results.
   *
   * Only the variants configured for the experiment take part in the test:
   * counts reported under any other label (e.g. manual overrides) are ignored
   * so they cannot inflate the expected counts. A variant with an expected
   * count of 0 and no observed traffic contributes nothing to the statistic;
   * observed traffic on a zero-weight variant yields `Infinity` (unhealthy).
   *
   * @param experimentId Key of a registered experiment config.
   * @param counts Observed number of users per variant label.
   * @returns `chiSquare` and whether it is strictly below the configured
   *   `srmThreshold`. Unknown experiments are reported unhealthy with a
   *   statistic of 0; samples smaller than 100 are reported healthy with a
   *   statistic of 0 because the test is not meaningful yet.
   */
  checkSRM(
    experimentId: string,
    counts: Record<string, number>
  ): { isHealthy: boolean; chiSquare: number } {
    const config = this.configs.get(experimentId);
    if (!config) return { isHealthy: false, chiSquare: 0 };

    const observedPerVariant = config.variants.map((variant) => counts[variant] ?? 0);
    const totalObserved = observedPerVariant.reduce((sum, n) => sum + n, 0);
    if (totalObserved < 100) return { isHealthy: true, chiSquare: 0 };

    let chiSquare = 0;
    for (const [index, observed] of observedPerVariant.entries()) {
      const expected = ((config.weights[index] ?? 0) / 100) * totalObserved;
      // 0/0 would poison the statistic with NaN; nothing expected and nothing
      // observed is a perfect match for this cell.
      if (expected === 0 && observed === 0) continue;
      chiSquare += (observed - expected) ** 2 / expected;
    }

    // The default threshold 10.83 is the critical value for df=1 (two
    // variants) at p=0.001; experiments with more variants need a threshold
    // matching df = variants - 1.
    return { isHealthy: chiSquare < config.srmThreshold, chiSquare };
  }

  /**
   * Forces `getVariant` to return `variant` for this user and experiment.
   * A later call for the same pair replaces the earlier override.
   *
   * @param userId User the override applies to.
   * @param experimentId Experiment the override applies to.
   * @param variant Variant to return instead of the hashed assignment.
   */
  setOverride(userId: string, experimentId: string, variant: Variant): void {
    let overrides = this.userOverrides.get(userId);
    if (!overrides) {
      overrides = new Map<string, Variant>();
      this.userOverrides.set(userId, overrides);
    }
    overrides.set(experimentId, variant);
  }

  /**
   * Hashes `key` to a signed 32-bit integer.
   *
   * NOTE: despite its name this is not MurmurHash3. It computes the MD5 digest
   * of the UTF-8 bytes of `key` and reads the first four bytes as a
   * little-endian signed 32-bit integer. In production, a dedicated library
   * such as 'murmurhash-js' could be used instead, but swapping the hash
   * changes every user's bucket.
   */
  private murmur3(key: string): number {
    const buffer = Buffer.from(key, 'utf8');
    const hash = crypto.createHash('md5').update(buffer).digest();
    return hash.readInt32LE(0);
  }

  /** Registers the built-in `checkout_flow_v2` experiment (50/50 split). */
  private loadDefaultConfigs() {
    this.configs.set('checkout_flow_v2', {
      id: 'checkout_flow_v2',
      variants: ['control', 'treatment'],
      weights: [50, 50],
      requiredSampleSize: 15000,
      srmThreshold: 10.83,
    });
  }
}
Architecture Decisions
- Hash-Based vs. Random Assignment: Random assignment fails to maintain consistency for returning users. Hash-based assignment using a stable identifier (User ID or Anonymous ID) ensures the user experience remains coherent. The hash input must include the experiment ID to prevent correlation between experiments.
- Server-Side vs. Client-Side: Critical metrics and revenue-impacting experiments should use server-side assignment to prevent flickering and ensure accurate tracking. Client-side assignment is acceptable for UI/UX tests where latency is paramount, provided the assignment is cached and the variant is passed to the backend for event enrichment.
- Event Enrichment: All analytics events must include the experiment ID and assigned variant. This allows analysis to be performed post-hoc without coupling the tracking library to the assignment logic.
- SRM as a Gate: The architecture must treat SRM as a hard gate. If SRM is detected, the experiment should be paused, and engineering must investigate assignment logic before resuming.
Pitfall Guide
1. Peeking and Multiple Comparisons
Mistake: Checking p-values daily and stopping the test as soon as p < 0.05.
Impact: This inflates the false positive rate. With daily checks, the probability of a false positive exceeds 20% after two weeks.
Best Practice: Use sequential analysis methods like Alpha Spending (e.g., O'Brien-Fleming) or Bayesian inference with Highest Posterior Density intervals. Alternatively, define a fixed horizon and do not look at results until the sample size is reached.
2. Sample Ratio Mismatch (SRM)
Mistake: Ignoring deviations between planned and actual traffic splits.
Impact: SRM indicates that the randomization is broken. Analysis results become biased and unreliable. Common causes include caching the variant assignment, bot traffic filtering differently per variant, or errors in the assignment function.
Best Practice: Implement automated SRM checks using Chi-square tests. Alert engineering immediately if SRM is detected. Do not analyze data with SRM.
3. Novelty and Primacy Effects
Mistake: Attributing short-term metric lifts to the feature itself without accounting for user reaction to change.
Impact: A new design may show a temporary spike due to curiosity (novelty) or a dip due to confusion (primacy). Shipping based on this data leads to suboptimal long-term performance.
Best Practice: Run tests for a sufficient duration to cover full business cycles (e.g., 2-4 weeks). Analyze trends over time rather than aggregate averages. Segment by user tenure to isolate effects on new vs. returning users.
4. Metric Hacking and Selection
Mistake: Optimizing for vanity metrics or switching primary metrics mid-test based on results.
Impact: P-hacking occurs when teams select the metric that shows significance after the fact. This invalidates the statistical guarantee.
Best Practice: Pre-register the primary metric and analysis plan. Use a hierarchy of metrics: Primary (business goal), Secondary (user engagement), and Guardrail (system health). Never change the primary metric after the test begins.
5. Simpson's Paradox
Mistake: Aggregating results across heterogeneous segments and missing opposing trends.
Impact: An experiment may show a positive overall effect while negatively impacting key segments. For example, a change might improve conversion for desktop users but severely degrade it for mobile users, with the aggregate appearing positive due to traffic volume imbalance.
Best Practice: Pre-define key segments for analysis. Use stratified sampling or post-stratification to ensure balanced representation. Always review segment-level results before shipping.
6. Underpowered Tests
Mistake: Running tests with insufficient traffic to detect the expected effect size.
Impact: High false negative rate. Teams conclude a change has no effect when the test simply lacked sensitivity. This leads to discarding valuable improvements.
Best Practice: Calculate sample size using power analysis before launching. If traffic is low, consider increasing the MDE, extending duration, or using metrics with lower variance (e.g., ratio metrics vs. absolute values).
7. Interference and Overlap
Mistake: Running multiple experiments on the same user journey without proper layering.
Impact: Experiments interfere with each other, causing interaction effects that skew results. A user might be in the treatment of Experiment A and treatment of Experiment B, making it impossible to attribute effects to a single change.
Best Practice: Implement experiment layers. Experiments in the same layer are mutually exclusive. Experiments in different layers can run simultaneously. Use orthogonal assignment to ensure independence between layers.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low Traffic (<1k MAU) | Multi-Armed Bandit (MAB) or Quasi-Experiment | MAB allocates traffic dynamically to winners, maximizing reward. Quasi-experiments use historical controls when randomization is impossible. | Low (Maximizes value from limited traffic) |
| High Risk / Revenue Critical | Shadow Testing + Staged Rollout | Run the new logic in parallel without affecting users (shadow) to validate performance and metrics before exposure. | High (Requires infra for parallel execution) |
| Personalization / UX | Client-Side Assignment + Caching | Reduces latency and server load. Caching ensures consistency. | Low |
| Backend / Algorithm Change | Server-Side Assignment | Ensures accurate tracking and prevents flickering. Critical for metrics dependent on server processing. | Medium |
| Rapid Prototyping | Feature Flags with A/B Analysis | Quick toggle capability. Use with caution; ensure proper randomization and tracking are added. | Low |
Configuration Template
Use this JSON template to define experiments in your configuration management system. The template enforces a consistent structure and includes fields for statistical governance.
{
"experiment": {
"id": "exp-checkout-2024-q3",
"name": "Checkout Flow Optimization",
"status": "DRAFT",
"hypothesis": "Reducing form fields will increase conversion by 5% without increasing error rates.",
"metrics": {
"primary": {
"name": "checkout_conversion",
"type": "RATE",
"direction": "UP",
"mde": 0.05,
"power": 0.8,
"alpha": 0.05,
"required_sample_size": 15000
},
"guardrails": [
{ "name": "api_latency_p95", "threshold": 200, "unit": "ms" },
{ "name": "checkout_error_rate", "direction": "UP", "max_delta": 0.01 }
]
},
"assignment": {
"method": "HASH",
"hash_key": "user_id",
"variants": [
{ "id": "control", "weight": 50 },
{ "id": "treatment", "weight": 50 }
],
"layer": "checkout_ui",
"srm_check": {
"enabled": true,
"chi_square_threshold": 10.83
}
},
"segments": [
{ "name": "new_users", "condition": "user_tenure_days < 30" },
{ "name": "mobile", "condition": "device_type == 'mobile'" }
],
"analysis": {
"method": "SEQUENTIAL",
"alpha_spending": "OBRIEN_FLEMING"
}
}
}
Quick Start Guide
- Initialize Experiment Service:
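Import the ExperimentService class and load configurations from your remote config store. Ensure the service is initialized early in the application lifecycle. Note that the reference class above only registers its built-in defaults in the constructor; loadRemoteConfigs() is a method you need to add to fetch experiment configs from your store.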
const expService = new ExperimentService();
await expService.loadRemoteConfigs();
- Assign Variant at Entry Point:
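Call getVariant when the user enters the experiment scope. Store the result in the user context or session. The experiment ID must match a loaded config; for an unknown ID, getVariant silently returns 'control' for every user.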
const variant = expService.getVariant(userId, 'exp-checkout-2024-q3');
req.context.experimentVariant = variant;
- Instrument Events:
Attach experiment metadata to all relevant analytics events.
track('checkout_started', {
experiment_id: 'exp-checkout-2024-q3',
variant: req.context.experimentVariant,
// ... other payload
});
- Monitor Health:
Set up a cron job or pipeline step to run SRM checks periodically.
const counts = await fetchCounts('exp-checkout-2024-q3');
const srmResult = expService.checkSRM('exp-checkout-2024-q3', counts);
if (!srmResult.isHealthy) {
triggerAlert('SRM detected in exp-checkout-2024-q3');
pauseExperiment('exp-checkout-2024-q3');
}
- Analyze and Decide:
After reaching the required sample size, run the pre-registered analysis. Review primary metrics, guardrails, and segments. Make a decision to ship, iterate, or discard based on the data.