eventName: string;
timestamp: number; // ISO string or epoch
properties: Record<string, unknown>;
cohortId: string; // e.g., '2023-Q4', 'enterprise_trial'
}
/**
 * Runtime type guard for incoming retention events.
 *
 * Returns `true` only when `event` is a non-null object whose identifying
 * fields (`userId`, `sessionId`, `eventType`, `eventName`, `cohortId`) are
 * non-empty strings and whose `timestamp` is a finite, positive number.
 * Never throws, so it is safe to call on arbitrary untrusted input.
 *
 * NOTE(review): `properties` is not inspected here (the original check did not
 * inspect it either) — confirm whether ingestion should require it.
 *
 * @param event - Untrusted value, e.g. a parsed request body.
 * @returns Whether `event` can be treated as a `RetentionEvent`.
 */
export const validateEvent = (event: unknown): event is RetentionEvent => {
  // Reading a property off null/undefined throws, and primitives can never be events.
  if (typeof event !== 'object' || event === null) {
    return false;
  }

  const candidate = event as Record<string, unknown>;
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;

  return (
    isNonEmptyString(candidate.userId) &&
    isNonEmptyString(candidate.sessionId) &&
    isNonEmptyString(candidate.eventType) &&
    isNonEmptyString(candidate.eventName) &&
    isNonEmptyString(candidate.cohortId) &&
    // The interface declares `timestamp: number`; 0 and NaN stay rejected,
    // exactly as the previous truthiness check rejected them.
    typeof candidate.timestamp === 'number' &&
    Number.isFinite(candidate.timestamp) &&
    candidate.timestamp > 0
  );
};
**Rationale:** Enforcing a `cohortId` and `sessionId` allows for granular cohort analysis and session-based friction detection. Type safety prevents schema drift, which is a common cause of data quality issues in retention pipelines.
### 2. Behavioral Scoring Engine
Retention risk is not binary. A scoring engine calculates a "Retention Health Score" based on user behavior decay and engagement frequency.
```typescript
// src/retention/scorer.ts
import Redis from 'ioredis';
// Module-level Redis client shared by the snippets below. No connection options are
// passed, so the client uses the library defaults.
// NOTE(review): confirm host/port/auth are configured per environment before production use.
const redis = new Redis();
/**
 * Per-day decay multiplier: the older an event is, the less it contributes
 * to the current score.
 */
const DECAY_FACTOR = 0.95;

/**
 * Base contribution of each known event name to the retention score.
 * Positive values indicate healthy engagement; negative values indicate risk.
 */
const WEIGHTS = {
  feature_use: 1,
  error: -0.5,
  churn_signal: -2,
};
/**
 * Computes the time-decayed "Retention Health Score" for a user.
 *
 * Reads every entry of the sorted set `user:events:${userId}`, treating each
 * member as an event name and each ZSET score as the event's epoch-millisecond
 * timestamp. Every event contributes `weight * DECAY_FACTOR ^ ageInDays`.
 *
 * @param userId - Identifier used to build the Redis key.
 * @returns The summed score; `0` when the user has no stored events.
 */
export const calculateRetentionScore = async (userId: string): Promise<number> => {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  // Weight applied to event names that have no explicit entry in WEIGHTS.
  const DEFAULT_WEIGHT = 0.5;

  // WITHSCORES yields a flat array: [member, score, member, score, ...].
  const entries = await redis.zrange(`user:events:${userId}`, 0, -1, 'WITHSCORES');
  const now = Date.now();
  let score = 0;

  for (let i = 0; i + 1 < entries.length; i += 2) {
    const eventName = entries[i];
    const timestamp = Number.parseFloat(entries[i + 1]);

    // One malformed timestamp would otherwise turn the whole score into NaN.
    if (!Number.isFinite(timestamp)) {
      continue;
    }

    // Own-property lookup only: a member named e.g. "constructor" must not
    // resolve to an inherited Object.prototype value, and an explicit weight
    // of 0 must not be replaced by the default.
    const weight = Object.prototype.hasOwnProperty.call(WEIGHTS, eventName)
      ? WEIGHTS[eventName as keyof typeof WEIGHTS]
      : DEFAULT_WEIGHT;

    // Clamp at 0 so a future-dated event (clock skew) is not amplified by a decay > 1.
    const daysSinceEvent = Math.max(0, (now - timestamp) / MS_PER_DAY);

    // With a 0.95 daily factor, a 30-day-old event retains roughly 21% of its weight.
    score += weight * Math.pow(DECAY_FACTOR, daysSinceEvent);
  }

  return score;
};
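```
**Rationale:** Redis ZSETs allow efficient storage and range retrieval of time-stamped events, with the event timestamp stored as the ZSET score. Note that ZSET members are unique: if the member is only the event name, as in the snippet above, a repeated event overwrites its earlier entry, so production code should make each member unique per occurrence. The decay algorithm ensures that the score reflects recent behavior, making the system sensitive to immediate churn risks rather than historical activity.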
### 3. Real-Time Intervention Trigger
The engine evaluates the score against thresholds and triggers interventions. This must be idempotent and rate-limited to prevent notification fatigue.
// src/retention/trigger.ts
import { calculateRetentionScore } from './scorer';
/**
 * Score cut-offs used to classify a user's retention risk.
 * A score below `AT_RISK` triggers a standard intervention; a score below
 * `CRITICAL` triggers a high-priority one.
 */
const THRESHOLDS = { AT_RISK: 50, CRITICAL: 20 };
/**
 * Scores a user and, when the score falls below a risk threshold, requests
 * the matching intervention. Users at or above `THRESHOLDS.AT_RISK` receive
 * nothing.
 *
 * @param userId - User whose retention score should be evaluated.
 */
export const evaluateRetentionTrigger = async (userId: string) => {
  const healthScore = await calculateRetentionScore(userId);

  // Most severe tier first: a critical score also lies below the at-risk
  // cut-off, but must only receive the high-priority intervention.
  if (healthScore < THRESHOLDS.CRITICAL) {
    const criticalPayload = {
      channel: 'in_app_modal',
      action: 'offer_support',
    };
    await sendIntervention(userId, 'critical', criticalPayload);
    return;
  }

  if (healthScore < THRESHOLDS.AT_RISK) {
    const atRiskPayload = {
      channel: 'push_notification',
      action: 'highlight_value_prop',
    };
    await sendIntervention(userId, 'at_risk', atRiskPayload);
  }
};
/**
 * Dispatches an intervention at most once per user and risk level within a
 * 24-hour window.
 *
 * The cooldown is claimed with a single `SET ... EX ... NX` command, so the
 * check and the write are one atomic step. A separate GET followed by SET
 * would let two concurrent evaluations both pass the check and notify twice.
 *
 * @param userId - Recipient of the intervention.
 * @param riskLevel - Risk tier; part of the rate-limit key, so each tier has its own cooldown.
 * @param payload - Channel/action details forwarded to the notification dispatch.
 */
const sendIntervention = async (
  userId: string,
  riskLevel: string,
  payload: Record<string, string>
) => {
  const COOLDOWN_SECONDS = 86400; // 24h cooldown
  const rateLimitKey = `rate_limit:retention:${userId}:${riskLevel}`;

  // NX: only set when the key does not exist yet. Redis replies 'OK' when the
  // key was written and null when a cooldown is already active.
  const claimed = await redis.set(rateLimitKey, Date.now(), 'EX', COOLDOWN_SECONDS, 'NX');
  if (claimed !== 'OK') return; // Already sent recently

  // Dispatch to notification service
  console.log(`Dispatching ${riskLevel} intervention to ${userId}`, payload);
  // await notificationService.send(userId, payload);
};
**Rationale:** Interventions are gated by rate limiting to preserve user trust. The separation of risk levels allows for tiered responses: critical risk triggers high-friction, high-value interventions (like support offers), while moderate risk triggers lower-friction nudges.
### 4. Architecture Decisions
- Stream Processing vs. Batch: Retention triggers must operate on streams. Batch processing introduces latency that renders interventions irrelevant. Use Kafka or Kinesis for event ingestion and a stream processor (e.g., kSQL or Node.js workers) for scoring.
- Idempotency: Retention triggers are re-evaluated frequently. The intervention system must be idempotent to ensure users receive the correct message only once per risk episode.
- Privacy by Design: Retention data often contains PII. Ensure the scoring engine processes data in a privacy-compliant manner, masking sensitive fields before they enter the analytics pipeline.
Pitfall Guide
1. Event Sprawl and Schema Drift
Mistake: Tracking every click and interaction without a governed schema.
Impact: Data lakes become unusable for retention analysis. Engineers spend 40% of their time debugging event schemas rather than optimizing retention.
Best Practice: Implement a strict event contract. Use code generation tools to derive TypeScript interfaces from a central schema definition. Reject events that do not match the schema at the ingestion edge.
2. Identity Resolution Failures
Mistake: Treating anonymous and authenticated events as separate users.
Impact: Retention scores are fragmented. A user may appear "at-risk" because the system loses track of their pre-signup engagement, leading to irrelevant interventions.
Best Practice: Implement robust identity stitching. Map anonymousId to userId immediately upon authentication and retroactively merge historical events.
3. Notification Fatigue
Mistake: Triggering interventions too frequently or overlapping channels.
Impact: Users mute notifications or uninstall the app. Retention drops due to the intervention itself.
Best Practice: Enforce global rate limits across channels. Implement a "quiet hours" policy and a suppression list for users who have explicitly opted out. Use a unified frequency cap.
4. Vanity Metrics Over Substance
Mistake: Optimizing for Daily Active Users (DAU) instead of Retention Cohorts.
Impact: Teams may drive short-term spikes via clickbait or gamification that do not correlate with long-term value, increasing churn in subsequent cohorts.
Best Practice: Define a "North Star Metric" tied to value delivery (e.g., "Transactions Completed" or "Reports Generated"). Optimize retention interventions to drive this metric, not just logins.
5. Ignoring the "Silent Churn"
Mistake: Focusing only on explicit churn signals (cancellations) and missing behavioral decay.
Impact: By the time a user cancels, the churn is irreversible. The opportunity to intervene was missed weeks ago.
Best Practice: Model behavioral decay patterns. Identify leading indicators of churn (e.g., decrease in session duration, increase in error rates, drop in feature adoption) and trigger interventions based on these signals before cancellation.
6. Correlation vs. Causation Errors
Mistake: Assuming that users who use Feature X have higher retention, therefore promoting Feature X will increase retention.
Impact: Resources are wasted promoting features that are correlated with retention but do not cause it. These users may have been retained regardless of the feature.
Best Practice: Use A/B testing to validate causation. Run controlled experiments where the intervention is applied to a subset of users and measure the delta in retention compared to a control group.
7. Privacy and Compliance Violations
Mistake: Using retention data for targeting without proper consent or data minimization.
Impact: GDPR/CCPA fines and loss of user trust. Retention strategies can backfire if users feel surveilled.
Best Practice: Implement data retention policies that automatically purge raw event data after a defined period. Ensure retention scoring uses aggregated or pseudonymized data where possible. Provide clear user controls over notification preferences.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| High-Value Enterprise Churn Risk | Real-time In-App Modal + CSM Alert | Immediate, high-touch intervention required to save high LTV accounts. | High (Engineering + CSM time), but ROI justified by LTV. |
| Low-Value Free Tier Decay | Automated Email Sequence | Scalable intervention for users with lower immediate value. | Low (Email service cost), minimal engineering overhead. |
| Onboarding Friction Detected | Contextual Help / Walkthrough | Addresses specific usability issues during the critical first session. | Medium (UI development), reduces support tickets. |
| Feature Adoption Drop | Gamification / Incentive | Encourages exploration of underutilized features to drive stickiness. | Medium (Feature dev + incentive cost), requires A/B testing. |
| Global Rate Limit Exceeded | Queue + Backoff Strategy | Prevents system overload and user spam during traffic spikes. | Low (Infrastructure tuning), protects brand reputation. |
Configuration Template
Use this template to configure retention thresholds and intervention channels in a centralized config file.
// retention.config.json
{
"scoring": {
"decayFactor": 0.95,
"weights": {
"feature_use": 1.0,
"error": -0.5,
"churn_signal": -2.0
},
"thresholds": {
"at_risk": 50,
"critical": 20
}
},
"interventions": {
"at_risk": {
"channel": "push_notification",
"cooldown_hours": 24,
"actions": ["highlight_value_prop", "nudge_feature"]
},
"critical": {
"channel": "in_app_modal",
"cooldown_hours": 72,
"actions": ["offer_support", "discount_offer"],
"alert_csm": true
}
},
"rate_limits": {
"global_per_minute": 1000,
"user_per_day": 3,
"quiet_hours": {
"start": "22:00",
"end": "08:00",
"timezone": "UTC"
}
}
}
Quick Start Guide
- **Initialize the Engine:** Run `npm install @codcompass/retention-engine ioredis`, then import the engine in your application entry point and configure the Redis connection.
- **Instrument Key Events:** Add the `trackRetentionEvent` function to critical user actions (e.g., feature usage, errors, signups). Ensure every event includes `userId`, `sessionId`, and `cohortId`.
- **Deploy Scoring Worker:** Set up a background worker that listens to the event stream and updates the Redis ZSET for each user. Configure the scoring parameters in `retention.config.json`.
- **Verify Interventions:** Use the internal dashboard to simulate user behavior and verify that interventions trigger correctly based on thresholds. Check rate limits and channel routing.
- **Monitor and Iterate:** Review the retention dashboard weekly. Adjust weights and thresholds based on A/B test results and cohort analysis. Continuously refine the North Star Metric alignment.