n>[] = [];
// Database connectivity & schema version
checks.push(
createConnection({
host: config.dbHost,
port: config.dbPort,
user: config.dbUser,
password: config.dbPass,
connectTimeout: 5000,
}).then((conn) => {
return conn.query('SELECT VERSION() as db_version').then(() => {
conn.end();
return true;
});
}).catch(() => false)
);
// SSL certificate validity
checks.push(
new Promise<boolean>((resolve) => {
const req = https.get(config.apiHealthEndpoint, { timeout: 3000 }, (res) => {
const cert = res.socket?.getPeerCertificate();
const valid = cert && new Date(cert.valid_to) > new Date();
resolve(valid);
});
req.on('error', () => resolve(false));
})
);
const results = await Promise.allSettled(checks);
const allPassed = results.every(r => r.status === 'fulfilled' && r.value === true);
if (!allPassed) {
throw new Error('Launch gate failed: environment validation did not pass all checks.');
}
return true;
}
**Architecture Rationale:** Centralizing validation prevents scattered `if (!process.env.X)` checks throughout the codebase. By abstracting checks into a single gate, you can run them locally, in CI, or as a pre-deployment hook. `Promise.allSettled` waits for every check to finish instead of short-circuiting on the first rejection, so the gate always sees the outcome of each check and can report exactly which ones failed rather than only the first.
### Step 2: Payment & Webhook Idempotency
Payment processors like Stripe deliver events asynchronously. Network retries, duplicate deliveries, and delayed processing are guaranteed in production. Without idempotency, a single successful charge can trigger multiple subscription activations, leading to billing disputes and support overhead.
```typescript
// payment-pipeline.ts
import { verifyWebhookSignature } from 'stripe';
import { PrismaClient } from '@prisma/client';
// Module-level Prisma client: created once when this module loads and used by the webhook handler below.
const prisma = new PrismaClient();
/**
 * Subset of a verified webhook event that this module reads.
 */
interface WebhookPayload {
  /** Event identifier; used as the idempotency key when logging processed events. */
  id: string;
  /** Event name, compared against values such as `invoice.payment_succeeded`. */
  type: string;
  /** Event body; `object` is the resource the event describes. */
  data: {
    object: {
      /** Identifier of the resource; stored as `stripeId` when a subscription is created. */
      id: string;
      status: string;
      /** Customer identifier; used to look up the subscription to activate. */
      customer: string;
    };
  };
}
/**
 * Reports whether `err` looks like a Prisma unique-constraint violation (error code `P2002`).
 * Duck-typed on the `code` property so that no additional Prisma import is needed.
 */
function isUniqueConstraintViolation(err: unknown): boolean {
  return (
    typeof err === 'object' &&
    err !== null &&
    'code' in err &&
    (err as { code?: unknown }).code === 'P2002'
  );
}

/**
 * Verifies, de-duplicates and applies a payment webhook event.
 *
 * The event is recorded in `webhookLog` and applied inside one transaction, so
 * the log row exists only if the business update committed as well.
 *
 * @param payload - Raw request body, exactly as received (needed for signature verification).
 * @param signature - Signature header sent by the payment provider.
 * @param endpointSecret - Signing secret configured for this webhook endpoint.
 * @returns An HTTP-style result:
 *   - `400` when signature verification fails;
 *   - `200` when the event was processed now or had already been processed;
 *   - `500` when processing failed and the provider should retry.
 */
export async function handlePaymentWebhook(
  payload: string,
  signature: string,
  endpointSecret: string
): Promise<{ status: number; message: string }> {
  let event: WebhookPayload;
  try {
    // NOTE(review): the `stripe` SDK documents signature verification as
    // `stripe.webhooks.constructEvent(payload, signature, secret)`; confirm that
    // `verifyWebhookSignature` resolves to a real export in the installed version.
    event = verifyWebhookSignature(payload, signature, endpointSecret) as WebhookPayload;
  } catch {
    return { status: 400, message: 'Invalid signature' };
  }

  // Fast path: skip events that were already recorded by an earlier delivery.
  const processed = await prisma.webhookLog.findUnique({
    where: { eventId: event.id },
  });
  if (processed) {
    return { status: 200, message: 'Event already processed' };
  }

  try {
    await prisma.$transaction(async (tx) => {
      // Insert the log row first: if a concurrent delivery already inserted it,
      // the unique `eventId` makes this transaction fail before any business update.
      await tx.webhookLog.create({
        data: { eventId: event.id, type: event.type, payload },
      });

      if (event.type === 'invoice.payment_succeeded') {
        const customerId = event.data.object.customer;
        await tx.subscription.upsert({
          where: { customerId },
          create: { customerId, status: 'active', stripeId: event.data.object.id },
          update: { status: 'active' },
        });
      }
    });
    return { status: 200, message: 'Processed' };
  } catch (err: unknown) {
    // Two deliveries of the same event can both pass the fast-path check above.
    // The loser fails with a unique-constraint error; that is a duplicate, not a
    // processing failure. Re-read the log row to confirm the violation was on
    // this event (and not on some other unique column) before acknowledging.
    if (isUniqueConstraintViolation(err)) {
      const committed = await prisma.webhookLog
        .findUnique({ where: { eventId: event.id } })
        .catch(() => null);
      if (committed) {
        return { status: 200, message: 'Event already processed' };
      }
    }
    console.error('Webhook processing failed:', err);
    return { status: 500, message: 'Internal processing error' };
  }
}
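```
**Architecture Rationale:** Recording each event ID in a log table inside the same database transaction as the business update makes processing effectively once. The provider still delivers at least once, but a duplicate delivery finds the existing `webhookLog` row (or hits its unique constraint) and is acknowledged without being applied again. The `webhookLog` table acts as an idempotency key store. This pattern prevents double-provisioning, survives network retries, and provides an audit trail for billing disputes.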
### Step 3: Observability & Alert Routing
Monitoring isn't just about uptime; it's about signal-to-noise ratio. Flooding Slack with every 500ms latency spike causes alert fatigue. Routing must be tiered: critical failures trigger immediate pages, warnings route to async channels, and informational metrics feed dashboards.
// observability-router.ts
import { WebClient } from '@slack/web-api';
// Slack Web API client; the bot token is read from the SLACK_BOT_TOKEN environment variable when this module loads.
const slack = new WebClient(process.env.SLACK_BOT_TOKEN);
/** Severity tiers; each tier is routed to its own Slack channel. */
type AlertSeverity =
  | 'critical'
  | 'warning'
  | 'info';

/** A single alert as handed to the router. */
interface AlertPayload {
  /** Name of the service that raised the alert. */
  service: string;
  /** Tier that selects the destination channel. */
  severity: AlertSeverity;
  /** Name of the metric that triggered the alert. */
  metric: string;
  /** Observed value of the metric. */
  value: number;
  /** Threshold shown next to the observed value in the message. */
  threshold: number;
  /** Time of the alert, rendered verbatim in the message. */
  timestamp: string;
}
/**
 * Posts an alert to the Slack channel that corresponds to its severity.
 *
 * @param alert - The alert to deliver; `severity` selects the destination channel.
 * @returns A promise that settles once Slack has accepted (or rejected) the message.
 *   A failure from the Slack client is not caught here and propagates to the caller.
 */
export async function routeAlert(alert: AlertPayload): Promise<void> {
  const { service, severity, metric, value, threshold, timestamp } = alert;

  // Severity tier -> destination channel.
  const destinations: Record<AlertSeverity, string> = {
    critical: '#ops-critical',
    warning: '#ops-warnings',
    info: '#ops-metrics',
  };

  // Plain-text summary line of the message.
  const headline = `🚨 ${severity.toUpperCase()} | ${service}`;
  // Body rendered as a single mrkdwn section block.
  const details = `*Metric:* ${metric}\n*Value:* ${value} (threshold: ${threshold})\n*Time:* ${timestamp}`;

  await slack.chat.postMessage({
    channel: destinations[severity],
    text: headline,
    blocks: [
      { type: 'section', text: { type: 'mrkdwn', text: details } }
    ],
  });
}
Architecture Rationale: Decoupling alert generation from routing allows you to swap notification channels without touching business logic. Severity-based routing preserves team focus during incidents. This pattern scales cleanly when integrating with PagerDuty, Datadog, or custom synthetic monitors.
Pitfall Guide
1. Silent Webhook Failures
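Explanation: Payment processors retry failed deliveries, but only until they receive a 2xx response. If your endpoint returns 200 OK before the work is durably committed (for example, it acknowledges first and then crashes mid-transaction), the provider assumes success and stops retrying. Users are charged but lack access.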
Fix: Implement idempotency logging, return explicit 200 only after successful database commits, and use a webhook testing CLI (e.g., Stripe CLI) to simulate retries before launch.
2. Environment Variable Drift
Explanation: Staging and production environments diverge over time. Missing NODE_ENV, incorrect database URLs, or expired API keys cause runtime failures that only appear under production traffic.
Fix: Enforce a .env.example schema validator in CI. Use a launch gate script that fails deployment if required keys are missing or malformed. Never hardcode fallbacks for critical configuration.
3. Mobile Viewport Neglect
Explanation: Desktop browser devtools simulate mobile screens but ignore touch targets, hardware keyboard overlays, and real-world network throttling. 60% of traffic is mobile; layout breaks directly impact conversion.
Fix: Test on physical devices across iOS and Android. Use real-device cloud testing platforms. Validate touch target sizes (minimum 44x44px), safe area insets, and scroll behavior under 3G/4G simulation.
4. Unvalidated Session Expiry
Explanation: Sessions that never expire or expire too quickly create security vulnerabilities or user friction. Default framework settings rarely align with production security policies.
Fix: Explicitly configure session TTL (1–24 hours based on risk profile). Implement sliding expiration with refresh tokens. Validate CSRF protection on all state-mutating endpoints. Log session creation/revocation for audit trails.
5. DNS & CDN Cache Staleness
Explanation: After deployment, users may receive cached assets or stale DNS records, causing version mismatches, broken assets, or routing to decommissioned servers.
Fix: Invalidate CDN caches programmatically post-deployment. Set low TTL values during launch windows. Verify DNS propagation using multiple resolvers. Implement cache-busting query parameters for static assets.
6. Compliance & Legal Gaps
Explanation: Payment processors and app stores require privacy policies, terms of service, and cookie consent mechanisms. Missing these triggers account freezes, payment holds, or submission rejections.
Fix: Generate legally compliant documents before launch. Implement a cookie consent manager that respects regional regulations (GDPR, CCPA). Route all legal pages through a version-controlled CMS to track changes.
7. Missing Performance Baseline
Explanation: Launching without established performance metrics makes it impossible to detect regressions. A 2-second page load feels fine until it degrades to 5 seconds under load.
Fix: Capture Core Web Vitals, TTFB, and API latency during staging. Set automated alerts for threshold breaches. Use synthetic monitoring to track performance from multiple geographic regions post-launch.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Early-stage MVP | Manual checklist + basic uptime monitoring | Low overhead, fast iteration, acceptable risk tolerance | Minimal (Free tiers) |
| Growth-stage SaaS | Automated launch gate + idempotent webhooks + tiered alerting | Prevents revenue leakage, scales with user base, reduces support load | Moderate ($50–$200/mo) |
| Enterprise/Compliance-heavy | Full synthetic monitoring + real-device testing + audit logging | Meets regulatory requirements, guarantees SLA compliance, prevents payment processor freezes | High ($200–$800/mo) |
Configuration Template
// launch.config.ts
// Settings for the environment-validation gate.
const validationSettings = {
  timeoutMs: 15000,
  requiredEnvVars: ['DATABASE_URL', 'STRIPE_SECRET', 'SESSION_SECRET', 'NODE_ENV'],
  sslCheckEndpoint: 'https://api.yourdomain.com/health',
};

// Settings for the payment webhook pipeline.
const paymentSettings = {
  webhookEndpoint: '/api/webhooks/stripe',
  idempotencyTable: 'webhook_logs',
  retryLimit: 3,
  allowedEventTypes: ['invoice.payment_succeeded', 'customer.subscription.deleted'],
};

// Uptime checks, alert channels per severity, and performance budgets.
const monitoringSettings = {
  // NOTE(review): unit is not stated; presumably seconds. Confirm against the consumer.
  uptimeCheckInterval: 60,
  alertChannels: {
    critical: '#ops-critical',
    warning: '#ops-warnings',
    info: '#ops-metrics',
  },
  // NOTE(review): presumably lcp/fid are in milliseconds and cls is unitless. Confirm.
  performanceThresholds: {
    lcp: 2500,
    fid: 100,
    cls: 0.1,
  },
};

// Session lifetime, CSRF and login rate-limit settings.
const securitySettings = {
  sessionTtlHours: 12,
  csrfProtection: true,
  rateLimitLogin: { maxAttempts: 5, windowMinutes: 15 },
};

/**
 * Launch configuration template, grouped by concern:
 * validation gate, payments, monitoring and security.
 */
export const launchConfig = {
  validation: validationSettings,
  payments: paymentSettings,
  monitoring: monitoringSettings,
  security: securitySettings,
};
Quick Start Guide
- Initialize the validation gate: Copy `launch-gate.ts` into your project root. Replace the database and SSL check endpoints with your actual production URLs. Run it locally with a TypeScript runner (for example `npx tsx launch-gate.ts`) to verify connectivity.
- Wire the webhook handler: Place `payment-pipeline.ts` in your API routes directory. Configure your payment provider to point to the webhook endpoint. Use the provider's CLI to replay test events and confirm idempotency.
- Configure alert routing: Add `observability-router.ts` to your monitoring service. Set the `SLACK_BOT_TOKEN` environment variable. Trigger a test alert to verify channel delivery.
- Apply the configuration template: Import `launch.config.ts` into your CI/CD pipeline. Add a pre-deployment step that runs the launch gate and fails the build if it throws because a validation check did not pass.
- Verify post-launch: After deployment, run synthetic checks from multiple regions. Confirm analytics events fire, payment webhooks process without duplication, and alert channels receive test signals. Monitor Core Web Vitals for 48 hours to establish a performance baseline.