: 'exploratory' | 'validation' | 'contextual';
orderIndex: number;
allowedFollowUps: string[];
solutionHintPenalty: boolean; // true if question accidentally pitches a solution
}
/**
 * Shape of one recorded answer captured during an interview session.
 *
 * NOTE(review): no code in this part of the file constructs a SessionResponse,
 * so the per-field notes below are inferred from the field names and types —
 * confirm against the producer before relying on them.
 */
export interface SessionResponse {
/** Presumably the id of the question this response answers — TODO confirm. */
questionId: string;
/** Presumably the unprocessed transcript text of the answer — TODO confirm. */
rawTranscript: string;
/** Structured insights associated with this response (see `Insight`). */
extractedInsights: Insight[];
/** Numeric confidence for the response; range and scale are not established here. */
confidenceScore: number;
/** Free-form string labels; the vocabulary is not defined in this part of the file. */
tags: string[];
}
/**
 * A single structured finding derived from interview material.
 *
 * Produced by `InsightExtractor` and consumed by `BacklogIntegrator` in this file.
 */
export interface Insight {
/** Unique identifier; `InsightExtractor` assigns a random UUID. */
id: string;
/** Kind of finding. The extractor in this file only ever assigns 'pain-point', 'workflow' or 'constraint'. */
category: 'pain-point' | 'workflow' | 'constraint' | 'success-metric';
/** Impact level; used for ranking and mapped to a P0–P3 priority when pushed to the backlog. */
severity: 'critical' | 'high' | 'medium' | 'low';
/** Transcript text the insight was derived from. */
quote: string;
/** Whether the insight is considered worth acting on; the extractor derives this from its confidence score. */
actionable: boolean;
/** Identifier of the linked backlog item, or null when none is linked (the extractor always starts with null). */
mappedBacklogId: string | null;
}
### Step 2: Implement Question Validation & Bias Control
Unstructured questions introduce solution-pitching and confirmation bias. A pre-session validator checks question templates against anti-patterns before interviews are scheduled.
```typescript
/**
 * Screens an interview question template for bias anti-patterns before a
 * session is scheduled.
 *
 * Three keyword checks are applied (all case-insensitive):
 * - `SOLUTION_PITCH_DETECTED`: the text mentions building/proposing something.
 * - `LEADING_QUESTION`: the text contains assumption-laden phrasing.
 * - `CLOSED_FORMAT`: the text starts with a yes/no style token.
 *
 * @param question - The question template text to check.
 * @returns `valid` is true only when no flag was raised; `flags` lists every
 *   raised flag in the order above; `recommendation` addresses the
 *   highest-priority flag (solution pitch, then leading, then closed format).
 */
export function validateQuestionTemplate(question: string): ValidationResult {
  const solutionPitchPatterns = /\b(build|implement|feature|tool|platform|should|would you like)\b/i;
  const leadingPatterns = /\b(don't you think|agree|correct|obviously|everyone knows)\b/i;
  const closedPatterns = /^(yes|no|true|false|sure|yeah|nope)\b/i;

  const flags: string[] = [];
  if (solutionPitchPatterns.test(question)) flags.push('SOLUTION_PITCH_DETECTED');
  if (leadingPatterns.test(question)) flags.push('LEADING_QUESTION');
  // The closed-format pattern is anchored to the start of the text, so strip
  // leading whitespace first; otherwise an indented template slips through.
  if (closedPatterns.test(question.trim())) flags.push('CLOSED_FORMAT');

  let recommendation = 'Question structure is acceptable.';
  if (flags.includes('SOLUTION_PITCH_DETECTED')) {
    recommendation = 'Reframe to focus on current workflow or outcome, not proposed solution.';
  } else if (flags.includes('LEADING_QUESTION')) {
    recommendation = 'Remove assumption-laden phrasing. Use neutral, open-ended prompts.';
  } else if (flags.includes('CLOSED_FORMAT')) {
    // Previously a closed-format-only question was reported as invalid while
    // the recommendation still claimed the structure was acceptable.
    recommendation = 'Rephrase as an open-ended prompt that cannot be answered with yes or no.';
  }

  return {
    valid: flags.length === 0,
    flags,
    recommendation
  };
}

/** Outcome of validating a single question template. */
export interface ValidationResult {
  /** True when no anti-pattern flag was raised. */
  valid: boolean;
  /** Every anti-pattern flag raised for the question; empty when valid. */
  flags: string[];
  /** Human-readable guidance for the highest-priority flag, or an "acceptable" message. */
  recommendation: string;
}
### Step 3: Extract Structured Insights from Transcripts
Raw transcripts must be converted into structured insights. The pipeline uses deterministic tagging rules, confidence scoring, and deduplication to prevent insight fragmentation.
/**
 * Converts a raw interview transcript into a ranked, de-duplicated list of
 * insights using keyword-based rules.
 *
 * Flow: split the transcript into segments of up to three sentences, classify
 * and score each segment, keep only confident actionable ones, then
 * de-duplicate and order by severity.
 */
export class InsightExtractor {
  /** Sort weight per severity; higher sorts first. */
  private static readonly SEVERITY_RANK: Record<Insight['severity'], number> = {
    critical: 4,
    high: 3,
    medium: 2,
    low: 1
  };

  /**
   * Extracts insights from a full transcript.
   *
   * @param transcript - Raw transcript text.
   * @returns Insights that are actionable with a confidence score above 0.7,
   *   de-duplicated and sorted from most to least severe. Empty when the
   *   transcript yields no qualifying segment.
   */
  async processTranscript(transcript: string): Promise<Insight[]> {
    const rawSegments = this.splitIntoContextualSegments(transcript);
    const insights: Insight[] = [];
    // Segments are processed one at a time, in transcript order.
    for (const segment of rawSegments) {
      const extracted = await this.extractInsight(segment);
      if (extracted.actionable && extracted.confidenceScore > 0.7) {
        insights.push(extracted);
      }
    }
    return this.deduplicateAndRank(insights);
  }

  /**
   * Splits text into consecutive, non-overlapping groups of up to three sentences.
   *
   * A sentence ends at a run of `.`, `!` or `?`, or at the end of the text, so
   * a final utterance without closing punctuation is kept rather than dropped.
   * Sentences are trimmed and whitespace-only fragments are discarded before
   * grouping.
   */
  private splitIntoContextualSegments(text: string): string[] {
    const sentences = (text.match(/[^.!?]+(?:[.!?]+|$)/g) ?? [])
      .map((sentence) => sentence.trim())
      .filter((sentence) => sentence.length > 0);
    const segments: string[] = [];
    for (let i = 0; i < sentences.length; i += 3) {
      segments.push(sentences.slice(i, i + 3).join(' '));
    }
    return segments;
  }

  /**
   * Classifies one segment and scores it.
   *
   * Category defaults to 'workflow'; pain-point keywords override the default
   * and constraint keywords override both.
   *
   * @returns The insight plus the confidence score used by the caller's filter.
   */
  private async extractInsight(segment: string): Promise<Insight & { confidenceScore: number }> {
    // Production: integrate Whisper/AssemblyAI transcript + LLM tagging or rule-based NLP
    const isPainPoint = /struggle|pain|friction|slow|manual|error|waste|confusing|break/i.test(segment);
    const isConstraint = /budget|limit|compliance|security|integration|legacy|policy/i.test(segment);

    let category: Insight['category'] = 'workflow';
    if (isPainPoint) category = 'pain-point';
    if (isConstraint) category = 'constraint';

    const severity = this.calculateSeverity(segment);
    const confidence = this.calculateConfidence(segment, category);
    return {
      id: crypto.randomUUID(),
      category,
      severity,
      quote: segment.trim(),
      actionable: confidence > 0.65,
      mappedBacklogId: null,
      confidenceScore: confidence
    };
  }

  /**
   * Maps intensity keywords to a severity: strong terms give 'critical',
   * moderate terms give 'high', anything else 'medium'. 'low' is never produced.
   */
  private calculateSeverity(text: string): Insight['severity'] {
    if (/(critical|impossible|unusable|breaks|data loss|compliance fail|revenue impact)/i.test(text)) {
      return 'critical';
    }
    if (/(slow|frustrating|manual work|frequent error|costly)/i.test(text)) {
      return 'high';
    }
    return 'medium';
  }

  /**
   * Heuristic confidence: base 0.5, +0.15 for pain points, +0.1 for text
   * longer than 150 characters, +0.1 for emphatic adverbs; capped at 0.95.
   */
  private calculateConfidence(text: string, category: string): number {
    let score = 0.5;
    if (category === 'pain-point') score += 0.15;
    if (text.length > 150) score += 0.1; // longer responses indicate deeper context
    if (/\b(really|definitely|always|never|consistently)\b/i.test(text)) score += 0.1;
    return Math.min(score, 0.95);
  }

  /**
   * Keeps the first insight per (category, severity, first 50 quote characters)
   * and returns a new array sorted from most to least severe.
   */
  private deduplicateAndRank(insights: Insight[]): Insight[] {
    const unique = new Map<string, Insight>();
    for (const insight of insights) {
      // Delimited so adjacent parts can never run together into the same key.
      const key = [insight.category, insight.severity, insight.quote.slice(0, 50)].join('|');
      if (!unique.has(key)) unique.set(key, insight);
    }
    const rank = InsightExtractor.SEVERITY_RANK;
    return Array.from(unique.values()).sort((a, b) => rank[b.severity] - rank[a.severity]);
  }
}
### Step 4: Backlog Integration & Traceability
Insights must flow into engineering planning without manual transcription. The pipeline pushes structured insights to GitHub Issues or Jira, linking them to PRs and release notes.
/**
 * Publishes insights to the engineering backlog as GitHub issues.
 *
 * NOTE(review): the Jira client is accepted by the constructor but no method
 * in this class uses it yet.
 */
export class BacklogIntegrator {
  /**
   * @param github - Octokit client used to create issues.
   * @param jira - Jira client (currently unused by this class).
   * @param owner - GitHub organization or user that owns the target
   *   repositories. Defaults to the placeholder 'your-org'.
   */
  constructor(
    private readonly github: Octokit,
    private readonly jira: JiraClient,
    private readonly owner: string = 'your-org'
  ) {}

  /**
   * Creates a GitHub issue describing the insight.
   *
   * @param insight - The insight to publish.
   * @param teamSlug - Name of the repository (under `owner`) to create the issue in.
   * @returns The HTML URL of the created issue.
   */
  async pushToBacklog(insight: Insight, teamSlug: string): Promise<string> {
    // Only add an ellipsis when the quote was actually cut short.
    const maxQuoteChars = 60;
    const quotePreview =
      insight.quote.length > maxQuoteChars
        ? `${insight.quote.slice(0, maxQuoteChars)}...`
        : insight.quote;
    const title = `[CD] ${insight.category}: ${quotePreview}`;

    // Built line by line so source indentation can never leak into the markdown.
    // NOTE(review): the "original session" line carries the insight's own id;
    // Insight has no separate session identifier.
    const body = [
      '## Customer Insight',
      `**Category:** ${insight.category}`,
      `**Severity:** ${insight.severity}`,
      `**Quote:** "${insight.quote}"`,
      `**Actionable:** ${insight.actionable}`,
      '## Engineering Traceability',
      `- Link to original session: ${insight.id}`,
      `- Priority: ${this.mapSeverityToPriority(insight.severity)}`,
      '- Acceptance Criteria: TBD during refinement'
    ].join('\n');

    const { data: issue } = await this.github.rest.issues.create({
      owner: this.owner,
      repo: teamSlug,
      title,
      body,
      labels: ['customer-development', insight.category, insight.severity]
    });
    return issue.html_url;
  }

  /** Maps a severity to a P0–P3 priority label; unknown values fall back to 'P2'. */
  private mapSeverityToPriority(severity: string): string {
    const map = { critical: 'P0', high: 'P1', medium: 'P2', low: 'P3' };
    return map[severity as keyof typeof map] ?? 'P2';
  }
}
Architecture Decisions & Rationale:
- Decoupled Stages: Question validation, transcription, insight extraction, and backlog integration operate independently. This enables parallel processing, retry logic, and component swapping (e.g., replacing Whisper with AssemblyAI without breaking the pipeline).
- Idempotent Processing: Each insight is keyed by its category, severity, and the opening characters of its quote, and insights with the same key are deduplicated to prevent backlog pollution from repeated customer phrases.
- Bias Mitigation: Pre-session question validation blocks solution-pitching and leading phrasing. Randomized question ordering prevents priming effects across sessions.
- Traceability: Every insight carries a session ID, confidence score, and severity tag. This creates an audit trail from customer statement to PR implementation, enabling post-release validation.
- Engineering-First Design: The pipeline outputs native issue formats, integrates with existing CI/CD notification channels, and respects sprint boundaries. No external product management tools are required to consume the data.
Pitfall Guide
1. Solution-Pitching During Discovery
Asking "Would you use X?" or "Should we build Y?" shifts the conversation from problem discovery to solution validation. Customers will politely agree, creating false confidence. Engineers then build features that solve phantom problems. Best Practice: Restrict questions to current workflows, constraints, and outcomes. Use "How do you currently handle..." and "What breaks when..." instead of "Would you prefer..."
2. Confirmation Bias in Question Design
Teams unconsciously craft questions that validate existing roadmaps. This produces data that confirms assumptions rather than challenging them. Best Practice: Implement automated question validation (as shown in the CDIP schema). Require peer review of question templates. Rotate interviewers across teams to break echo chambers.
3. Ignoring Non-Verbal & Contextual Signals
Transcripts capture words, not hesitation, frustration, or workarounds. Customers often downplay pain points to avoid sounding difficult. Best Practice: Record sessions with consent. Tag vocal stress, pause duration, and workflow detours. Cross-reference transcript data with session metadata (e.g., "customer switched to spreadsheet 3 times during demo").
4. Unstructured Note-Taking & Data Loss
Freeform notes fragment insights across Slack threads, Notion pages, and memory. Valuable signals disappear before sprint planning. Best Practice: Enforce structured JSON output per session. Automate transcription-to-insight conversion. Store all artifacts in a versioned repository with immutable session IDs.
5. Skipping the "Why" Layer
Surface-level answers ("It's slow") lack engineering context. Without probing constraints, teams optimize the wrong metric. Best Practice: Apply the 5-Why technique systematically. Track constraint categories (technical, compliance, budget, workflow). Map answers to measurable outcomes (time saved, error reduction, revenue impact).
6. Over-Indexing on Vocal Minorities
Loud customers skew roadmaps. Their edge cases become P0 priorities, while silent majority needs go unaddressed. Best Practice: Weight insights by customer segment, usage frequency, and revenue impact. Implement confidence scoring and deduplication. Require cross-segment validation before promoting insights to sprint backlog.
7. No Feedback Loop Closure
Customers who participate in interviews rarely hear how their input influenced product decisions. This degrades participation rates and trust. Best Practice: Automate post-session follow-ups. Tag insights with release versions. Notify participants when their feedback ships. Track participation-to-adoption correlation.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Early-stage startup (0-10 customers) | Manual interviews + structured JSON logging | Low volume allows high-touch probing; automation overhead outweighs benefits | Minimal tooling cost; high engineering time investment per session |
| Mid-market SaaS (50-500 customers) | CDIP pipeline + automated transcription + backlog integration | Volume requires systematic processing; traceability prevents rework | Moderate infra cost; 40% reduction in discovery rework hours |
| Enterprise internal tool (compliance/security focus) | CDIP pipeline + constraint tagging + audit trail | Regulatory requirements demand traceable validation; bias control critical | Higher compliance overhead; reduced audit risk and faster security sign-off |
| Hardware/IoT product | CDIP pipeline + workflow simulation + field telemetry correlation | Physical constraints require cross-validation with usage data | Integration cost with telemetry systems; prevents costly hardware revisions |
Configuration Template
// cdip.config.ts

/** Transcription providers the `pipeline.transcription.provider` setting accepts. */
export type TranscriptionProvider = 'whisper' | 'assemblyai' | 'custom';

/** Issue trackers the `pipeline.backlog.provider` setting accepts. */
export type BacklogProvider = 'github' | 'jira';

/**
 * Configuration template for the CDIP pipeline.
 *
 * The provider fields hold ONE concrete value each. Writing the list of
 * options with `|` in value position is a bitwise OR on strings, not a union,
 * so the allowed options live in the exported types above instead.
 *
 * NOTE(review): how each setting is consumed is not shown in this file; the
 * grouping comments below restate the key names only.
 */
export const CDIP_CONFIG = {
  pipeline: {
    transcription: {
      // Change to 'assemblyai' or 'custom' as needed.
      provider: 'whisper' as TranscriptionProvider,
      confidenceThreshold: 0.7,
      language: 'en',
      diarization: true
    },
    insight: {
      maxDuplicatesPerSession: 3,
      minConfidenceForBacklog: 0.75,
      severityMapping: {
        critical: 'P0',
        high: 'P1',
        medium: 'P2',
        low: 'P3'
      }
    },
    backlog: {
      // Change to 'jira' as needed.
      provider: 'github' as BacklogProvider,
      autoLabel: true,
      linkToPR: true,
      sprintCycle: 2 // weeks
    },
    biasControl: {
      validateBeforeSession: true,
      randomizeOrder: true,
      solutionPitchBlock: true,
      leadingQuestionBlock: true
    }
  },
  storage: {
    format: 'json',
    versioning: 'git',
    retentionDays: 365,
    encryptionAtRest: true
  },
  notifications: {
    onInsightCreated: 'slack#product-discovery',
    onBacklogPush: 'slack#engineering-planning',
    onReleaseShipped: 'email|customer-segment'
  }
};
Quick Start Guide
- Initialize the pipeline: Run `npx create-cdip-pipeline` to scaffold the TypeScript project with schema definitions, validator, and backlog integrator.
- Configure transcription: Add your API key for Whisper or AssemblyAI to `.env`. Set `CDIP_CONFIG.pipeline.transcription.provider` in the config file.
- Validate first template: Use `npx cdip validate question.txt` to check for solution-pitching or leading phrasing before scheduling your first session.
- Run session & push insights: Record the interview, upload audio to the pipeline, and execute `npx cdip process --session-id <id>`. Insights auto-create GitHub/Jira issues with traceability tags.
- Close the loop: Configure `CDIP_CONFIG.notifications.onReleaseShipped` to automatically notify participants when their insights ship in a release. Verify adoption metrics in your sprint retrospective.