t class QueryTransformer {
/** @param llmClient LLM client; this class only calls `completions.create({ prompt, max_tokens })` on it. */
constructor(private readonly llmClient: any) {}
/**
 * Builds the retrieval-ready form of a raw user query.
 *
 * @param query Raw user query text.
 * @returns The original query, its expanded variations, and — only when the
 *   query contains a `?` or a `,` — the result of intent decomposition
 *   (`undefined` otherwise).
 */
async transform(query: string): Promise<QueryTransformation> {
  // Terminology expansion always runs, and finishes before decomposition starts.
  const variations = await this.expandTerms(query);

  // A question mark or a comma is the heuristic used to spot multi-intent queries.
  const looksCompound = ['?', ','].some((marker) => query.includes(marker));
  let subqueries;
  if (looksCompound) {
    subqueries = await this.decomposeIntent(query);
  }

  return { original: query, expanded: variations, decomposed: subqueries };
}
/**
 * Asks the LLM for alternative phrasings of the query.
 *
 * @param query Raw user query; interpolated verbatim into the prompt.
 * @returns One trimmed entry per non-empty line of the completion text.
 */
private async expandTerms(query: string): Promise<string[]> {
  // Must be a template literal: without the backticks this line does not parse
  // and `${query}` is never interpolated.
  const prompt = `Given the query "${query}", return 3 semantically equivalent variations used in technical documentation. Output only the variations, one per line.`;
  const response = await this.llmClient.completions.create({ prompt, max_tokens: 60 });
  // NOTE(review): assumes the client exposes the completion as `response.text` — confirm against the SDK in use.
  const text: string = response.text ?? '';
  return text
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}
/**
 * Placeholder for multi-intent decomposition.
 *
 * The body currently contains no statements, so the returned promise resolves
 * to `undefined` for every query. The comments below describe the intended
 * behaviour, not what the code does today.
 *
 * @param query Query text that `transform` flagged as possibly multi-intent.
 */
private async decomposeIntent(query: string) {
// Implementation splits compound questions into atomic subqueries
// Returns structured intent/subquery pairs for parallel retrieval
}
}
### Step 2: Multi-Strategy Retrieval
Dense and sparse retrieval capture orthogonal signal. Dense embeddings excel at semantic similarity; sparse (BM25) captures exact lexical matches, acronyms, and code identifiers.
```typescript
import { QdrantClient } from '@qdrant/js-client';
/** A single retrieved chunk, in the shape shared by the dense, sparse and fused result lists. */
interface RetrievalResult {
/** Identifier of the chunk; used as the join key when dense and sparse rankings are fused. */
id: string;
/** Score assigned by whichever stage produced this result (vector search, BM25, or rank fusion). */
score: number;
/** Payload stored alongside the chunk. */
metadata: Record<string, any>;
/** Text of the chunk. */
content: string;
}
/**
 * Retrieves candidates from a dense vector index and a sparse BM25 index and
 * hands both rankings to `fuseResults` for merging.
 */
export class HybridRetriever {
  /**
   * @param vectorDB Qdrant client queried for dense (embedding) matches.
   * @param bm25Index Sparse index exposing `search(query)`, which returns scored matches.
   * @param embeddingModel Model exposing `embed(text)`, which returns the query vector.
   */
  constructor(
    private readonly vectorDB: QdrantClient,
    private readonly bm25Index: any,
    private readonly embeddingModel: any
  ) {}

  /**
   * Runs the dense and sparse searches and fuses their rankings.
   *
   * @param query User query text.
   * @param k Number of results requested from each index and passed on to fusion as the result cap.
   * @returns Whatever `fuseResults` produces for the two rankings.
   */
  async retrieve(query: string, k: number = 10): Promise<RetrievalResult[]> {
    // The two searches do not depend on each other, so start both before
    // awaiting either; total latency becomes max(dense, sparse) instead of the sum.
    const [denseResults, sparseResults] = await Promise.all([
      this.denseSearch(query, k),
      this.sparseSearch(query, k)
    ]);
    return this.fuseResults(denseResults, sparseResults, k);
  }

  /**
   * Embeds the query and searches the `documents` collection.
   *
   * @returns Up to `k` matches in the order returned by the vector database.
   */
  private async denseSearch(query: string, k: number): Promise<RetrievalResult[]> {
    const embedding = await this.embeddingModel.embed(query);
    const response = await this.vectorDB.search('documents', {
      vector: embedding,
      limit: k,
      with_payload: true
    });
    return response.map((point) => {
      // A point may come back without a payload; fall back to an empty object
      // so downstream code never dereferences null.
      const payload = (point.payload ?? {}) as Record<string, any>;
      return {
        // Point ids can be numeric; normalise to string so they compare equal
        // to sparse-index ids during fusion.
        id: String(point.id),
        score: point.score,
        metadata: payload,
        content: typeof payload.content === 'string' ? payload.content : ''
      };
    });
  }

  /**
   * Queries the BM25 index.
   *
   * @returns The first `k` matches in the order returned by the index.
   */
  private async sparseSearch(query: string, k: number): Promise<RetrievalResult[]> {
    // `await` is a no-op for a synchronous index and also supports one that returns a promise.
    const matches = await this.bm25Index.search(query);
    return matches.slice(0, k).map((match) => ({
      id: String(match.id),
      score: match.score,
      metadata: match.metadata,
      content: match.content
    }));
  }
}
```
### Step 3: Score Fusion with Reciprocal Rank Fusion (RRF)
RRF combines rankings without requiring score normalization. It has a single smoothing constant (conventionally 60, as in the code below) and is robust to distribution shifts between dense and sparse pipelines.
/**
 * Merges the dense and sparse rankings with Reciprocal Rank Fusion (RRF).
 *
 * Every result contributes `1 / (rrfConstant + rank)` for each list it appears
 * in, where rank is its 1-based array position (so both lists are expected
 * best-first). A result missing from a list has rank `Infinity` there and
 * contributes 0 for it.
 *
 * @param dense Results from the dense search.
 * @param sparse Results from the sparse search.
 * @param k Maximum number of fused results to return.
 * @param rrfConstant Constant added to every rank before inversion; defaults to 60.
 * @returns At most `k` results by descending fused score. `score` is the RRF
 *   score, not the original retriever score. Metadata and content come from
 *   the dense hit when it has them, otherwise from the sparse hit.
 */
private fuseResults(
  dense: RetrievalResult[],
  sparse: RetrievalResult[],
  k: number,
  rrfConstant: number = 60
): RetrievalResult[] {
  type Entry = {
    denseRank: number;
    sparseRank: number;
    denseHit?: RetrievalResult;
    sparseHit?: RetrievalResult;
  };

  // One entry per id, holding ranks and the hits themselves so no list has to
  // be re-scanned with `find` when the fused records are built.
  const entries = new Map<string, Entry>();
  const entryFor = (id: string): Entry => {
    let entry = entries.get(id);
    if (entry === undefined) {
      entry = { denseRank: Infinity, sparseRank: Infinity };
      entries.set(id, entry);
    }
    return entry;
  };

  dense.forEach((hit, index) => {
    const entry = entryFor(hit.id);
    // If an id appears twice in one list, keep its first (best) position.
    if (entry.denseHit === undefined) {
      entry.denseRank = index + 1;
      entry.denseHit = hit;
    }
  });
  sparse.forEach((hit, index) => {
    const entry = entryFor(hit.id);
    if (entry.sparseHit === undefined) {
      entry.sparseRank = index + 1;
      entry.sparseHit = hit;
    }
  });

  const fused = Array.from(entries, ([id, entry]) => ({
    id,
    score: 1 / (rrfConstant + entry.denseRank) + 1 / (rrfConstant + entry.sparseRank),
    metadata: entry.denseHit?.metadata || entry.sparseHit?.metadata || {},
    content: entry.denseHit?.content || entry.sparseHit?.content || ''
  }));

  // Clamp so a negative k yields an empty list instead of "all but the last".
  return fused.sort((a, b) => b.score - a.score).slice(0, Math.max(k, 0));
}
### Step 4: Cross-Encoder Reranking
Bi-encoders compute embeddings independently, losing interaction signal. Cross-encoders process query-document pairs jointly, capturing relevance with higher precision. Batch processing mitigates latency overhead.
import { pipeline } from '@huggingface/transformers';
/**
 * Re-scores retrieval candidates against the query with a cross-encoder model
 * and keeps the best ones.
 */
export class Reranker {
  private model: any;

  /**
   * Loads the cross-encoder pipeline. Can be called up front to pay the load
   * cost early; `rerank` calls it on demand if it was skipped.
   */
  async init(): Promise<void> {
    this.model = await pipeline('text-classification', 'cross-encoder/ms-marco-MiniLM-L-6-v2');
  }

  /**
   * Scores every candidate against the query and returns the highest-scoring ones.
   *
   * @param query User query text.
   * @param candidates Results to re-score; the array itself is not modified.
   * @param topK Maximum number of results to return.
   * @param batchSize Number of query/document pairs sent to the model per call; defaults to 32.
   * @returns Copies of the best candidates, each with an added `rerankScore`, ordered by that score descending.
   * @throws Error if the model returns a different number of scores than pairs it was given.
   */
  async rerank(
    query: string,
    candidates: RetrievalResult[],
    topK: number,
    batchSize: number = 32
  ): Promise<RetrievalResult[]> {
    // Nothing to score or nothing requested: skip model loading and inference entirely.
    if (candidates.length === 0 || topK <= 0) return [];

    // Load lazily so a forgotten init() does not surface as "this.model is not a function".
    if (!this.model) await this.init();

    const step =
      Number.isFinite(batchSize) && batchSize >= 1 ? Math.floor(batchSize) : candidates.length;

    const scored: Array<RetrievalResult & { rerankScore: number }> = [];
    for (let start = 0; start < candidates.length; start += step) {
      const batch = candidates.slice(start, start + step);
      const pairs = batch.map((candidate) => [query, candidate.content]);
      // NOTE(review): `pooling`/`normalize` look like feature-extraction options, and the
      // pipeline is given [query, document] arrays — confirm the text-classification
      // pipeline accepts both.
      const outputs = await this.model(pairs, { pooling: 'mean', normalize: true });
      if (!Array.isArray(outputs) || outputs.length !== batch.length) {
        throw new Error(
          `Reranker expected ${batch.length} scores from the model but received ${
            Array.isArray(outputs) ? outputs.length : 'a non-array result'
          }`
        );
      }
      batch.forEach((candidate, index) => {
        scored.push({ ...candidate, rerankScore: outputs[index].score });
      });
    }

    return scored.sort((a, b) => b.rerankScore - a.rerankScore).slice(0, topK);
  }
}
### Step 5: Contextual Compression & Deduplication
Retrieved chunks often overlap or contain boilerplate. Compression removes duplicate chunks and drops those with little lexical relevance to the query, so only chunks likely to help answer it reach the LLM.
/**
 * Shrinks a list of retrieved chunks into one context string by dropping exact
 * duplicates and chunks with little lexical overlap with the query.
 */
export class ContextCompressor {
  /**
   * @param query User query text.
   * @param chunks Retrieved chunks, in the order they should appear in the context.
   * @param minRelevance Chunks must score strictly above this value to be kept; defaults to 0.3.
   * @returns The surviving chunks' text joined by blank lines, in input order.
   */
  async compress(
    query: string,
    chunks: RetrievalResult[],
    minRelevance: number = 0.3
  ): Promise<string> {
    const unique = this.deduplicate(chunks);
    const relevant = unique.filter(
      (chunk) => this.relevanceScore(query, chunk.content) > minRelevance
    );
    return relevant.map((chunk) => chunk.content).join('\n\n');
  }

  /** Keeps the first occurrence of each distinct chunk text. */
  private deduplicate(chunks: RetrievalResult[]): RetrievalResult[] {
    const seen = new Set<string>();
    return chunks.filter((chunk) => {
      const key = this.hash(chunk.content);
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });
  }

  /**
   * Returns the key used to detect duplicate chunks.
   *
   * The full text is the key. A truncated encoding (e.g. the first 16 base64
   * characters) only reflects the first 12 bytes, which makes every chunk that
   * shares a heading or boilerplate prefix look like a duplicate.
   */
  private hash(text: string): string {
    return text;
  }

  /**
   * Fraction of distinct query terms that also occur in the content, in [0, 1].
   *
   * Returns 1 when the query has no word tokens at all, because there is then
   * no lexical signal to filter on and dropping every chunk would be worse.
   */
  private relevanceScore(query: string, content: string): number {
    const queryTerms = this.tokenize(query);
    if (queryTerms.size === 0) return 1;

    const contentTerms = this.tokenize(content);
    let matched = 0;
    for (const term of queryTerms) {
      if (contentTerms.has(term)) matched += 1;
    }
    return matched / queryTerms.size;
  }

  /** Lower-cases the text and returns its distinct, non-empty word tokens. */
  private tokenize(text: string): Set<string> {
    return new Set(
      text
        .toLowerCase()
        .split(/\W+/)
        .filter((token) => token.length > 0)
    );
  }
}
### Architecture Decisions & Rationale
- Hybrid over Dense-Only: Dense embeddings miss exact matches for identifiers, versions, and domain-specific nomenclature. BM25 compensates without retraining.
- RRF over Weighted Sum: Score distributions differ between dense (cosine) and sparse (BM25). RRF operates on ranks, eliminating normalization drift.
- Cross-Encoder Batching: Reranking is compute-intensive. Processing candidates in batches of 32-64 keeps p95 latency under 80ms while maintaining precision gains.
- Compression Before Generation: LLM context windows are expensive. Removing overlapping chunks and low-signal boilerplate reduces token waste by 40-60% without sacrificing recall.
- Async Pipeline Orchestration: Query transformation, dense search, and sparse search run concurrently. Reranking and compression execute sequentially on fused results. This minimizes tail latency.
## Pitfall Guide
1. **Treating Embedding Models as Universal**
   General-purpose embeddings (e.g., text-embedding-3-small) degrade sharply on domain-specific corpora. Legal, medical, and code repositories require fine-tuned or domain-adapted models. Always benchmark retrieval on your actual corpus, not public datasets.
2. **Fixed Chunk Sizes Ignoring Semantic Boundaries**
   Splitting documents at arbitrary character counts breaks logical context. Use recursive character splitting with fallback to paragraph/code-block boundaries. Preserve metadata (section headers, document IDs) to enable post-retrieval filtering.
3. **Ignoring Metadata Filtering in Retrieval**
   Vector search over entire corpora returns irrelevant results from deprecated versions or unrelated modules. Push metadata filters (version, module, author, date) to the vector database query layer. Hybrid retrieval should respect pre-filter constraints.
4. **Synchronous Reranker Blocking**
   Running cross-encoder reranking synchronously on the critical path inflates p99 latency. Implement async reranking with fallback to hybrid scores. Cache reranker outputs for repeated query patterns.
5. **Query Drift Without Decomposition**
   Multi-part queries ("How does auth work in v2.3 and what changed in v2.4?") confuse single-vector retrieval. Decompose into atomic subqueries, retrieve independently, and merge results with deduplication.
6. **Over-Optimizing Recall, Ignoring Precision**
   Fetching 50 chunks to maximize recall wastes context window and introduces noise. LLMs degrade when context exceeds 70% irrelevant tokens. Cap retrieval at 10-15 high-signal chunks post-reranking.
7. **No Retrieval Evaluation Pipeline**
   Shipping without retrieval metrics guarantees production failures. Implement automated evaluation with metrics like Recall@K, MRR, and Context Utilization. Track distribution shift monthly.
### Best Practices from Production
- Run retrieval evaluation separately from generation. A good retriever with a mediocre LLM outperforms a bad retriever with a state-of-the-art LLM.
- Implement query routing: classify intent before retrieval to select domain-specific embeddings and filters.
- Use late interaction models (ColBERT) when latency budget allows. They preserve token-level alignment without full cross-encoder cost.
- Monitor embedding drift. Retrain or swap models when cosine similarity distributions shift beyond 15%.
## Production Bundle
### Action Checklist
### Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Low-latency consumer app (<50ms p95) | Hybrid (RRF) + lightweight reranker | Balances precision with strict latency budgets | Low |
| Enterprise knowledge base | Hybrid + cross-encoder reranker + metadata filters | Maximizes precision for complex, domain-specific queries | Medium |
| Multi-domain platform | Query routing + domain-specific embeddings + hybrid | Prevents cross-domain interference and improves recall | High |
| Budget-constrained MVP | Dense + BM25 hybrid, skip reranker | 80% of precision gain at 20% of reranker cost | Low |
| Code/documentation retrieval | Multi-vector (code + prose) + late interaction | Captures structural and semantic signals in technical content | Medium-High |
### Configuration Template
/**
 * Tunable settings for each stage of the retrieval pipeline.
 *
 * NOTE(review): none of the classes shown alongside this template read these
 * values, so the field notes below are inferred from names and defaults —
 * confirm against the code that consumes the config.
 */
export interface RetrievalPipelineConfig {
// Embedding model settings.
embedding: {
model: string;
dimensions: number;
batch_size: number;
};
// Vector store selection and index tuning.
// `hnsw` presumably maps to the store's HNSW index parameters — TODO confirm.
vectorDB: {
provider: 'qdrant' | 'weaviate' | 'milvus';
collection: string;
hnsw: { m: number; ef_construction: number; ef_search: number };
};
// Sparse retrieval toggle.
// `k1` and `b` are presumably the BM25 term-saturation and length-normalisation parameters — TODO confirm.
sparse: {
enabled: boolean;
k1: number;
b: number;
};
// How dense and sparse rankings are merged.
// The default `rrf_constant` (60) is the same value the RRF fusion code uses.
fusion: {
strategy: 'rrf' | 'weighted';
k: number;
rrf_constant: number;
};
// Cross-encoder reranking stage.
reranker: {
enabled: boolean;
model: string;
batch_size: number;
async_fallback: boolean;
};
// Context compression stage.
// The default `min_relevance_threshold` (0.3) matches the cut-off used by ContextCompressor.
compression: {
deduplicate: boolean;
min_relevance_threshold: number;
max_context_tokens: number;
};
// Which retrieval metrics to record.
evaluation: {
track_recall_k: number[];
track_mrr: boolean;
log_context_utilization: boolean;
};
}
/** Baseline pipeline settings: Qdrant, RRF fusion, cross-encoder reranking and compression all enabled. */
export const defaultConfig: RetrievalPipelineConfig = {
  embedding: {
    model: 'text-embedding-3-small',
    dimensions: 1536,
    batch_size: 32
  },
  vectorDB: {
    provider: 'qdrant',
    collection: 'docs',
    hnsw: {
      m: 16,
      ef_construction: 100,
      ef_search: 64
    }
  },
  sparse: {
    enabled: true,
    k1: 1.2,
    b: 0.75
  },
  fusion: {
    strategy: 'rrf',
    k: 10,
    rrf_constant: 60
  },
  reranker: {
    enabled: true,
    model: 'cross-encoder/ms-marco-MiniLM-L-6-v2',
    batch_size: 32,
    async_fallback: true
  },
  compression: {
    deduplicate: true,
    min_relevance_threshold: 0.3,
    max_context_tokens: 4000
  },
  evaluation: {
    track_recall_k: [5, 10, 20],
    track_mrr: true,
    log_context_utilization: true
  }
};
### Quick Start Guide
1. **Initialize Dependencies**
   Install the vector client, embedding SDK, and reranker library. Configure environment variables for API keys and collection names.
   ```bash
   npm install @qdrant/js-client @huggingface/transformers lunr
   ```
2. **Seed the Corpus**
   Chunk documents using recursive splitting. Generate embeddings and upsert them to the vector database. Build the BM25 index from the raw text.
   ```typescript
   await pipeline.ingestDocuments(rawDocs);
   ```
3. **Deploy Hybrid Pipeline**
   Instantiate QueryTransformer, HybridRetriever, Reranker, and ContextCompressor. Wire them into an async request handler. Enable RRF fusion and cross-encoder reranking.
4. **Validate with Retrieval Metrics**
   Run the evaluation suite against held-out queries. Track Recall@10, MRR, and context utilization. Adjust fusion constants and reranker batch size based on p95 latency targets. Deploy to staging with canary traffic before production rollout.