lConfig {
id: string;
provider: string;
params: Record<string, unknown>; // temperature, max_tokens, etc.
}
/**
 * Execution settings for a benchmark run, read by BenchmarkRunner via
 * `config.options`.
 */
export interface ExecutionOptions {
// Passed to p-limit in BenchmarkRunner.runModelEvaluation to cap how many
// item evaluations are in flight at once for a given model.
concurrency: number;
// NOTE(review): not read by the runner code shown here — presumably the
// number of times each evaluation is repeated; confirm where it is consumed.
repetitions: number;
// NOTE(review): not read by the runner code shown here — presumably a
// per-request timeout in milliseconds; confirm where it is enforced.
timeoutMs: number;
// Fixed seed for reproducibility. The BenchmarkRunner constructor throws when
// it is missing, and the value is forwarded to every provider call.
seed: number;
}
**2. Implement the Benchmark Runner**
The runner orchestrates dataset ingestion, parallel inference, and metric calculation.
```typescript
import { v4 as uuidv4 } from 'uuid';
/**
 * Orchestrates a benchmark run: loads the dataset, evaluates every item
 * against every configured model with bounded concurrency, and assembles the
 * per-model results into a report.
 *
 * NOTE(review): `generateSummary`, `aggregateResults` and `calculateCost` are
 * called on `this` but are not defined in the class as shown — confirm they
 * are provided before relying on this code.
 */
export class BenchmarkRunner {
  private readonly config: BenchmarkConfig;
  private results: BenchmarkResult[] = [];

  /**
   * @param config Benchmark definition (models, dataset, metrics, options).
   * @throws Error when `config.options.seed` is not a finite number.
   */
  constructor(config: BenchmarkConfig) {
    this.config = config;
    // Enforce deterministic seeds for reproducibility. A truthiness check
    // would reject 0, which is a perfectly valid seed, so test the type.
    const seed: unknown = config.options.seed;
    if (typeof seed !== 'number' || !Number.isFinite(seed)) {
      throw new Error('Benchmark config must include a fixed seed.');
    }
  }

  /**
   * Evaluates all configured models in parallel and builds the report.
   *
   * @returns The report with a fresh UUID, ISO timestamp, wall-clock duration,
   *          per-model evaluations and a summary.
   */
  async execute(): Promise<BenchmarkReport> {
    const startTime = Date.now();
    const modelResults = await Promise.all(
      this.config.models.map(model => this.runModelEvaluation(model))
    );
    return {
      id: uuidv4(),
      benchmarkName: this.config.name,
      timestamp: new Date().toISOString(),
      durationMs: Date.now() - startTime,
      models: modelResults,
      summary: this.generateSummary(modelResults)
    };
  }

  /**
   * Runs every dataset item through one model, keeping at most
   * `options.concurrency` evaluations in flight, then aggregates the results.
   */
  private async runModelEvaluation(model: ModelConfig): Promise<ModelEvaluation> {
    const items = await this.config.dataset.load();
    // Parallel execution with concurrency control
    const { default: pLimit } = await import('p-limit');
    const limit = pLimit(this.config.options.concurrency);
    const results = await Promise.all(
      items.map(item => limit(() => this.evaluateItem(model, item)))
    );
    return this.aggregateResults(model, results);
  }

  /**
   * Runs inference for a single item and scores the response with every
   * configured metric. Failures (inference or metric) are captured in the
   * returned result rather than thrown, so one bad item does not abort the run.
   */
  private async evaluateItem(model: ModelConfig, item: DatasetItem): Promise<ItemResult> {
    const requestStart = performance.now();
    try {
      const response = await this.inference(model, item.prompt);
      const latency = performance.now() - requestStart;
      // Expose the measured latency to metrics: latency-based metrics read
      // `metadata.latency`, which the dataset item itself cannot know.
      const metricMetadata = { ...item.metadata, latency };
      const metrics = await Promise.all(
        this.config.metrics.map(metric =>
          metric.calculate(item.groundTruth, response, metricMetadata)
        )
      );
      return {
        itemId: item.id,
        modelId: model.id,
        response,
        latency,
        cost: this.calculateCost(model, item.prompt, response),
        metrics
      };
    } catch (error: unknown) {
      // Anything can be thrown in JavaScript; only Error instances are
      // guaranteed to carry a `message`.
      const message = error instanceof Error ? error.message : String(error);
      return {
        itemId: item.id,
        modelId: model.id,
        error: message,
        latency: performance.now() - requestStart,
        metrics: []
      };
    }
  }

  /**
   * Sends the prompt to the model's provider. The configured seed is spread
   * last so it overrides any `seed` present in `model.params`.
   */
  private async inference(model: ModelConfig, prompt: string): Promise<string> {
    // Abstract provider call
    const provider = ProviderFactory.get(model.provider);
    return provider.complete({
      model: model.id,
      prompt,
      ...model.params,
      seed: this.config.options.seed
    });
  }
}
```

**3. Metric Implementation Strategy**
Metrics must handle both exact and fuzzy comparisons.
/**
 * Contract for a single evaluation metric.
 */
export interface MetricDefinition {
// Identifier of the metric; the built-in metrics repeat it as the `name` of
// the score they return.
name: string;
// Scores one prediction. BenchmarkRunner.evaluateItem calls this with the
// item's ground truth, the model response, and item-level metadata, and
// awaits the returned promise.
calculate: (
groundTruth: string,
prediction: string,
metadata: Record<string, unknown>
) => Promise<MetricScore>;
}
/**
 * Scores 1.0 when the prediction equals the ground truth after trimming
 * surrounding whitespace on both sides, and 0.0 otherwise. The comparison is
 * case-sensitive.
 */
export const ExactMatchMetric: MetricDefinition = {
  name: 'exact_match',
  async calculate(expected, actual) {
    const isIdentical = expected.trim() === actual.trim();
    let score = 0.0;
    if (isIdentical) {
      score = 1.0;
    }
    return { name: 'exact_match', score, weight: 1.0 };
  }
};
/**
 * Per-item latency threshold check: scores 1.0 when `metadata.latency` is a
 * number no greater than the threshold, and 0.0 otherwise (including when no
 * numeric latency is supplied).
 *
 * The threshold is read from `metadata.latencyThreshold` and falls back to
 * 2000 when that value is absent or not a finite number. An explicit
 * threshold of 0 is honored.
 *
 * NOTE(review): despite the name, this does not compute a percentile; the
 * latency distribution is aggregated at the model level.
 */
export const LatencyP95Metric: MetricDefinition = {
  name: 'latency_p95',
  calculate: async (_, __, metadata) => {
    // Metadata values are `unknown`, so narrow before comparing. Using `||`
    // for the default would silently replace a legitimate threshold of 0.
    const configured = metadata.latencyThreshold;
    const threshold =
      typeof configured === 'number' && Number.isFinite(configured) ? configured : 2000;
    const latency = metadata.latency;
    const withinThreshold = typeof latency === 'number' && latency <= threshold;
    return {
      name: 'latency_p95',
      score: withinThreshold ? 1.0 : 0.0,
      weight: 1.0
    };
  }
};
**4. Aggregation and Statistical Analysis**
Raw scores are insufficient. The system must compute confidence intervals.
/**
 * Estimates a percentile-bootstrap confidence interval for the mean of
 * `scores`.
 *
 * Each iteration resamples `scores.length` values with replacement and
 * records the resample mean; the interval bounds are read from the sorted
 * resample means at the (1 - confidence) / 2 and (1 + confidence) / 2
 * positions.
 *
 * @param scores     Observed scores. An empty array yields NaN for all fields.
 * @param confidence Confidence level in (0, 1]; defaults to 0.95.
 * @param iterations Number of bootstrap resamples (positive integer);
 *                   defaults to 1000.
 * @param rng        Source of uniform random numbers in [0, 1). Defaults to
 *                   `Math.random`; pass a seeded generator to make the
 *                   interval reproducible.
 * @returns The lower/upper interval bounds and the plain sample mean.
 * @throws RangeError when `confidence` or `iterations` is out of range.
 */
function computeBootstrapConfidenceInterval(
  scores: readonly number[],
  confidence: number = 0.95,
  iterations: number = 1000,
  rng: () => number = Math.random
): { lower: number; upper: number; mean: number } {
  if (!(confidence > 0 && confidence <= 1)) {
    throw new RangeError('confidence must be in the range (0, 1].');
  }
  if (!Number.isInteger(iterations) || iterations < 1) {
    throw new RangeError('iterations must be a positive integer.');
  }

  const n = scores.length;
  if (n === 0) {
    // Nothing to resample; make the undefined result explicit.
    return { lower: NaN, upper: NaN, mean: NaN };
  }

  const bootMeans: number[] = [];
  for (let i = 0; i < iterations; i++) {
    let sum = 0;
    for (let j = 0; j < n; j++) {
      // Clamp so a generator that returns exactly 1 cannot index past the end.
      const idx = Math.min(n - 1, Math.floor(rng() * n));
      sum += scores[idx];
    }
    bootMeans.push(sum / n);
  }
  bootMeans.sort((a, b) => a - b);

  // With confidence = 1 the raw upper index equals `iterations`, one past the
  // last element, so both indices are clamped into the valid range.
  const lastIdx = iterations - 1;
  const lowerIdx = Math.min(lastIdx, Math.floor(((1 - confidence) / 2) * iterations));
  const upperIdx = Math.min(lastIdx, Math.floor(((1 + confidence) / 2) * iterations));

  return {
    lower: bootMeans[lowerIdx],
    upper: bootMeans[upperIdx],
    mean: scores.reduce((a, b) => a + b, 0) / n
  };
}
Pitfall Guide
1. Data Contamination in Evaluation Sets
- Mistake: Using public datasets or internally generated data that was later included in model pre-training.
- Impact: Scores are artificially inflated. Model selection decisions are based on memorization, not reasoning.
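- Remediation: Use holdout datasets created after the model's knowledge cutoff. Implement contamination detection that checks the evaluation set for overlap with known training data, and compare each dataset's creation date against the knowledge cutoff stated in the model's release notes.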
2. Prompt Leakage and Variance
- Mistake: Evaluating models with a single prompt template or leaking ground truth hints into the prompt.
- Impact: Results do not generalize. Models may exploit prompt artifacts rather than understanding the task.
- Remediation: Use prompt templates with randomized few-shot examples. Run evaluations with multiple prompt variations to measure stability.
3. LLM-as-a-Judge Bias
- Mistake: Using a judge model to evaluate outputs without calibrating for position bias, verbosity bias, or self-preference.
- Impact: Skewed scores favoring models from the same family as the judge or longer responses.
- Remediation: Randomize output order in judge prompts. Use multiple diverse judges. Calibrate judges against human-labeled validation sets.
4. Ignoring Latency and Cost Distributions
- Mistake: Optimizing solely for mean accuracy or mean latency.
- Impact: P95 latency spikes cause timeout errors in production. Cost per query exceeds budget constraints.
- Remediation: Track full latency distributions. Enforce cost budgets in the evaluation criteria. Include error rates in the composite score.
5. Statistical Noise and Small Sample Sizes
- Mistake: Drawing conclusions from benchmarks with fewer than 100 samples or single runs.
- Impact: Decisions based on random variance. False positives in model improvements.
- Remediation: Calculate confidence intervals. Require minimum sample sizes based on expected effect size. Use bootstrapping for robustness.
6. Metric Gaming
- Mistake: Designing metrics that models can exploit without improving actual utility.
- Impact: Models optimize for the metric but degrade user experience.
- Remediation: Align metrics with business KPIs. Include human review for critical metrics. Monitor for distribution shifts in outputs.
7. Context Window Mismatch
- Mistake: Evaluating models with context lengths that differ from production usage.
- Impact: Models appear capable in benchmarks but fail when processing full production documents.
- Remediation: Ensure benchmark inputs reflect production context distributions. Test with truncated and full contexts.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| MVP Development | Static Eval Suite + Public Leaderboards | Speed of iteration is critical; high fidelity not yet required. | Low |
| Model Migration | Hybrid Regression + Shadow Testing | Ensures no regression in key metrics while validating on live traffic. | Medium |
| Fine-Tuning Loop | Automated Regression Benchmarks | Rapid feedback on hyperparameter changes and dataset updates. | Low (Automated) |
| Critical Production | Dynamic Shadowing + Human Review | Zero tolerance for degradation; requires highest fidelity and safety. | High |
| Cost Optimization | Latency/Cost Weighted Benchmarking | Prioritizes efficiency metrics to reduce infrastructure spend. | Low |
Configuration Template
# NOTE(review): the nesting below is reconstructed from key order — confirm it
# against the schema the config loader expects.
benchmark:
  name: "production-customer-support-v2"
  version: "1.0.0"
  seed: 42

dataset:
  type: "s3"
  path: "s3://eval-bucket/datasets/support-v2.jsonl"
  schema:
    prompt: "string"
    ground_truth: "string"
    category: "string"

models:
  - id: "llama-3-70b"
    provider: "vllm"
    params:
      temperature: 0.1
      max_tokens: 512
  - id: "gpt-4-turbo"
    provider: "openai"
    params:
      temperature: 0.1
      max_tokens: 512

metrics:
  - name: "exact_match"
    weight: 0.3
  - name: "llm_judge_accuracy"
    params:
      judge_model: "gpt-4-mini"
      calibration_set: "s3://eval-bucket/calibration/support.jsonl"
    weight: 0.4
  - name: "latency_p95"
    threshold_ms: 1500
    weight: 0.3

execution:
  concurrency: 10
  repetitions: 3
  timeout_ms: 5000

reporting:
  format: "json"
  output: "s3://eval-bucket/results/"
  alert_threshold:
    score_drop: 0.05
    latency_increase: 200
Quick Start Guide
- Initialize Project:
# Create the project directory and switch into it.
mkdir ai-benchmark && cd ai-benchmark
# Generate a package.json, accepting all defaults.
npm init -y
# Install the benchmark engine together with the TypeScript compiler.
npm install @codcompass/benchmark-engine typescript
- Create Config:
Copy the YAML template above to benchmark.yaml. Update dataset paths and model IDs.
- Run Evaluation:
// Minimal driver: build a runner from the config, execute it, print the summary.
import { BenchmarkRunner } from '@codcompass/benchmark-engine';
// NOTE(review): importing a .yaml file directly needs bundler/loader support —
// confirm the toolchain provides it, or parse the file explicitly.
import config from './benchmark.yaml';
const runner = new BenchmarkRunner(config);
// NOTE: top-level await only works in an ES module context.
const report = await runner.execute();
console.log(report.summary);
- Analyze Results:
Review the JSON report for confidence intervals, latency distributions, and metric breakdowns. Compare models against the decision matrix criteria.
- Automate:
Add a GitHub Action to run the benchmark on PR merges. Block merges if regression thresholds are exceeded.
# .github/workflows/benchmark.yml
name: AI Model Benchmark
on:
  push:
    branches: [main]
  # Also run on pull requests so a regression can fail the check before merge.
  pull_request:
    branches: [main]
jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm run benchmark
      # Upload the report even when the benchmark step fails, so the
      # regression can be inspected.
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: benchmark-report
          path: results.json
This framework transforms AI model benchmarking from a subjective, static exercise into a rigorous, automated engineering process. By implementing production-grade evaluation, teams can make data-driven model selections that optimize for accuracy, latency, cost, and user experience simultaneously.