ay<{ target: string; type: 'one-to-many' | 'many-to-one'; foreignKey: string }>;
temporalRules: { minStepInterval: number; maxStepInterval: number };
}
/**
 * Root shape of a domain ontology: a version label, the entity definitions,
 * per-KPI numeric boundaries, and the fixed four-level ISA-95 hierarchy tuple.
 */
export interface DomainOntology {
  /** Version label of this ontology definition. */
  version: string;

  /** Entity definitions that make up the domain model. */
  entities: Array<EntityDefinition>;

  /** Lower bound, upper bound, and unit label for each KPI, keyed by KPI name. */
  kpiBoundaries: Record<
    string,
    {
      min: number;
      max: number;
      unit: string;
    }
  >;

  /** Hierarchy levels, fixed to exactly these four literals in this order. */
  isa95Hierarchy: ['site', 'area', 'line', 'cell'];
}
/**
 * WorkOrder entity: keyed by `wo_id`, with a status enum, a 1-5 priority,
 * a creation timestamp, and a many-to-one relation to MaterialLot via `lot_id`.
 */
const workOrderEntity: DomainOntology['entities'][number] = {
  name: 'WorkOrder',
  primaryKey: 'wo_id',
  attributes: {
    wo_id: { type: 'string', constraints: ['uuid_v4'] },
    status: {
      type: 'enum',
      constraints: ['PENDING', 'IN_PROGRESS', 'COMPLETED', 'REJECTED'],
    },
    priority: { type: 'number', constraints: ['min:1', 'max:5'] },
    created_at: { type: 'timestamp' },
  },
  relations: [
    { target: 'MaterialLot', type: 'many-to-one', foreignKey: 'lot_id' },
  ],
  temporalRules: { minStepInterval: 60, maxStepInterval: 3600 },
};

/**
 * MaterialLot entity: keyed by `lot_id`, with a batch code, a non-negative
 * quantity, and an expiry timestamp. It declares no relations of its own.
 */
const materialLotEntity: DomainOntology['entities'][number] = {
  name: 'MaterialLot',
  primaryKey: 'lot_id',
  attributes: {
    lot_id: { type: 'string', constraints: ['uuid_v4'] },
    batch_code: { type: 'string' },
    quantity: { type: 'number', constraints: ['min:0'] },
    expiry_ts: { type: 'timestamp' },
  },
  relations: [],
  temporalRules: { minStepInterval: 300, maxStepInterval: 86400 },
};

/**
 * Manufacturing domain ontology (version 1.0.0): the two entities above plus
 * the KPI boundaries for throughput rate and defect ratio.
 */
export const ManufacturingOntology: DomainOntology = {
  version: '1.0.0',
  isa95Hierarchy: ['site', 'area', 'line', 'cell'],
  // Order matters to consumers that iterate entities: WorkOrder first, then MaterialLot.
  entities: [workOrderEntity, materialLotEntity],
  kpiBoundaries: {
    throughput_rate: { min: 120, max: 450, unit: 'units/hr' },
    defect_ratio: { min: 0.001, max: 0.045, unit: 'ratio' },
  },
};
**Why this works:** The ontology is not a passive schema. It is a runtime contract. Every attribute constraint, relational mapping, and temporal rule is explicitly typed. When the simulator consumes this module, it cannot generate invalid rows. When AI tools consume the same module, they receive strict parameter boundaries. Alignment is enforced at the type level, eliminating integration drift.
### Step 2: Build the Time-Stepped Simulator Engine
The simulator reads the ontology and generates causally coherent data streams. It respects relational dependencies and temporal intervals defined in the configuration.
```typescript
// simulator-engine.ts
import { ManufacturingOntology } from './domain-ontology';
/**
 * Time-stepped simulator that interprets an ontology at runtime and emits
 * one batch of generated records per `step()` call.
 *
 * Each entity is generated on the first tick and then again after a gap drawn
 * from its `temporalRules` (`minStepInterval`..`maxStepInterval`, inclusive).
 * Foreign keys of `many-to-one` relations are filled with the primary key of
 * an already generated target record, so generated rows can satisfy the
 * relational check instead of carrying random strings.
 *
 * Primary keys come from `crypto.randomUUID()` and timestamps from the wall
 * clock, so those two fields are not reproducible even with a seeded `random`.
 */
export class ManufacturingSimulator {
  /** Every generated record, keyed as `<entity name>:<primary key value>`. */
  private state: Map<string, any> = new Map();
  /** Number of completed `step()` calls. */
  private tick: number = 0;
  /** Earliest tick at which each entity (by name) is generated again. */
  private nextDueTick: Map<string, number> = new Map();
  /** Primary-key values generated so far, grouped by entity name. */
  private idsByEntity: Map<string, string[]> = new Map();

  /**
   * @param ontology Ontology whose `entities` drive generation.
   * @param random Source of numbers in [0, 1). Defaults to `Math.random`;
   *   pass a seeded generator to make intervals and attribute values reproducible.
   */
  constructor(
    private ontology: typeof ManufacturingOntology,
    private random: () => number = Math.random
  ) {}

  /**
   * Advances the simulation by one tick.
   *
   * @returns The records generated during this tick, in ontology entity order.
   * @throws Error when a generated record references a target that was never generated.
   */
  async step(): Promise<Record<string, any>[]> {
    const generated: Record<string, any>[] = [];
    for (const entity of this.ontology.entities) {
      if (!this.shouldGenerate(entity)) continue;

      const record = this.buildRecord(entity);
      this.validateAgainstOntology(entity, record);
      generated.push(record);

      const id = record[entity.primaryKey];
      this.state.set(`${entity.name}:${id}`, record);
      const knownIds = this.idsByEntity.get(entity.name);
      if (knownIds) {
        knownIds.push(id);
      } else {
        this.idsByEntity.set(entity.name, [id]);
      }
      this.scheduleNext(entity);
    }
    this.tick++;
    return generated;
  }

  /** True when the entity is due at the current tick; every entity is due at tick 0. */
  private shouldGenerate(entity: any): boolean {
    return this.tick >= (this.nextDueTick.get(entity.name) ?? 0);
  }

  /**
   * Draws the gap until the entity's next generation. The gap is an integer in
   * [minStepInterval, maxStepInterval] and never below 1, so the schedule
   * always moves forward.
   */
  private scheduleNext(entity: any): void {
    const { minStepInterval, maxStepInterval } = entity.temporalRules;
    const low = Math.max(1, Math.ceil(minStepInterval));
    const high = Math.max(low, Math.floor(maxStepInterval));
    // Clamp so a generator that returns exactly 1 cannot exceed `high`.
    const gap = Math.min(high, low + Math.floor(this.random() * (high - low + 1)));
    this.nextDueTick.set(entity.name, this.tick + gap);
  }

  /**
   * Builds one record: a fresh UUID primary key, generated values for plain
   * attributes, and resolved foreign keys for `many-to-one` relations.
   * A foreign key is left unset when no target record exists yet.
   */
  private buildRecord(entity: any): any {
    const record: any = { [entity.primaryKey]: crypto.randomUUID() };

    // Foreign-key name -> target entity name. The primary key is never overwritten.
    const fkTargets = new Map<string, string>();
    for (const rel of entity.relations ?? []) {
      if (rel.type === 'many-to-one' && rel.foreignKey !== entity.primaryKey) {
        fkTargets.set(rel.foreignKey, rel.target);
      }
    }

    for (const [key, def] of Object.entries(entity.attributes)) {
      if (key === entity.primaryKey || fkTargets.has(key)) continue;
      record[key] = this.generateValue(def);
    }

    for (const [fkKey, target] of fkTargets) {
      const parentId = this.pickExistingId(target);
      if (parentId !== undefined) record[fkKey] = parentId;
    }
    return record;
  }

  /** Picks the primary key of one already generated record of the given entity, if any. */
  private pickExistingId(entityName: string): string | undefined {
    const ids = this.idsByEntity.get(entityName);
    if (!ids || ids.length === 0) return undefined;
    const index = Math.min(ids.length - 1, Math.floor(this.random() * ids.length));
    return ids[index];
  }

  /**
   * Checks every relation whose foreign key is set on the record against the
   * records generated so far. Unset (falsy) foreign keys are not checked.
   *
   * @throws Error when the referenced target record does not exist.
   */
  private validateAgainstOntology(entity: any, record: any): void {
    for (const rel of entity.relations) {
      const fkKey = rel.foreignKey;
      if (record[fkKey] && !this.state.has(`${rel.target}:${record[fkKey]}`)) {
        throw new Error(`Relational constraint violated: ${rel.target} not found for ${fkKey}`);
      }
    }
  }

  /**
   * Generates a value for one attribute definition:
   * - `enum`: one of `constraints`, chosen uniformly
   * - `number`: a float between the `min:`/`max:` bounds (defaults 0 and 100)
   * - `timestamp`: the current time as an ISO-8601 string
   * - anything else: a `gen_`-prefixed random string
   *
   * @throws Error when an enum attribute declares no allowed values.
   */
  private generateValue(def: any): any {
    if (def.type === 'enum') {
      const options: unknown[] = def.constraints ?? [];
      if (options.length === 0) {
        throw new Error('Enum attribute requires at least one allowed value in constraints');
      }
      return options[Math.min(options.length - 1, Math.floor(this.random() * options.length))];
    }
    if (def.type === 'number') {
      const min = this.readBound(def.constraints, 'min', 0);
      // Never let the upper bound drop below the lower one (e.g. 'min:200' with no max).
      const max = Math.max(min, this.readBound(def.constraints, 'max', 100));
      return this.random() * (max - min) + min;
    }
    if (def.type === 'timestamp') {
      return new Date().toISOString();
    }
    return `gen_${this.random().toString(36).slice(2)}`;
  }

  /**
   * Reads a `min:<n>` / `max:<n>` constraint, keeping any fractional part.
   * Returns `fallback` when the constraint is absent or not a finite number.
   */
  private readBound(
    constraints: string[] | undefined,
    name: 'min' | 'max',
    fallback: number
  ): number {
    const prefix = `${name}:`;
    const raw = constraints?.find((c) => c.startsWith(prefix));
    if (raw === undefined) return fallback;
    const value = Number(raw.slice(prefix.length));
    return Number.isFinite(value) ? value : fallback;
  }
}
```
**Architecture Rationale:** The simulator does not hardcode business logic. It interprets the ontology at runtime. This means adding a new entity type or adjusting a KPI boundary requires zero changes to the simulation core. The engine remains stable while the domain model evolves. Temporal rules prevent unrealistic burst generation, ensuring time-series data reflects real manufacturing cadence.
### Step 3: Stream to Lakehouse via CDC
Raw simulation output is ingested into PostgreSQL, then captured via Change Data Capture (CDC) and materialized into an Apache Iceberg lakehouse. This decouples transactional simulation from analytical workloads.
-- PostgreSQL CDC Configuration (Debezium-style)
-- Source table for simulated work orders.
CREATE TABLE work_orders (
    wo_id       UUID PRIMARY KEY,
    -- Allowed values mirror the WorkOrder status enum declared in the ontology,
    -- so rows that drift from the ontology are rejected at insert time.
    status      VARCHAR(20) NOT NULL
                CHECK (status IN ('PENDING', 'IN_PROGRESS', 'COMPLETED', 'REJECTED')),
    -- Same 1-5 range as the ontology's priority constraints.
    priority    INT CHECK (priority BETWEEN 1 AND 5),
    -- Nullable: a work order may be stored without a material lot.
    lot_id      UUID,
    created_at  TIMESTAMPTZ DEFAULT NOW()
);
-- Iceberg table mapping
-- Columns mirror the PostgreSQL work_orders table; the UUID columns are
-- carried as STRING here.
CREATE TABLE iceberg.manufacturing.work_orders (
    wo_id       STRING,
    status      STRING,
    priority    INT,
    lot_id      STRING,
    created_at  TIMESTAMP
)
USING ICEBERG
TBLPROPERTIES (
    'format-version' = '2',
    'write.format.default' = 'parquet'
);
Why this matters: CDC ensures exactly-once ingestion semantics. Iceberg provides schema evolution and time-travel capabilities, which are critical when replaying simulation seeds for calibration experiments. The 60-run calibration matrix (10 seeds across 6 templates) relies on deterministic replayability, which only a versioned lakehouse can guarantee.
Analytics and AI agents consume a star schema optimized for parameterized tool calls. Fact tables store operational metrics; dimension tables store contextual metadata.
-- Star Schema Materialization
-- One row per work order in status IN_PROGRESS or COMPLETED, joined to its
-- material lot. The inner join drops work orders whose lot_id is NULL or has
-- no matching lot.
-- NOTE(review): DATEDIFF('second', ...) does not appear to be PostgreSQL
--   syntax; confirm which engine this view is meant to run on.
-- NOTE(review): lot.created_at is not among the MaterialLot attributes in the
--   ontology (lot_id, batch_code, quantity, expiry_ts); confirm that the
--   material_lots table actually has this column.
CREATE VIEW analytics.fact_production_events AS
SELECT
    wo.wo_id,
    wo.created_at AS event_timestamp,
    wo.priority,
    lot.quantity AS material_volume,
    DATEDIFF('second', lot.created_at, wo.created_at) AS lead_time_seconds
FROM work_orders wo
INNER JOIN material_lots lot
    ON lot.lot_id = wo.lot_id
WHERE wo.status IN ('IN_PROGRESS', 'COMPLETED');
AI tools query this view using strict parameter boundaries defined in the ontology. The schema guarantees that every tool invocation receives structurally valid inputs, eliminating the parameter fabrication observed in unconstrained testing.
Pitfall Guide
1. Decoupling Schema from Simulation Logic
Explanation: Teams maintain separate schema definitions for the database, API contracts, and validation scripts. Over time, updates to one layer drift from the others.
Fix: Enforce single-source consumption. The ontology module must be the only source of truth. Import it directly into the simulator, validation layer, and AI tool parameter resolvers.
2. Ignoring Temporal Causality
Explanation: Generating rows independently without respecting time-step intervals produces unrealistic data bursts. AI agents trained on non-causal data fail to model real manufacturing latency.
Fix: Implement explicit temporalRules in the ontology. The simulator must respect minimum and maximum step intervals per entity type. Use deterministic seeding for reproducible time-series generation.
3. Applying Rigid Constraints Without Semantic Flexibility
Explanation: Applying rigid schema constraints without allowing semantic flexibility causes agents to reject valid but novel operational scenarios.
Fix: Separate structural constraints (types, relations, boundaries) from semantic flexibility. Allow the LLM to reason within the ontology's parameter space, but enforce strict validation at the tool-calling boundary. Use JSON Schema or TypeScript zod validators at runtime.
4. Bypassing CDC for Lakehouse Ingestion
Explanation: Direct batch inserts into Iceberg or Parquet files break exactly-once semantics and complicate replayability.
Fix: Route all simulation output through PostgreSQL first. Use a CDC connector (Debezium, Flink CDC, or Airbyte) to stream changes to Iceberg. This preserves transactional integrity and enables deterministic seed replay.
5. Static KPI Validation
Explanation: Hardcoding acceptable KPI ranges in test scripts instead of deriving them from the ontology.
Fix: Store KPI boundaries in the configuration module. Validation scripts should import these boundaries dynamically. This allows calibration experiments to adjust ranges per domain template without code changes.
6. Neglecting ISA-95 Hierarchy Mapping
Explanation: Generating flat entity lists without respecting the site → area → line → cell hierarchy.
Fix: Explicitly declare the hierarchy in the ontology. The simulator must enforce parent-child relationships during record generation. AI tools should resolve context by traversing the hierarchy, not by assuming flat relationships.
7. Skipping Seed Calibration
Explanation: Running simulation once and assuming output represents production variance.
Fix: Execute calibration matrices (e.g., 60 runs, 10 seeds per template). Verify that observed KPIs fall within configured boundaries across all seeds. Document variance thresholds and reject seeds that exceed them.
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Single-domain AI validation | Ontology-driven synthesis with PostgreSQL + Iceberg | Guarantees schema compliance and temporal causality | Low (single template, minimal infra) |
| Multi-domain validation (6+ verticals) | Parameterized ontology with shared simulator core | Identical framework code across aerospace, pharma, automotive, etc. | Medium (higher seed calibration overhead) |
| Real-time agent testing | CDC stream to Iceberg + materialized star schema | Enables low-latency analytics without blocking simulation | High (requires Flink/Debezium + compute) |
| Rapid prototyping / PoC | Static JSON fixtures with manual validation | Faster initial setup, acceptable for non-production demos | Low (high technical debt long-term) |
Configuration Template
// production-ontology-config.ts
/**
 * Configuration template for a production ontology (version 2.1.0).
 *
 * Every relation target must name an entity declared in `entities`, and a
 * parent entity is listed before the entities that reference it so that
 * parents exist by the time children are generated. `ProductionLine` is the
 * parent that `ProductionBatch.line_id` points at; its attributes and
 * temporal rules are starting values to adjust for the target domain.
 */
export const ProductionOntology = {
  version: '2.1.0',
  isa95Hierarchy: ['site', 'area', 'line', 'cell'],
  entities: [
    {
      // Parent of ProductionBatch; declared first so it exists before any batch.
      name: 'ProductionLine',
      primaryKey: 'line_id',
      attributes: {
        line_id: { type: 'string', constraints: ['uuid_v4'] },
        line_code: { type: 'string' }
      },
      relations: [],
      temporalRules: { minStepInterval: 3600, maxStepInterval: 86400 }
    },
    {
      name: 'ProductionBatch',
      primaryKey: 'batch_id',
      attributes: {
        batch_id: { type: 'string', constraints: ['uuid_v4'] },
        line_id: { type: 'string' },
        target_quantity: { type: 'number', constraints: ['min:100', 'max:50000'] },
        actual_quantity: { type: 'number', constraints: ['min:0'] },
        yield_pct: { type: 'number', constraints: ['min:0', 'max:100'] },
        start_ts: { type: 'timestamp' },
        end_ts: { type: 'timestamp' }
      },
      relations: [
        { target: 'ProductionLine', type: 'many-to-one', foreignKey: 'line_id' }
      ],
      temporalRules: { minStepInterval: 120, maxStepInterval: 7200 }
    }
  ],
  kpiBoundaries: {
    overall_equipment_effectiveness: { min: 65, max: 89, unit: 'percent' },
    cycle_time_variance: { min: 0.02, max: 0.15, unit: 'ratio' },
    scrap_rate: { min: 0.005, max: 0.03, unit: 'ratio' }
  },
  validationRules: {
    requireParentLine: true,
    enforceYieldCap: true,
    rejectNegativeQuantities: true
  }
};
Quick Start Guide
- Initialize the Ontology Module: Copy the configuration template into your project. Adjust entity definitions, constraints, and KPI boundaries to match your target domain.
- Deploy the Simulator: Instantiate the `ManufacturingSimulator` class with the ontology. Run a dry pass to verify relational constraint enforcement and temporal interval compliance.
- Configure CDC Pipeline: Set up PostgreSQL, create the target tables, and deploy a CDC connector to stream changes to an Iceberg catalog. Verify exactly-once ingestion with a small seed run.
- Materialize Analytics View: Execute the star schema DDL. Query the materialized view to confirm that AI tool parameters align with ontology boundaries.
- Run Calibration Matrix: Execute 10 simulation seeds per domain template. Validate that observed KPIs fall within configured ranges. Document variance thresholds and lock the ontology version for production validation.
This architecture transforms synthetic data generation from a fragile mocking exercise into a deterministic, schema-guaranteed validation pipeline. By treating the configuration module as the execution contract, teams eliminate integration drift, suppress LLM parameter hallucination, and scale AI validation across discrete manufacturing domains without exposing proprietary operational data.