mpletionAgent Agent { get; }
/// <summary>
/// Creates a specialist backed by its own clone of <paramref name="baseKernel"/> whose
/// plugin set is exactly <paramref name="allowedPlugins"/>.
/// </summary>
/// <param name="role">Role label; stored in <c>Role</c> and used to build the agent name <c>{role}Specialist</c>.</param>
/// <param name="baseKernel">Kernel that is cloned for this specialist.</param>
/// <param name="instructions">Instructions assigned to the underlying chat-completion agent.</param>
/// <param name="allowedPlugins">
/// The complete allow-list of plugins for this specialist. Plugins already registered on
/// <paramref name="baseKernel"/> are NOT inherited.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="baseKernel"/> or <paramref name="allowedPlugins"/> is <c>null</c>.
/// </exception>
public SpecialistAgent(string role, Kernel baseKernel, string instructions, params KernelPlugin[] allowedPlugins)
{
    ArgumentNullException.ThrowIfNull(baseKernel);
    ArgumentNullException.ThrowIfNull(allowedPlugins);

    Role = role;
    Kernel = baseKernel.Clone();

    // Kernel.Clone() copies the base kernel's plugin collection. Without clearing it, every
    // specialist would silently inherit whatever tools the base kernel carries, and adding a
    // plugin with an already-registered name would throw. Clearing makes the parameter a
    // true allow-list.
    Kernel.Plugins.Clear();
    foreach (var plugin in allowedPlugins)
    {
        Kernel.Plugins.Add(plugin);
    }

    Agent = new ChatCompletionAgent
    {
        Name = $"{role}Specialist",
        Instructions = instructions,
        Kernel = Kernel,
        // Agent.Arguments is a KernelArguments; execution settings must be wrapped in it
        // rather than assigned directly.
        Arguments = new KernelArguments(new PromptExecutionSettings
        {
            FunctionChoiceBehavior = FunctionChoiceBehavior.Auto()
        })
    };
}
}
**Architecture Rationale:** Kernel cloning ensures each specialist has an independent execution context. Tool allow-lists prevent privilege escalation. Structured instructions enforce output contracts, which simplifies downstream parsing.
### Step 2: Implement Parallel Investigation
Incident triage requires simultaneous data gathering. We use Semantic Kernel's concurrent orchestration to fan out investigations across log, metric, and knowledge specialists.
```csharp
using Microsoft.SemanticKernel.Agents.Orchestration.Concurrent;
using Microsoft.SemanticKernel.Agents.Runtime.InProcess;
/// <summary>
/// Fans a single incident payload out to three specialists (log, metric, knowledge) through
/// a <c>ConcurrentOrchestration</c> and returns their findings.
/// </summary>
public class ParallelInvestigationPipeline
{
    /// <summary>Upper bound on how long <see cref="ExecuteAsync"/> waits for the combined result.</summary>
    private static readonly TimeSpan ResultTimeout = TimeSpan.FromSeconds(45);

    private readonly SpecialistAgent _logAnalyst;
    private readonly SpecialistAgent _metricObserver;
    private readonly SpecialistAgent _knowledgeRetriever;

    /// <summary>
    /// Builds the three investigation specialists, each with a single plugin.
    /// </summary>
    /// <param name="baseKernel">Kernel handed to every <c>SpecialistAgent</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="baseKernel"/> is <c>null</c>.</exception>
    public ParallelInvestigationPipeline(Kernel baseKernel)
    {
        ArgumentNullException.ThrowIfNull(baseKernel);

        _logAnalyst = new SpecialistAgent("Log", baseKernel,
            "Analyze error logs. Return JSON: { error_pattern: string, affected_services: string[], confidence: float }",
            LogQueryPlugin.Create());
        _metricObserver = new SpecialistAgent("Metric", baseKernel,
            "Correlate CPU, memory, and latency spikes. Return JSON: { anomaly_type: string, threshold_breach: bool, timeline: string }",
            PrometheusQueryPlugin.Create());
        _knowledgeRetriever = new SpecialistAgent("Knowledge", baseKernel,
            "Search runbooks and past incidents. Return JSON: { matching_runbook: string, historical_resolution: string, relevance_score: float }",
            VectorStorePlugin.Create());
    }

    /// <summary>
    /// Runs all three specialists against <paramref name="incidentPayload"/> concurrently.
    /// </summary>
    /// <param name="incidentPayload">Incident description passed as the orchestration input.</param>
    /// <param name="ct">Token forwarded to runtime start-up, invocation, and result retrieval.</param>
    /// <returns>The findings array produced by the orchestration.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="incidentPayload"/> is <c>null</c>.</exception>
    public async Task<string[]> ExecuteAsync(string incidentPayload, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(incidentPayload);

        var orchestration = new ConcurrentOrchestration(
            _logAnalyst.Agent,
            _metricObserver.Agent,
            _knowledgeRetriever.Agent);

        // Disposed on every exit path, including a timeout thrown by GetValueAsync.
        await using var runtime = new InProcessRuntime();
        await runtime.StartAsync(ct);

        var result = await orchestration.InvokeAsync(incidentPayload, runtime, ct);
        var findings = await result.GetValueAsync(ResultTimeout, ct);

        // RunUntilIdleAsync is parameterless; it does not accept a cancellation token.
        await runtime.RunUntilIdleAsync();
        return findings;
    }
}
```

**Architecture Rationale:** Parallel execution reduces investigation latency from ~12 seconds (sequential) to ~4 seconds. Timeout boundaries prevent runaway agents. The InProcessRuntime manages lifecycle and cancellation propagation.
### Step 3: Consensus Routing with Human Gate
Investigation findings often conflict. A group-chat debate orchestrator forces specialists to reconcile discrepancies. A lead agent terminates the discussion when confidence thresholds are met. Remediation requires explicit human approval before any state-changing tool executes.
using Microsoft.SemanticKernel.Agents.Orchestration.GroupChat;
/// <summary>
/// Runs a round-robin group chat between a diagnostician, an archivist, and a lead arbiter
/// to turn investigation findings into a single root-cause hypothesis.
/// </summary>
public class ConsensusRouter
{
    private readonly SpecialistAgent _diagnostician;
    private readonly SpecialistAgent _knowledgeArchivist;
    private readonly SpecialistAgent _leadArbiter;

    /// <summary>
    /// Builds the three debate participants. The lead arbiter is given no plugins.
    /// </summary>
    /// <param name="baseKernel">Kernel handed to every <c>SpecialistAgent</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="baseKernel"/> is <c>null</c>.</exception>
    public ConsensusRouter(Kernel baseKernel)
    {
        ArgumentNullException.ThrowIfNull(baseKernel);

        _diagnostician = new SpecialistAgent("Diagnostic", baseKernel,
            "Evaluate investigation findings. Identify root cause. Flag uncertainties.",
            DiagnosticTools.Create());
        _knowledgeArchivist = new SpecialistAgent("Archivist", baseKernel,
            "Cross-reference findings with historical incidents. Challenge unsupported conclusions.",
            KnowledgeBaseTools.Create());
        _leadArbiter = new SpecialistAgent("Lead", baseKernel,
            "Moderate debate. Terminate when consensus confidence > 0.85. Output final hypothesis JSON.",
            Array.Empty<KernelPlugin>());
    }

    /// <summary>
    /// Feeds the findings to the group chat and returns the resulting hypothesis.
    /// </summary>
    /// <param name="findings">Findings from the investigation stage; joined with <c>---</c> separators in the prompt.</param>
    /// <param name="ct">Token forwarded to runtime start-up, invocation, and result retrieval.</param>
    /// <returns>The value produced by the group-chat orchestration.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="findings"/> is <c>null</c>.</exception>
    public async Task<string> ResolveAsync(string[] findings, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(findings);

        var debate = new GroupChatOrchestration(
            // The invocation cap is the only hard stop in this configuration.
            new RoundRobinGroupChatManager { MaximumInvocationCount = 8 },
            _diagnostician.Agent,
            _knowledgeArchivist.Agent,
            _leadArbiter.Agent)
        {
            // Echo every agent turn to the console as it arrives.
            ResponseCallback = async msg =>
            {
                Console.WriteLine($"[{msg.AuthorName}] {msg.Content?.Trim()}");
                await Task.CompletedTask;
            }
        };

        var joinedFindings = string.Join("\n---\n", findings);
        var prompt = $"Investigation Results:\n{joinedFindings}\n\nReconcile discrepancies and produce a single root-cause hypothesis.";

        // The runtime must be started before it will process messages, and disposed
        // afterwards; an inline, never-started instance leaves the orchestration stalled
        // until the timeout fires.
        await using var runtime = new InProcessRuntime();
        await runtime.StartAsync(ct);

        var result = await debate.InvokeAsync(prompt, runtime, ct);
        var hypothesis = await result.GetValueAsync(TimeSpan.FromSeconds(60), ct);

        // Let in-flight messages drain before the runtime is disposed.
        await runtime.RunUntilIdleAsync();
        return hypothesis;
    }
}
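**Architecture Rationale:** Group chat enables adversarial validation, reducing false positives. In the code above, the hard stop is `MaximumInvocationCount = 8` on the round-robin manager, which prevents infinite loops; the lead arbiter's confidence threshold is only an instruction to the model, so enforcing it as a termination condition requires a custom `GroupChatManager`. Human-in-the-loop gates are mandatory for any agent with write capabilities.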
### Step 4: Execution & Communication Handoff
Once approved, the system routes to a remediation specialist and a communications specialist. Context is passed explicitly; no implicit state sharing.
/// <summary>
/// Console-based human approval gate followed by a single remediation-agent invocation.
/// </summary>
public class ExecutionHandoff
{
    // Matches a JSON "verification" property set to true regardless of the whitespace
    // around the colon ("verification":true, "verification" : true, ...).
    private static readonly System.Text.RegularExpressions.Regex VerificationSucceeded =
        new System.Text.RegularExpressions.Regex(
            "\"verification\"\\s*:\\s*true\\b",
            System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    /// <summary>
    /// Asks the operator to approve <paramref name="hypothesis"/> and, if approved, runs the
    /// remediation agent on a fresh, empty <c>Kernel</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): an empty <c>Kernel</c> has no chat-completion service registered, so the
    /// agent invocation is expected to fail on this path. Prefer the overload that accepts a
    /// configured kernel; this signature is kept for existing callers.
    /// </remarks>
    /// <param name="hypothesis">Proposed action shown to the operator and sent to the agent.</param>
    /// <param name="ct">Token forwarded to the agent invocation.</param>
    /// <returns><c>true</c> only when the operator approved and the agent reported successful verification.</returns>
    public static Task<bool> ApproveAndExecuteAsync(string hypothesis, CancellationToken ct = default)
        => ApproveAndExecuteAsync(hypothesis, new Kernel(), ct);

    /// <summary>
    /// Asks the operator to approve <paramref name="hypothesis"/> and, if approved, runs the
    /// remediation agent on a clone of <paramref name="baseKernel"/>.
    /// </summary>
    /// <param name="hypothesis">Proposed action shown to the operator and sent to the agent.</param>
    /// <param name="baseKernel">Kernel with the model services the remediation agent should use.</param>
    /// <param name="ct">Token forwarded to the agent invocation.</param>
    /// <returns>
    /// <c>false</c> when the operator does not answer "y" (case-insensitive) or when the agent's
    /// final message does not contain a <c>"verification": true</c> property; otherwise <c>true</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="hypothesis"/> or <paramref name="baseKernel"/> is <c>null</c>.
    /// </exception>
    public static async Task<bool> ApproveAndExecuteAsync(string hypothesis, Kernel baseKernel, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(hypothesis);
        ArgumentNullException.ThrowIfNull(baseKernel);

        Console.WriteLine("=== REMEDIATION GATE ===");
        Console.WriteLine($"Proposed Action: {hypothesis}");
        Console.Write("Approve execution? (y/n): ");

        // Ordinal, case-insensitive compare instead of a culture-sensitive ToLower().
        // A null read (end of input) is treated as a refusal.
        var approval = Console.ReadLine()?.Trim();
        if (!string.Equals(approval, "y", StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        var remediationAgent = new SpecialistAgent("Remediation", baseKernel,
            "Execute approved fix. Return JSON: { status: string, action_taken: string, verification: bool }",
            RemediationTools.Create());

        // InvokeAsync yields a stream of response items and cannot be awaited directly.
        // Keep the content of the last message that has any.
        var finalContent = string.Empty;
        await foreach (var response in remediationAgent.Agent.InvokeAsync(hypothesis, cancellationToken: ct))
        {
            finalContent = response.Message.Content ?? finalContent;
        }

        return VerificationSucceeded.IsMatch(finalContent);
    }
}
**Architecture Rationale:** Explicit approval gates prevent autonomous write operations. Verification steps confirm remediation success. The handoff pattern ensures clean context transfer without shared mutable state.
Pitfall Guide
1. Unbounded Context Expansion
Explanation: Debate or pipeline agents accumulate messages indefinitely, exhausting context windows and inflating costs.
Fix: Enforce strict message history limits per agent. Use sliding windows or summary compression after N turns. Set explicit MaximumInvocationCount in group chat managers.
2. Overly Broad Tool Permissions
Explanation: Triage or diagnostic agents inherit write-capable plugins, enabling unauthorized state changes.
Fix: Implement plugin allow-lists at agent instantiation. Use role-based tool scoping. Never share a base kernel with write plugins across read-only specialists.
3. Silent State Loss During Handoffs
Explanation: Sequential agents drop critical metadata (timestamps, alert IDs, confidence scores) when passing context.
Fix: Define explicit handoff contracts using strongly-typed DTOs or structured JSON schemas. Validate payload completeness before downstream execution.
4. Orchestration Over-Engineering
Explanation: Applying Magentic or orchestrator-worker patterns to linear tasks introduces unnecessary latency and cost.
Fix: Match topology to workflow shape. Use sequential pipelines for deterministic steps, concurrent fan-out for parallel data gathering, and group chat only when adversarial validation is required.
5. Missing Evaluation Baselines
Explanation: Teams deploy agents without regression testing, causing silent degradation when models or prompts update.
Fix: Build an eval harness that replays historical incidents nightly. Score outputs against human resolutions using deterministic metrics (tool call accuracy, JSON schema compliance, MTTR simulation).
6. Ignoring Failure Propagation
Explanation: A single agent timeout or tool failure crashes the entire orchestration pipeline.
Fix: Implement circuit breakers and fallback strategies. Wrap agent invocations in retry policies with exponential backoff. Define graceful degradation paths (e.g., switch to advise-only mode on repeated failures).
Production Bundle
Action Checklist
Decision Matrix
| Scenario | Recommended Approach | Why | Cost Impact |
|---|---|---|---|
| Linear alert parsing & routing | Sequential Pipeline | Deterministic steps, minimal coordination overhead | Low ($0.05-$0.12/incident) |
| Multi-source data gathering | Concurrent Fan-Out | Parallel execution reduces MTTR by 60% | Medium ($0.18-$0.29/incident) |
| Conflicting diagnostic findings | Group Chat Debate | Adversarial validation reduces false positives | High ($0.35-$0.48/incident) |
| Complex, evolving incidents | Magentic Orchestrator | Dynamic replanning handles unknown failure modes | Very High ($0.55-$0.75/incident) |
| Enterprise-scale ops | Hierarchical Team-of-Teams | Isolates domains, scales horizontally | Medium-High ($0.25-$0.40/incident) |
Configuration Template
{
"SemanticKernel": {
"ChatModel": "gpt-4o",
"EmbeddingModel": "text-embedding-3-small",
"MaxTokensPerAgent": 4096,
"TimeoutSeconds": 45,
"RetryPolicy": {
"MaxRetries": 3,
"BackoffMultiplier": 2.0,
"InitialDelayMs": 1000
}
},
"Orchestration": {
"ConcurrentFanOut": {
"Enabled": true,
"MaxParallelAgents": 5,
"AggregationTimeoutSeconds": 30
},
"GroupChat": {
"MaxRounds": 8,
"ConfidenceThreshold": 0.85,
"HumanGateRequired": true
},
"Safety": {
"ToolAllowListEnforcement": true,
"WriteAccessRequiresApproval": true,
"KillSwitchFeatureFlag": "incident-response:advise-only"
}
},
"Observability": {
"OpenTelemetry": {
"Endpoint": "https://otel-collector.internal:4317",
"TraceAgentHandoffs": true,
"LogToolCalls": true,
"MetricPrefix": "sre.agents"
}
}
}
Quick Start Guide
- Initialize the Kernel: Install `Microsoft.SemanticKernel` and `Microsoft.SemanticKernel.Agents.Abstractions`, plus `Microsoft.SemanticKernel.Agents.Core`, `Microsoft.SemanticKernel.Agents.Orchestration`, and `Microsoft.SemanticKernel.Agents.Runtime.InProcess` for the agent, orchestration, and runtime types used above. Configure your model provider credentials in `appsettings.json`.
- Define Specialists: Create isolated `SpecialistAgent` instances with scoped plugins and structured instructions. Clone the base kernel for each to prevent state leakage.
- Wire Orchestration: Use `ConcurrentOrchestration` for parallel investigation and `GroupChatOrchestration` for consensus routing. Set explicit timeouts and invocation limits.
- Enforce Safety Gates: Implement human approval checks before any write-capable agent executes. Add circuit breakers and fallback modes.
- Deploy & Observe: Instrument with OpenTelemetry. Run the eval harness against historical incidents. Monitor token consumption, tool accuracy, and MTTR. Iterate on prompt boundaries and topology based on telemetry.