mmonjs --strict true --outDir dist
**Step 1: Define the Protobuf Contract**
Create `proto/telemetry.proto`. Use `proto3` syntax and explicit field numbering. proto3 has no `required` keyword, so enforce mandatory fields with application-level validation instead.
```proto
// Wire contract for the telemetry ingestion API (package version v1).
syntax = "proto3";
package telemetry.v1;
// A single named measurement with free-form string labels.
message Metric {
string name = 1;
double value = 2;
map<string, string> labels = 3;
}
// A batch of metrics reported by one service.
message IngestRequest {
string service_name = 1;
// The schema does not fix a unit; the accompanying client example sends
// epoch milliseconds (Date.now()).
int64 timestamp = 2;
repeated Metric metrics = 3;
}
// Acknowledgement returned for a unary request or a completed stream.
message IngestResponse {
bool acknowledged = 1;
string trace_id = 2;
}
service TelemetryService {
// Unary: one request, one response.
rpc Ingest(IngestRequest) returns (IngestResponse);
// Client-streaming: the client sends many requests and receives exactly one
// response after it finishes sending.
rpc StreamMetrics(stream IngestRequest) returns (IngestResponse);
}
```
**Step 2: Generate TypeScript Stubs**
Create `scripts/generate.sh`:
#!/bin/bash
# Generates the TypeScript gRPC stubs for proto/telemetry.proto into src/generated.
# Run from the repository root.

# Abort on the first failing command, on unset variables, and on pipeline errors
# so a broken generation step cannot go unnoticed.
set -euo pipefail

# protoc does not create the output directory and fails if it is missing.
mkdir -p ./src/generated

# --proto_path=proto makes output paths relative to proto/, so the stub is written
# to src/generated/telemetry.ts (the path the server and client import) instead of
# src/generated/proto/telemetry.ts.
npx protoc \
  --proto_path=proto \
  --plugin=./node_modules/.bin/protoc-gen-ts_proto \
  --ts_proto_out=./src/generated \
  --ts_proto_opt=env=node,useOptionals=messages,outputServices=grpc-js,esModuleInterop=true \
  proto/telemetry.proto
Run: chmod +x scripts/generate.sh && ./scripts/generate.sh
**Step 3: Implement the Server**
Create `src/server.ts`. Focus on backpressure handling, deadline propagation, and graceful shutdown.
import * as grpc from '@grpc/grpc-js';
// ts-proto (outputServices=grpc-js) names the service definition `<Service>Service`
// and the handler interface `<Service>Server`.
import { TelemetryServiceService, TelemetryServiceServer } from './generated/telemetry';
import { v4 as uuidv4 } from 'uuid';

const server = new grpc.Server();

/**
 * Handlers for `telemetry.v1.TelemetryService`.
 *
 * Typing the object as `TelemetryServiceServer` lets the compiler check every
 * handler against the generated request/response types. Generated message
 * fields are camelCase (`serviceName`, `traceId`) because ts-proto converts
 * snake_case names by default.
 */
const telemetryHandlers: TelemetryServiceServer = {
  /** Unary RPC: validates the request and acknowledges it with a fresh trace id. */
  ingest: (call, callback) => {
    if (call.cancelled) {
      // Report a proper gRPC status instead of a bare Error (which maps to UNKNOWN).
      callback({ code: grpc.status.CANCELLED, message: 'Client cancelled' });
      return;
    }

    // proto3 has no required fields, so mandatory values are checked here.
    const { serviceName } = call.request;
    if (serviceName.length === 0) {
      callback({ code: grpc.status.INVALID_ARGUMENT, message: 'service_name is required' });
      return;
    }

    // Simulate business logic
    callback(null, { acknowledged: true, traceId: uuidv4() });
  },

  /**
   * Client-streaming RPC: the client sends many requests and receives exactly
   * one response, delivered through `callback` once the client half-closes.
   *
   * `call` is a readable stream only; it has no `write()`. Backpressure is
   * applied by how fast this handler consumes `'data'` events: if processing
   * becomes asynchronous, `call.pause()` before the work and `call.resume()`
   * after it so unprocessed messages are not buffered without bound.
   */
  streamMetrics: (call, callback) => {
    const traceId = uuidv4();
    let processed = 0;
    let responded = false;

    // The unary response may be sent only once, whichever event arrives first.
    const respond: typeof callback = (err, value) => {
      if (responded) return;
      responded = true;
      callback(err, value);
    };

    call.on('data', () => {
      processed++;
    });

    call.on('end', () => {
      console.log(`Stream ${traceId} completed after ${processed} request(s)`);
      respond(null, { acknowledged: true, traceId });
    });

    call.on('error', (err: Error) => {
      console.error('Stream error:', err);
      respond({ code: grpc.status.INTERNAL, message: 'Stream error' });
    });
  },
};

server.addService(TelemetryServiceService, telemetryHandlers);
// Port the gRPC server listens on (all interfaces, plaintext).
const PORT = 50051;
const bindAddress = `0.0.0.0:${PORT}`;
const insecureCredentials = grpc.ServerCredentials.createInsecure();

// NOTE(review): older @grpc/grpc-js releases also required server.start() after a
// successful bind — confirm against the installed version.
server.bindAsync(bindAddress, insecureCredentials, (bindError, boundPort) => {
  if (!bindError) {
    console.log(`gRPC Server listening on ${boundPort}`);
    return;
  }
  // Without a bound port the process is useless, so fail fast.
  console.error('Failed to bind server:', bindError);
  process.exit(1);
});
// Graceful shutdown
/** How long in-flight calls may take to drain before they are cancelled. */
const SHUTDOWN_GRACE_MS = 10_000;

/**
 * Stops accepting new calls and waits for in-flight calls to finish.
 * If they have not drained within SHUTDOWN_GRACE_MS (e.g. a long-lived stream),
 * the remaining calls are cancelled so the process cannot hang forever.
 * Exits with 0 on a clean drain and 1 on a shutdown error or forced shutdown.
 */
function shutdown(signal: NodeJS.Signals): void {
  console.log(`Received ${signal}, shutting down`);

  const forceTimer = setTimeout(() => {
    console.error('Graceful shutdown timed out; forcing shutdown');
    server.forceShutdown();
    process.exit(1);
  }, SHUTDOWN_GRACE_MS);
  // The timer alone must not keep the event loop alive.
  forceTimer.unref();

  server.tryShutdown((err) => {
    clearTimeout(forceTimer);
    if (err) {
      console.error('Shutdown error:', err);
      process.exit(1);
    }
    process.exit(0);
  });
}

process.on('SIGTERM', shutdown);
process.on('SIGINT', shutdown);
**Step 4: Implement the Client**
Create `src/client.ts`. Demonstrate connection pooling, deadline handling, and retry logic.
import * as grpc from '@grpc/grpc-js';
import { TelemetryServiceClient } from './generated/telemetry';
import { IngestRequest } from './generated/telemetry';
/**
 * Channel-level retry policy for every TelemetryService method: calls that fail
 * with UNAVAILABLE are retried with exponential backoff, up to 3 attempts total.
 */
const serviceConfig = {
  methodConfig: [
    {
      name: [{ service: 'telemetry.v1.TelemetryService' }],
      retryPolicy: {
        maxAttempts: 3,
        initialBackoff: '0.2s',
        maxBackoff: '2s',
        backoffMultiplier: 2,
        retryableStatusCodes: ['UNAVAILABLE'],
      },
    },
  ],
};

/**
 * Shared client. Create it once and reuse it: all calls made through one client
 * are multiplexed over the same channel instead of opening a connection per call.
 */
const client = new TelemetryServiceClient(
  'localhost:50051',
  grpc.credentials.createInsecure(),
  {
    // Send keepalive pings so dead connections are detected on idle channels.
    'grpc.keepalive_time_ms': 30000,
    'grpc.keepalive_timeout_ms': 10000,
    'grpc.max_send_message_length': 50 * 1024 * 1024,
    // The service config is passed to the channel as a JSON string.
    'grpc.service_config': JSON.stringify(serviceConfig),
  }
);
/**
 * Sends one `Ingest` request with a 5-second deadline.
 *
 * @returns a promise that resolves once the response has been logged and
 *          rejects with the gRPC `ServiceError` if the call fails.
 */
function unaryIngest(): Promise<void> {
  // Generated message fields are camelCase (ts-proto converts snake_case by default).
  const request: IngestRequest = {
    serviceName: 'web-frontend',
    timestamp: Date.now(),
    metrics: [
      { name: 'cpu_usage', value: 0.75, labels: { env: 'prod' } },
      { name: 'memory_mb', value: 1024, labels: { env: 'prod' } },
    ],
  };
  const deadline = new Date(Date.now() + 5000);

  return new Promise<void>((resolve, reject) => {
    // The generated typed overloads expect Metadata before call options, so an
    // (empty) Metadata is passed explicitly to keep the call valid under tsc.
    client.ingest(request, new grpc.Metadata(), { deadline }, (err, response) => {
      if (err) {
        console.error('Unary failed:', err.message);
        reject(err);
        return;
      }
      console.log('Unary response:', response);
      resolve();
    });
  });
}
/**
 * Streams five `IngestRequest` messages (one every 200 ms) over the
 * client-streaming `StreamMetrics` RPC with a 10-second deadline.
 *
 * A client-streaming call returns a writable stream and delivers its single
 * response (or error) through the callback supplied when the call is opened;
 * the stream itself never emits `'data'`.
 *
 * @returns a promise that resolves once the server's response has been logged
 *          and rejects with the gRPC `ServiceError` if the call fails.
 */
function streamIngest(): Promise<void> {
  const MESSAGE_COUNT = 5;
  const SEND_INTERVAL_MS = 200;

  return new Promise<void>((resolve, reject) => {
    let finished = false;

    const call = client.streamMetrics(
      new grpc.Metadata(),
      { deadline: new Date(Date.now() + 10000) },
      (err, response) => {
        finished = true;
        if (err) {
          reject(err);
          return;
        }
        console.log('Stream response:', response);
        resolve();
      }
    );

    const sendNext = (index: number): void => {
      // Stop writing once the call has completed (e.g. deadline exceeded).
      if (finished) return;
      if (index >= MESSAGE_COUNT) {
        // Half-close: tells the server no more requests are coming.
        call.end();
        return;
      }
      call.write({
        serviceName: 'stream-worker',
        timestamp: Date.now(),
        metrics: [{ name: 'queue_depth', value: index * 10, labels: {} }],
      });
      setTimeout(() => sendNext(index + 1), SEND_INTERVAL_MS);
    };

    sendNext(0);
  });
}
// Entry point: run the unary call, then the streaming call, and always release
// the channel afterwards.
void (async () => {
  try {
    await unaryIngest();
    await streamIngest();
  } catch (e: unknown) {
    console.error('Client execution failed:', e);
    // Signal the failure to the caller (shell, CI) instead of exiting with 0.
    process.exitCode = 1;
  } finally {
    client.close();
  }
})();
Run
# Terminal 1: Server
npx tsx src/server.ts
# Terminal 2: Client
npx tsx src/client.ts
Pitfall Guide
| Symptom | Root Cause | Resolution |
|---|---|---|
| 14 UNAVAILABLE: DNS resolution failed | gRPC defaults to the `dns:` scheme. Localhost resolution fails in containers or strict DNS environments. | Bypass DNS by using an IP-literal target such as `ipv4:127.0.0.1:50051` as the client address. |
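| 4 DEADLINE_EXCEEDED vs 14 UNAVAILABLE | DEADLINE_EXCEEDED means the deadline expired before the call completed; the request may or may not have reached the server. UNAVAILABLE means the connection could not be established or was lost. | Set explicit deadlines on every call. Propagate the remaining deadline to downstream calls (gRPC carries it in the `grpc-timeout` header; in grpc-js, pass the incoming `call.getDeadline()` as the outgoing `deadline`). Inspect the `grpc-status-details-bin` trailer for structured error details when the server attaches them. |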
| Stream stalls or memory leak | Ignoring write() return value in server streams causes unbounded buffering. | Check call.write() return boolean. If false, call call.pause() and listen for call.once('drain', () => call.resume()). |
| Schema drift breaks clients | Reusing field numbers or changing types without backward compatibility. | Never reuse field numbers; mark removed numbers and names as `reserved`. Use `optional` for new fields. Deploy schema registry checks in CI. Both clients and servers skip unknown fields, so purely additive changes are safe. |
| HTTP/2 GOAWAY frames drop requests | Server connection limits or idle timeout triggers connection teardown while streams are active. | Configure `grpc.http2.max_pings_without_data` and client retry policies. Implement exponential backoff with jitter on UNAVAILABLE status. |
| Binary payload debugging | grpcurl returns base64 or fails to decode. | Use grpcurl -plaintext -import-path proto -proto telemetry.proto -d @ localhost:50051 telemetry.v1.TelemetryService/Ingest < payload.json. For live inspection, run grpcurl -v or attach protoc --decode to packet captures. |
Production Bundle
Containerization
# ---- Build stage: install all dependencies and compile TypeScript to dist/ ----
FROM node:22-slim AS builder
WORKDIR /app
# Copy the manifests first so the dependency layer is cached across source changes.
COPY package*.json ./
RUN npm ci
COPY . .
RUN npx tsc
# Remove devDependencies (TypeScript, code generators, ...) so the runtime stage
# copies only what the server needs at run time.
RUN npm prune --omit=dev

# ---- Runtime stage: compiled output and production dependencies only ----
FROM node:22-slim
ENV NODE_ENV=production
WORKDIR /app
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/node_modules ./node_modules
COPY proto ./proto
# Run as the unprivileged user shipped with the official image.
USER node
EXPOSE 50051
CMD ["node", "dist/server.js"]
Kubernetes Health & Scaling
gRPC lacks native HTTP health endpoints. Implement the standard `grpc.health.v1.Health` service on the server, then probe it with the official `grpc_health_probe` binary (the probe calls that service, so both are required):
# Container-level probes. grpc_health_probe must be present in the image and the
# server must implement the grpc.health.v1.Health service, otherwise every probe fails.
# Recent Kubernetes versions also offer a built-in `grpc:` probe type as an alternative.
livenessProbe:
  exec:
    command: ["grpc_health_probe", "-addr=:50051"]
  initialDelaySeconds: 5
readinessProbe:
  exec:
    command: ["grpc_health_probe", "-addr=:50051"]
  initialDelaySeconds: 3
Load Balancer Configuration
HTTP/2 multiplexing requires LBs that preserve TCP connections and support h2 protocol negotiation.
- Nginx/Envoy: Enable `http2` and disable connection pooling limits that fragment streams.
- AWS ALB: Use `protocol: GRPC` with `target-type: ip`. Avoid `protocol: HTTP`, which forces HTTP/1.1 fallback.
- Sticky Sessions: Avoid IP-hash or cookie-based stickiness for gRPC. HTTP/2 streams are multiplexed; stickiness causes uneven load distribution. Use round-robin or least-connections.
Observability & Tracing
Integrate OpenTelemetry auto-instrumentation to propagate traceparent headers across gRPC boundaries:
import { NodeSDK } from '@opentelemetry/sdk-node';
import { GrpcInstrumentation } from '@opentelemetry/instrumentation-grpc';
import { ConsoleSpanExporter } from '@opentelemetry/sdk-trace-base';
// Spans are printed to stdout; swap the exporter for a real backend in production.
const spanExporter = new ConsoleSpanExporter();
// Instruments gRPC clients and servers.
const grpcInstrumentation = new GrpcInstrumentation();

const sdk = new NodeSDK({
  traceExporter: spanExporter,
  instrumentations: [grpcInstrumentation],
});

// NOTE(review): instrumentation generally has to be started before the instrumented
// module (@grpc/grpc-js) is first loaded — confirm this file is imported first.
sdk.start();
This automatically attaches `rpc.service`, `rpc.method`, and `rpc.grpc.status_code` (plus request/response sizes where the instrumentation records them) to spans, enabling latency heatmaps and error rate tracking without manual instrumentation.
Schema Evolution Strategy
- Versioned Packages: Use `telemetry.v1`, `telemetry.v2` in proto packages. Deploy new versions alongside old ones during transition.
- Contract Testing: Generate client/server stubs in CI. Run `ts-proto` with strict mode to catch type mismatches before deployment.
- Field Deprecation: Mark deprecated fields with `[deprecated = true]`. Do not remove until all clients report zero usage via telemetry.
- Backward Compatibility: Additive changes are safe. Removing fields or changing types requires coordinated rollout or dual-write migration windows.