A Dynamo cluster serving 10,000 requests per second across 64 H100 GPUs generates hundreds of metrics per second: per-GPU utilization, KV cache occupancy per block, queue depth per worker, time-to-first-token (TTFT) per request, time-between-tokens (TBT) per decode step, preemption count, error rate, and memory allocation events. Without proper dashboards, operators are blind. Without proper alerts, a p99 latency spike goes unnoticed for hours. Without a playbook, the on-call engineer spends 30 minutes figuring out what the alert means before they can start fixing it.
This post covers the complete monitoring stack for LLM serving: the essential metrics, Grafana dashboard design, Prometheus alert rules, and the on-call playbook for every common failure mode.
Essential Metrics
The Metric Hierarchy
LLM serving metrics form a hierarchy: user-facing metrics (what the user experiences), system metrics (what the infrastructure shows), and diagnostic metrics (what you need to debug root causes).
from dataclasses import dataclass, field
from enum import Enum
class MetricLevel(Enum):
    """Tier of the metric hierarchy a metric belongs to."""

    USER = "user"              # directly affects user experience
    SYSTEM = "system"          # infrastructure health
    DIAGNOSTIC = "diagnostic"  # root cause debugging
class MetricType(Enum):
    """Prometheus metric type of a metric definition."""

    HISTOGRAM = "histogram"
    GAUGE = "gauge"
    COUNTER = "counter"
    SUMMARY = "summary"
@dataclass
class MetricDefinition:
    """Declarative description of one metric in the serving catalog.

    Used to document the metric set and its alert thresholds in one
    place; the actual collection happens in LLMMetricsCollector.
    """

    name: str                # Prometheus metric name, e.g. "llm_ttft_seconds"
    level: MetricLevel       # user / system / diagnostic tier
    metric_type: MetricType  # histogram / gauge / counter / summary
    unit: str                # unit of the recorded value (seconds, bytes, ...)
    description: str         # human-readable explanation for dashboards
    labels: list             # label names attached to every sample
    # Threshold map, e.g. {"p99_warn": 2.0}; values are in `unit`.
    # Empty dict means no alerting is defined for this metric.
    alert_thresholds: dict = field(default_factory=dict)
# Complete metric catalog for LLM serving.
# Ordered by level: user-facing first, then system, then diagnostic.
# Threshold keys follow the pattern "<quantile-or-rate>_<severity>";
# values are expressed in the metric's own unit.
METRIC_CATALOG = [
    # --- User-facing metrics: what the user directly experiences ---
    MetricDefinition(
        name="llm_ttft_seconds",
        level=MetricLevel.USER,
        metric_type=MetricType.HISTOGRAM,
        unit="seconds",
        description="Time to first token: duration from request "
        "receipt to first generated token",
        labels=["model", "endpoint", "priority"],
        alert_thresholds={
            "p50_warn": 0.5,
            "p50_critical": 1.0,
            "p99_warn": 2.0,
            "p99_critical": 5.0,
        },
    ),
    MetricDefinition(
        name="llm_tbt_seconds",
        level=MetricLevel.USER,
        metric_type=MetricType.HISTOGRAM,
        unit="seconds",
        description="Time between tokens: inter-token latency "
        "during generation",
        labels=["model", "endpoint"],
        alert_thresholds={
            "p50_warn": 0.03,
            "p50_critical": 0.05,
            "p99_warn": 0.08,
            "p99_critical": 0.15,
        },
    ),
    MetricDefinition(
        name="llm_request_duration_seconds",
        level=MetricLevel.USER,
        metric_type=MetricType.HISTOGRAM,
        unit="seconds",
        description="Total request duration end to end",
        labels=["model", "endpoint", "status"],
    ),
    MetricDefinition(
        name="llm_tokens_per_second",
        level=MetricLevel.USER,
        metric_type=MetricType.GAUGE,
        unit="tokens/sec",
        description="Generation throughput in tokens per second",
        labels=["model", "worker_id"],
    ),
    MetricDefinition(
        name="llm_error_total",
        level=MetricLevel.USER,
        metric_type=MetricType.COUNTER,
        unit="count",
        description="Total number of request errors",
        labels=["model", "error_type", "endpoint"],
        # Rates are fractions of total requests over a 1-minute window.
        alert_thresholds={
            "rate_1m_warn": 0.01,
            "rate_1m_critical": 0.05,
        },
    ),
    # --- System metrics: infrastructure health ---
    MetricDefinition(
        name="llm_gpu_utilization",
        level=MetricLevel.SYSTEM,
        metric_type=MetricType.GAUGE,
        unit="fraction",
        description="GPU SM utilization (0.0 to 1.0)",
        labels=["gpu_id", "worker_id", "node"],
        alert_thresholds={
            "low_warn": 0.3,  # Underutilized
            "high_warn": 0.95,  # Near saturation
        },
    ),
    MetricDefinition(
        name="llm_gpu_memory_used_bytes",
        level=MetricLevel.SYSTEM,
        metric_type=MetricType.GAUGE,
        unit="bytes",
        description="GPU memory used",
        labels=["gpu_id", "worker_id", "node"],
        # *_pct thresholds are fractions of total GPU memory.
        alert_thresholds={
            "high_warn_pct": 0.90,
            "high_critical_pct": 0.95,
        },
    ),
    MetricDefinition(
        name="llm_kv_cache_utilization",
        level=MetricLevel.SYSTEM,
        metric_type=MetricType.GAUGE,
        unit="fraction",
        description="KV cache block utilization",
        labels=["worker_id", "gpu_id"],
        alert_thresholds={
            "high_warn": 0.85,
            "high_critical": 0.95,
        },
    ),
    MetricDefinition(
        name="llm_queue_depth",
        level=MetricLevel.SYSTEM,
        metric_type=MetricType.GAUGE,
        unit="requests",
        description="Number of requests waiting in queue",
        labels=["worker_id", "priority"],
        alert_thresholds={
            "high_warn": 50,
            "high_critical": 200,
        },
    ),
    MetricDefinition(
        name="llm_batch_size",
        level=MetricLevel.SYSTEM,
        metric_type=MetricType.GAUGE,
        unit="requests",
        description="Current number of requests in the batch",
        labels=["worker_id"],
    ),
    # --- Diagnostic metrics: root cause debugging ---
    MetricDefinition(
        name="llm_preemption_total",
        level=MetricLevel.DIAGNOSTIC,
        metric_type=MetricType.COUNTER,
        unit="count",
        description="Number of request preemptions (evicted "
        "from batch due to memory pressure)",
        labels=["worker_id", "reason"],
        # Rates are preemptions per second over a 1-minute window.
        alert_thresholds={
            "rate_1m_warn": 0.1,
            "rate_1m_critical": 1.0,
        },
    ),
]
Healthy Metric Ranges for LLM Serving (Llama 70B, 8xH100)
| Metric | Healthy Range | Warning | Critical |
|---|---|---|---|
| TTFT p50 | 100-300ms | 500ms-1s | above 1s |
| TTFT p99 | 500ms-1.5s | 2-3s | above 5s |
| TBT p50 | 15-25ms | 30-40ms | above 50ms |
| GPU utilization | 60-90% | below 30% or above 95% | below 10% or above 99% |
| KV cache utilization | 50-85% | 85-95% | above 95% |
| Queue depth | 0-20 | 50-100 | above 200 |
| Error rate (1min) | below 0.1% | 0.1-1% | above 1% |
| Preemption rate (1min) | below 0.01/s | 0.1-0.5/s | above 1/s |
Metrics Collection
Prometheus Metric Exporter
import time
import threading
from collections import defaultdict
class LLMMetricsCollector:
    """
    Collect and export metrics for Prometheus scraping.

    Implements a minimal subset of the Prometheus client-library
    interface: histograms, gauges, and counters.  All mutations and
    the export pass hold ``self.lock`` so concurrent request threads
    and the scrape endpoint see a consistent snapshot.

    Label sets are stored internally as pre-rendered Prometheus label
    strings (e.g. ``gpu_id="0"`` or ``model="llama"``) so that export
    is a straight string interpolation and always produces valid
    exposition-format output.
    """

    def __init__(self):
        self.histograms = {}
        self.gauges = {}
        self.counters = {}
        self.lock = threading.Lock()
        # Histogram buckets bracket the healthy/alert thresholds
        # (TTFT warn at 0.5s/2s, critical at 1s/5s; TBT warn at
        # 30-80ms, critical at 150ms).
        self.ttft_buckets = [
            0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0, 1.5,
            2.0, 3.0, 5.0, 10.0, float("inf"),
        ]
        self.tbt_buckets = [
            0.01, 0.015, 0.02, 0.025, 0.03, 0.04,
            0.05, 0.07, 0.1, 0.15, 0.2, float("inf"),
        ]
        self._initialize_metrics()

    def _initialize_metrics(self):
        """Initialize all metric collectors with empty state."""
        self.histograms["llm_ttft_seconds"] = {
            "buckets": self.ttft_buckets,
            "observations": defaultdict(list),
        }
        self.histograms["llm_tbt_seconds"] = {
            "buckets": self.tbt_buckets,
            "observations": defaultdict(list),
        }
        self.histograms["llm_request_duration_seconds"] = {
            "buckets": [0.1, 0.5, 1, 2, 5, 10, 30, 60, float("inf")],
            "observations": defaultdict(list),
        }
        self.gauges["llm_gpu_utilization"] = defaultdict(float)
        self.gauges["llm_gpu_memory_used_bytes"] = defaultdict(float)
        self.gauges["llm_kv_cache_utilization"] = defaultdict(float)
        self.gauges["llm_queue_depth"] = defaultdict(float)
        self.gauges["llm_batch_size"] = defaultdict(float)
        self.gauges["llm_tokens_per_second"] = defaultdict(float)
        self.counters["llm_error_total"] = defaultdict(int)
        self.counters["llm_preemption_total"] = defaultdict(int)
        self.counters["llm_requests_total"] = defaultdict(int)

    def observe_ttft(self, value, labels=None):
        """Record a time-to-first-token observation (seconds)."""
        key = self._labels_key(labels)
        with self.lock:
            self.histograms["llm_ttft_seconds"]["observations"][key].append(value)

    def observe_tbt(self, value, labels=None):
        """Record a time-between-tokens observation (seconds)."""
        key = self._labels_key(labels)
        with self.lock:
            self.histograms["llm_tbt_seconds"]["observations"][key].append(value)

    def set_gpu_utilization(self, gpu_id, value):
        """Set current GPU utilization (0.0-1.0) for one GPU."""
        with self.lock:
            self.gauges["llm_gpu_utilization"][f'gpu_id="{gpu_id}"'] = value

    def set_kv_cache_utilization(self, worker_id, value):
        """Set current KV cache utilization (0.0-1.0) for one worker."""
        with self.lock:
            self.gauges["llm_kv_cache_utilization"][f'worker_id="{worker_id}"'] = value

    def set_queue_depth(self, worker_id, value):
        """Set current queue depth for one worker."""
        with self.lock:
            self.gauges["llm_queue_depth"][f'worker_id="{worker_id}"'] = value

    def increment_error(self, error_type, labels=None):
        """Increment the error counter for one error type."""
        base = f'error_type="{error_type}"'
        rest = self._labels_key(labels)
        key = f"{base},{rest}" if rest else base
        with self.lock:
            self.counters["llm_error_total"][key] += 1

    def increment_preemption(self, worker_id, reason):
        """Increment the preemption counter for one worker/reason."""
        key = f'worker_id="{worker_id}",reason="{reason}"'
        with self.lock:
            self.counters["llm_preemption_total"][key] += 1

    def export_prometheus(self):
        """Export all metrics in the Prometheus text exposition format.

        Returns a newline-joined string of HELP/TYPE headers and
        samples.  Series with no labels are emitted without braces
        (``name 3``), never as ``name{}`` or with a trailing comma,
        both of which Prometheus rejects.
        """
        lines = []
        with self.lock:  # snapshot under the same lock writers use
            # Histograms: cumulative per-bucket counts plus _count/_sum.
            for name, hist_data in self.histograms.items():
                lines.append(f"# HELP {name} LLM serving metric")
                lines.append(f"# TYPE {name} histogram")
                for label_key, observations in hist_data["observations"].items():
                    if not observations:
                        continue
                    for bucket in hist_data["buckets"]:
                        count = sum(1 for o in observations if o <= bucket)
                        bucket_str = "+Inf" if bucket == float("inf") else str(bucket)
                        le = f'le="{bucket_str}"'
                        label_str = f"{le},{label_key}" if label_key else le
                        lines.append(f"{name}_bucket{{{label_str}}} {count}")
                    suffix = f"{{{label_key}}}" if label_key else ""
                    lines.append(f"{name}_count{suffix} {len(observations)}")
                    lines.append(f"{name}_sum{suffix} {sum(observations):.6f}")
            # Gauges: last-set value per label set.
            for name, gauge_data in self.gauges.items():
                lines.append(f"# HELP {name} LLM serving metric")
                lines.append(f"# TYPE {name} gauge")
                for label_key, value in gauge_data.items():
                    suffix = f"{{{label_key}}}" if label_key else ""
                    lines.append(f"{name}{suffix} {value}")
            # Counters: monotonically increasing totals.
            for name, counter_data in self.counters.items():
                lines.append(f"# HELP {name} LLM serving metric")
                lines.append(f"# TYPE {name} counter")
                for label_key, value in counter_data.items():
                    suffix = f"{{{label_key}}}" if label_key else ""
                    lines.append(f"{name}{suffix} {value}")
        return "\n".join(lines)

    def _labels_key(self, labels):
        """Render a labels dict as ``k1="v1",k2="v2"`` (sorted by key)."""
        if not labels:
            return ""
        return ",".join(f'{k}="{v}"' for k, v in sorted(labels.items()))
Grafana Dashboard Configuration
Dashboard JSON Model
import json
class GrafanaDashboardBuilder:
    """
    Build Grafana dashboard JSON for LLM serving monitoring.

    Layout:
      Row 1: User Experience (TTFT, TBT, throughput, errors)
      Row 2: GPU Health (utilization, memory used, memory free)
      Row 3: KV Cache & Scheduling (cache util, queue, batch size)
      Row 4: Detailed Diagnostics (preemptions, request duration)

    ``build()`` is idempotent: internal panel state is reset on every
    call, so repeated builds return identical JSON instead of
    accumulating duplicate panels.
    """

    def __init__(self, datasource="prometheus", gpu_memory_gb=80):
        # gpu_memory_gb: total memory per GPU, used to convert the raw
        # used-bytes gauge into a fraction/free-GB panel (80 for H100).
        self.datasource = datasource
        self.gpu_memory_gb = gpu_memory_gb
        self._reset()

    def _reset(self):
        """Clear accumulated panel state so build() can be re-run."""
        self.panels = []
        self.next_id = 1  # Grafana panel ids must be unique per dashboard
        self.next_y = 0   # grid y coordinate of the current row

    def build(self):
        """Build the complete dashboard and return it as a JSON string."""
        self._reset()  # idempotence: do not duplicate panels on re-build
        self._add_user_experience_row()
        self._add_gpu_health_row()
        self._add_kv_cache_row()
        self._add_diagnostics_row()
        dashboard = {
            "title": "Dynamo LLM Serving",
            "uid": "dynamo-llm-serving",
            "tags": ["llm", "dynamo", "inference"],
            "timezone": "utc",
            "refresh": "10s",
            "time": {"from": "now-1h", "to": "now"},
            "panels": self.panels,
        }
        return json.dumps(dashboard, indent=2)

    def _add_panel(self, title, panel_type, queries,
                   grid_x=0, grid_w=12, grid_h=8,
                   unit="", thresholds=None):
        """Append one panel at (grid_x, self.next_y) with the given queries.

        ``queries`` is a list of dicts with "expr" (PromQL) and optional
        "legend" keys; ``thresholds`` is a list of Grafana threshold steps.
        """
        panel = {
            "id": self.next_id,
            "title": title,
            "type": panel_type,
            "datasource": self.datasource,
            "gridPos": {
                "x": grid_x,
                "y": self.next_y,
                "w": grid_w,
                "h": grid_h,
            },
            "targets": [
                {"expr": q["expr"], "legendFormat": q.get("legend", "")}
                for q in queries
            ],
            "fieldConfig": {
                "defaults": {
                    "unit": unit,
                },
            },
        }
        if thresholds:
            panel["fieldConfig"]["defaults"]["thresholds"] = {
                "steps": thresholds,
            }
        self.panels.append(panel)
        self.next_id += 1

    def _add_user_experience_row(self):
        """Row 1: User-facing metrics (what the user experiences)."""
        # TTFT quantiles over a 5-minute window
        self._add_panel(
            "Time to First Token (TTFT)",
            "timeseries",
            [
                {
                    "expr": 'histogram_quantile(0.50, rate(llm_ttft_seconds_bucket[5m]))',
                    "legend": "p50",
                },
                {
                    "expr": 'histogram_quantile(0.95, rate(llm_ttft_seconds_bucket[5m]))',
                    "legend": "p95",
                },
                {
                    "expr": 'histogram_quantile(0.99, rate(llm_ttft_seconds_bucket[5m]))',
                    "legend": "p99",
                },
            ],
            grid_x=0, grid_w=8, unit="s",
        )
        # TBT quantiles (streaming smoothness)
        self._add_panel(
            "Time Between Tokens (TBT)",
            "timeseries",
            [
                {
                    "expr": 'histogram_quantile(0.50, rate(llm_tbt_seconds_bucket[5m]))',
                    "legend": "p50",
                },
                {
                    "expr": 'histogram_quantile(0.99, rate(llm_tbt_seconds_bucket[5m]))',
                    "legend": "p99",
                },
            ],
            grid_x=8, grid_w=8, unit="s",
        )
        # Cluster-wide generation throughput
        self._add_panel(
            "Generation Throughput",
            "stat",
            [{"expr": "sum(llm_tokens_per_second)", "legend": "tokens/s"}],
            grid_x=16, grid_w=4, unit="tokens/s",
        )
        # Error rate as a fraction of all requests
        self._add_panel(
            "Error Rate (1min)",
            "stat",
            [{
                "expr": (
                    "sum(rate(llm_error_total[1m])) / "
                    "sum(rate(llm_requests_total[1m]))"
                ),
                "legend": "error rate",
            }],
            grid_x=20, grid_w=4, unit="percentunit",
            thresholds=[
                {"value": 0, "color": "green"},
                {"value": 0.001, "color": "yellow"},
                {"value": 0.01, "color": "red"},
            ],
        )
        self.next_y += 8

    def _add_gpu_health_row(self):
        """Row 2: GPU health metrics."""
        self._add_panel(
            "GPU Utilization",
            "timeseries",
            [{
                "expr": "llm_gpu_utilization",
                "legend": "{{gpu_id}}",
            }],
            grid_x=0, grid_w=8, unit="percentunit",
        )
        # Used bytes as a fraction of total GPU memory
        self._add_panel(
            "GPU Memory Usage",
            "timeseries",
            [{
                "expr": f"llm_gpu_memory_used_bytes / ({self.gpu_memory_gb} * 1024 * 1024 * 1024)",
                "legend": "{{gpu_id}}",
            }],
            grid_x=8, grid_w=8, unit="percentunit",
        )
        self._add_panel(
            "GPU Memory Free (GB)",
            "gauge",
            [{
                "expr": f"({self.gpu_memory_gb} * 1024 * 1024 * 1024 - llm_gpu_memory_used_bytes) / (1024 * 1024 * 1024)",
                "legend": "{{gpu_id}}",
            }],
            grid_x=16, grid_w=8, unit="decgbytes",
        )
        self.next_y += 8

    def _add_kv_cache_row(self):
        """Row 3: KV cache and scheduling."""
        # Thresholds mirror the alert rules: warn 85%, critical 95%.
        self._add_panel(
            "KV Cache Utilization",
            "timeseries",
            [{
                "expr": "llm_kv_cache_utilization",
                "legend": "{{worker_id}}",
            }],
            grid_x=0, grid_w=8, unit="percentunit",
            thresholds=[
                {"value": 0, "color": "green"},
                {"value": 0.85, "color": "yellow"},
                {"value": 0.95, "color": "red"},
            ],
        )
        self._add_panel(
            "Queue Depth",
            "timeseries",
            [{
                "expr": "llm_queue_depth",
                "legend": "{{worker_id}}",
            }],
            grid_x=8, grid_w=8, unit="short",
        )
        self._add_panel(
            "Batch Size",
            "timeseries",
            [{
                "expr": "llm_batch_size",
                "legend": "{{worker_id}}",
            }],
            grid_x=16, grid_w=8, unit="short",
        )
        self.next_y += 8

    def _add_diagnostics_row(self):
        """Row 4: Detailed diagnostics."""
        self._add_panel(
            "Preemption Rate",
            "timeseries",
            [{
                "expr": "rate(llm_preemption_total[5m])",
                "legend": "{{worker_id}} - {{reason}}",
            }],
            grid_x=0, grid_w=12, unit="ops",
        )
        self._add_panel(
            "Request Duration Distribution",
            "heatmap",
            [{
                "expr": "rate(llm_request_duration_seconds_bucket[5m])",
                "legend": "",
            }],
            grid_x=12, grid_w=12, unit="s",
        )
        self.next_y += 8
Alert Rules
Prometheus Alert Configuration
class AlertRuleBuilder:
    """
    Build Prometheus alerting rules for LLM serving.

    Alert severity:
    - info: Log only, no page
    - warning: Slack notification, investigate within 1 hour
    - critical: Page on-call, respond within 15 minutes
    """

    def build_alert_rules(self):
        """Build all alert rules and wrap them in a rule group."""
        rules = [
            # --- Latency: time to first token ---
            self._alert(
                name="LLMHighTTFT",
                expr='histogram_quantile(0.99, rate(llm_ttft_seconds_bucket[5m])) > 5',
                duration="5m",
                severity="critical",
                summary="p99 TTFT exceeds 5 seconds",
                description="Time to first token is very high. Users are experiencing slow initial response.",
                runbook="Check KV cache utilization, queue depth, and prefill batch size.",
            ),
            self._alert(
                name="LLMElevatedTTFT",
                expr='histogram_quantile(0.99, rate(llm_ttft_seconds_bucket[5m])) > 2',
                duration="10m",
                severity="warning",
                summary="p99 TTFT exceeds 2 seconds",
                description="TTFT is elevated. Not yet critical but trending in the wrong direction.",
                runbook="Monitor trend. If increasing, check for traffic spike or long-context requests.",
            ),
            # --- Latency: time between tokens ---
            self._alert(
                name="LLMHighTBT",
                expr='histogram_quantile(0.99, rate(llm_tbt_seconds_bucket[5m])) > 0.15',
                duration="5m",
                severity="critical",
                summary="p99 TBT exceeds 150ms",
                description="Inter-token latency is very high. Streaming responses appear to stutter.",
                runbook="Check GPU utilization and batch size. High batch size causes TBT regression.",
            ),
            # --- Error rate ---
            self._alert(
                name="LLMHighErrorRate",
                expr='sum(rate(llm_error_total[1m])) / sum(rate(llm_requests_total[1m])) > 0.05',
                duration="3m",
                severity="critical",
                summary="Error rate exceeds 5%",
                description="More than 5% of requests are failing.",
                runbook="Check error_type label distribution. Common causes: OOM, timeout, model crash.",
            ),
            # --- GPU memory / OOM ---
            self._alert(
                name="LLMGPUMemoryCritical",
                expr='llm_gpu_memory_used_bytes / (80 * 1024 * 1024 * 1024) > 0.95',
                duration="2m",
                severity="critical",
                summary="GPU memory usage above 95%",
                description="GPU memory is nearly full. OOM errors imminent.",
                runbook="Reduce max_batch_size or max_context_length. Check for memory leaks in KV cache.",
            ),
            # --- KV cache pressure ---
            self._alert(
                name="LLMKVCachePressure",
                expr="llm_kv_cache_utilization > 0.95",
                duration="5m",
                severity="warning",
                summary="KV cache utilization above 95%",
                description="KV cache is nearly full. Preemptions will start if traffic continues.",
                runbook="Check for long-context requests. Consider reducing max_context_length or adding GPUs.",
            ),
            # --- Preemption storm ---
            self._alert(
                name="LLMPreemptionStorm",
                expr="rate(llm_preemption_total[5m]) > 1",
                duration="5m",
                severity="critical",
                summary="Preemption rate exceeds 1/second",
                description="Requests are being preempted (evicted) at a high rate. This causes restarts and severely impacts throughput.",
                runbook="Immediate: reduce batch size. Root cause: KV cache too small for traffic pattern. Add GPUs or reduce context length.",
            ),
            # --- Queue buildup ---
            self._alert(
                name="LLMQueueBuildup",
                expr="llm_queue_depth > 200",
                duration="5m",
                severity="warning",
                summary="Queue depth exceeds 200 requests",
                description="Requests are queuing faster than they are being processed.",
                runbook="Check if GPUs are healthy. If utilization is high, need more capacity. If low, check for deadlocks or hangs.",
            ),
        ]
        return self._format_rules(rules)

    def _alert(self, name, expr, duration, severity,
               summary, description, runbook):
        """Assemble a single Prometheus alerting-rule dict."""
        annotations = {
            "summary": summary,
            "description": description,
            "runbook": runbook,
        }
        return {
            "alert": name,
            "expr": expr,
            "for": duration,
            "labels": {"severity": severity},
            "annotations": annotations,
        }

    def _format_rules(self, rules):
        """Wrap the rule list in a Prometheus rule-group config."""
        group = {
            "name": "llm_serving_alerts",
            "interval": "30s",
            "rules": rules,
        }
        return {"groups": [group]}
On-Call Playbook
Common Issues and Fixes
@dataclass
class PlaybookEntry:
    """One on-call playbook entry: how to diagnose and fix a fired alert."""

    alert_name: str         # matches the Prometheus alert rule name
    severity: str           # "warning" or "critical"
    symptoms: list          # what the operator observes when this fires
    likely_causes: list     # candidate root causes, most likely first
    diagnostic_steps: list  # ordered checks to identify the root cause
    fix_steps: list         # remediations, immediate mitigation first
    escalation: str         # when and to whom to escalate if fixes fail
# On-call playbook: one entry per critical alert, so the responder can
# go straight from page to diagnosis without reasoning from first
# principles. Keep entries in sync with AlertRuleBuilder rule names.
ONCALL_PLAYBOOK = [
    # High TTFT: usually a prefill/scheduling bottleneck, not compute.
    PlaybookEntry(
        alert_name="LLMHighTTFT",
        severity="critical",
        symptoms=[
            "Users report slow initial response",
            "TTFT p99 above 5 seconds on dashboard",
            "Queue depth may also be elevated",
        ],
        likely_causes=[
            "Long-context requests consuming prefill capacity",
            "KV cache full, causing prefill queuing",
            "GPU utilization at 100% due to large batch",
            "Network issue between router and workers",
        ],
        diagnostic_steps=[
            "1. Check KV cache utilization: "
            " if above 90%, this is the bottleneck",
            "2. Check queue depth per worker: "
            " uneven distribution indicates routing issue",
            "3. Check request context length distribution: "
            " a few 128K-context requests can block the prefill queue",
            "4. Check GPU utilization: "
            " if below 50%, the issue is scheduling not compute",
        ],
        fix_steps=[
            "Immediate: Reduce max_batch_size by 50%",
            "If KV cache full: Set max_context_length lower",
            "If one worker overloaded: Restart the worker's router entry",
            "If persistent: Add more GPU workers",
        ],
        escalation="If not resolved in 30 minutes, escalate to "
        "infrastructure team.",
    ),
    # Preemption storm: KV cache exhaustion evicting in-flight requests.
    PlaybookEntry(
        alert_name="LLMPreemptionStorm",
        severity="critical",
        symptoms=[
            "Preemption rate above 1/second",
            "Throughput drops suddenly",
            "TTFT and TBT both spike",
            "Users see partial responses that restart",
        ],
        likely_causes=[
            "KV cache exhausted: all blocks allocated, new "
            "requests force eviction of in-progress requests",
            "Memory leak in KV cache manager (blocks not freed)",
            "Sudden traffic spike with long-context requests",
        ],
        diagnostic_steps=[
            "1. Check KV cache utilization: should be at/near 100%",
            "2. Check preemption reason label: 'memory' vs 'timeout'",
            "3. Check if free block count is stuck at 0",
            "4. Check for specific long-context requests dominating cache",
        ],
        fix_steps=[
            "Immediate: Reduce max_batch_size to 1 (drastic but stops storm)",
            "Gradually increase batch size while monitoring",
            "If blocks stuck at 0: Restart the worker (possible memory leak)",
            "If traffic spike: Enable request rate limiting",
        ],
        escalation="If blocks never free (suspected leak), escalate to "
        "vLLM/Dynamo engineering.",
    ),
    # High error rate: triage by error_type label first.
    PlaybookEntry(
        alert_name="LLMHighErrorRate",
        severity="critical",
        symptoms=[
            "Error rate above 5%",
            "Users see error messages instead of responses",
            "May also see GPU OOM in logs",
        ],
        likely_causes=[
            "GPU OOM: model + KV cache + activations exceed GPU memory",
            "Model process crashed (segfault, NCCL timeout)",
            "Tokenizer error on malformed input",
            "Timeout: requests taking too long, hitting deadline",
        ],
        diagnostic_steps=[
            "1. Check error_type label: 'oom', 'timeout', 'crash', 'input'",
            "2. Check GPU memory: if at 100%, OOM is the cause",
            "3. Check worker process status: any restarts?",
            "4. Check NCCL logs for communication errors",
        ],
        fix_steps=[
            "OOM: Reduce max_batch_size and max_context_length",
            "Crash: Restart affected workers, check core dumps",
            "Timeout: Increase timeout threshold or reduce batch size",
            "Input error: Add input validation at the router",
        ],
        escalation="If workers keep crashing after restart, "
        "escalate to GPU/driver team.",
    ),
    # GPU memory critical: act before OOM kills in-flight requests.
    PlaybookEntry(
        alert_name="LLMGPUMemoryCritical",
        severity="critical",
        symptoms=[
            "GPU memory above 95%",
            "OOM errors in logs",
            "Preemptions increasing",
        ],
        likely_causes=[
            "Batch size too large for available memory",
            "Long-context requests allocating large KV caches",
            "Memory fragmentation (many small blocks, no large contiguous blocks)",
            "Memory leak (allocated blocks never freed)",
        ],
        diagnostic_steps=[
            "1. Check batch size: compare actual vs configured max",
            "2. Check context length distribution of active requests",
            "3. Check free block count and fragmentation ratio",
            "4. Compare memory before/after serving period (leak detection)",
        ],
        fix_steps=[
            "Immediate: Reduce max_batch_size",
            "Set max_context_length to 4K (from 8K or higher)",
            "If fragmented: Enable block defragmentation",
            "If leaking: Schedule worker restart during low traffic",
        ],
        escalation="Memory leaks require engineering investigation. "
        "File a bug with memory profile attached.",
    ),
]
The most common on-call mistake: seeing high TTFT and reducing batch size when the real cause is KV cache exhaustion. Reducing batch size helps TBT (fewer decode steps per batch) but does not help TTFT if the prefill is queued behind KV cache allocation. Always check KV cache utilization first.
Health Check System
class HealthChecker:
    """
    Comprehensive health check for LLM serving.

    Reads current values out of a metrics collector (anything exposing
    ``.gauges``, ``.counters``, and ``.histograms`` dicts, such as
    LLMMetricsCollector) and returns a structured health report for
    monitoring and automated remediation.  Thresholds mirror the
    Prometheus alert rules so both systems agree on "unhealthy".
    """

    def __init__(self, metrics_collector):
        # Collector exposing .gauges / .counters / .histograms dicts.
        self.metrics = metrics_collector

    def check(self):
        """Run all health checks and aggregate an overall status.

        Returns {"overall": str, "timestamp": float, "checks": dict},
        where overall is the worst individual status
        (critical > warning > healthy).
        """
        checks = {
            "gpu_health": self._check_gpus(),
            "kv_cache_health": self._check_kv_cache(),
            "latency_health": self._check_latency(),
            "error_health": self._check_errors(),
            "queue_health": self._check_queue(),
        }
        # Overall status is the worst individual status.
        statuses = [c["status"] for c in checks.values()]
        if "critical" in statuses:
            overall = "critical"
        elif "warning" in statuses:
            overall = "warning"
        else:
            overall = "healthy"
        return {
            "overall": overall,
            "timestamp": time.time(),
            "checks": checks,
        }

    def _check_gpus(self):
        """Check GPU health: flag saturated (>99%) and idle (<10%) GPUs.

        Status is derived from the numeric utilization values directly,
        rather than pattern-matching the formatted issue strings (the
        previous approach, which was fragile against message changes).
        """
        utils = self.metrics.gauges.get("llm_gpu_utilization", {})
        issues = []
        unhealthy = False
        for gpu_id, util in utils.items():
            if util > 0.99:
                # Pegged GPU: possible saturation or a hung kernel.
                unhealthy = True
                issues.append(f"{gpu_id}: utilization at {util*100:.0f}%")
            elif util < 0.1:
                # Near-idle GPU: possible routing problem or dead worker.
                unhealthy = True
                issues.append(f"{gpu_id}: utilization at {util*100:.0f}% (idle?)")
        status = "warning" if unhealthy else "healthy"
        return {"status": status, "issues": issues}

    def _check_kv_cache(self):
        """Check KV cache health: warn above 85%, critical above 95%."""
        kv_utils = self.metrics.gauges.get(
            "llm_kv_cache_utilization", {}
        )
        # Judge by the single worst worker: one full cache is enough
        # to start preemptions on that worker.
        max_util = max(kv_utils.values()) if kv_utils else 0
        if max_util > 0.95:
            return {"status": "critical", "max_utilization": max_util}
        elif max_util > 0.85:
            return {"status": "warning", "max_utilization": max_util}
        return {"status": "healthy", "max_utilization": max_util}

    def _check_latency(self):
        """Check TTFT health over a recent window of observations."""
        ttft_obs = self.metrics.histograms.get(
            "llm_ttft_seconds", {}
        ).get("observations", {})
        # Look at only the last 100 observations per label set so old
        # samples cannot mask a current latency regression.
        all_obs = []
        for obs_list in ttft_obs.values():
            all_obs.extend(obs_list[-100:])
        if not all_obs:
            return {"status": "healthy", "note": "No recent observations"}
        # Local import: numpy is only needed for the percentile math.
        import numpy as np
        p99 = np.percentile(all_obs, 99)
        # Thresholds match the LLMHighTTFT / LLMElevatedTTFT alerts.
        if p99 > 5.0:
            return {"status": "critical", "ttft_p99": p99}
        elif p99 > 2.0:
            return {"status": "warning", "ttft_p99": p99}
        return {"status": "healthy", "ttft_p99": round(p99, 3)}

    def _check_errors(self):
        """Check the cumulative error rate (errors / total requests)."""
        errors = sum(self.metrics.counters.get("llm_error_total", {}).values())
        total = sum(self.metrics.counters.get("llm_requests_total", {}).values())
        if total == 0:
            # No traffic yet: nothing meaningful to report.
            return {"status": "healthy", "error_rate": 0}
        rate = errors / total
        if rate > 0.05:
            return {"status": "critical", "error_rate": rate}
        elif rate > 0.01:
            return {"status": "warning", "error_rate": rate}
        return {"status": "healthy", "error_rate": round(rate, 4)}

    def _check_queue(self):
        """Check queue depth: warn above 50, critical above 200."""
        depths = self.metrics.gauges.get("llm_queue_depth", {})
        max_depth = max(depths.values()) if depths else 0
        if max_depth > 200:
            return {"status": "critical", "max_depth": max_depth}
        elif max_depth > 50:
            return {"status": "warning", "max_depth": max_depth}
        return {"status": "healthy", "max_depth": max_depth}
Key Takeaways
Production LLM monitoring requires LLM-specific metrics (TTFT, TBT, KV cache utilization, preemption rate) that do not exist in traditional web service monitoring. Generic CPU/memory/latency dashboards miss the most common LLM failure modes.
The essential monitoring stack:
- Metrics hierarchy: User-facing (TTFT, TBT, error rate), system (GPU util, KV cache, queue), diagnostic (preemptions, per-worker stats). Dashboards should be organized in this order — the first thing you see should be user impact.
- TTFT and TBT are separate concerns: TTFT depends on prefill compute and queue depth. TBT depends on decode batch size and GPU compute. Different root causes, different fixes. A dashboard that only shows "average latency" conflates these.
- KV cache utilization is the key metric: When KV cache utilization reaches 95%, preemptions start, TTFT spikes, and throughput collapses. This is the most common failure mode in production LLM serving. Alert at 85% (warning) and 95% (critical).
- Playbooks prevent mean-time-to-diagnose inflation: When the on-call engineer gets paged at 3 AM for a preemption storm, they should not have to think from first principles. The playbook says: check KV cache, if full, reduce batch size. Immediate fix in under 2 minutes.
- Progressive severity: Info alerts for deviations from baseline, warning alerts for conditions that will become problems, critical alerts for conditions that are currently impacting users. Page only on critical. Everything else goes to Slack.