In the first month of production serving, our vLLM cluster hit CUDA OOM errors on 3.7% of decode steps — enough to cause visible user-facing failures every few minutes. CUDA driver errors, NaN outputs, tensor shape mismatches, and worker crashes added another 1.2%. Left unhandled, these errors cascade: one failed request preempts others, freed KV blocks trigger memory fragmentation, and eventually the entire worker needs a restart that drops all in-flight requests. vLLM v1’s error handling prevents this cascade through multiple defensive layers: preemptive memory management that stops new requests before OOM, graceful degradation that sacrifices throughput to preserve correctness, request-level retry for transient failures, and fast worker restart for fatal errors. This post covers each layer with code paths and the configuration knobs that tune recovery behavior.
CUDA OOM: The Most Common Production Error
CUDA OOM occurs when a memory allocation request exceeds the GPU memory currently free. In LLM serving, this typically happens when the KV cache grows beyond the allocated budget.
class CUDAOOMHandler:
    """Classify CUDA memory errors and pick a recovery action.

    Root causes seen in production:
    1. KV cache exceeds allocated blocks (most common)
    2. Activation memory spike during prefill of long sequence
    3. Memory fragmentation after many allocations/frees
    4. Concurrent model + KV cache exceeds total GPU memory
    """

    def __init__(self, config: dict):
        self.max_retries = config.get("max_retries", 3)
        self.preemption_mode = config.get("preemption_mode", "recompute")
        self.gpu_memory_utilization = config.get("gpu_memory_utilization", 0.90)

    def handle_oom(self, error: Exception, context: dict) -> dict:
        """Route a CUDA error to the appropriate recovery path.

        OOM (recoverable by freeing memory) is distinguished from driver
        errors (unrecoverable; require a worker restart). Anything else
        is propagated to the caller untouched.
        """
        message = str(error)
        if "out of memory" in message.lower():
            return self._handle_kv_cache_oom(context)
        if "CUDA error" in message:
            return self._handle_cuda_driver_error(context)
        return {"action": "propagate", "error": message}

    def _handle_kv_cache_oom(self, context: dict) -> dict:
        """Free KV blocks by preempting the most memory-hungry request.

        Two preemption modes: recompute (discard KV, re-prefill later)
        or swap (move KV blocks to CPU memory).
        """
        candidates = context.get("running_requests", [])
        if not candidates:
            # OOM with nothing to preempt means memory accounting is broken.
            return {"action": "restart_worker", "reason": "oom_no_requests"}
        # Victim = request holding the most KV blocks (frees the most memory).
        victim = max(candidates, key=lambda req: req["kv_blocks_allocated"])
        if self.preemption_mode == "recompute":
            return {
                "action": "preempt_recompute",
                "victim_request_id": victim["request_id"],
                "blocks_freed": victim["kv_blocks_allocated"],
                "cost": "Must re-prefill when rescheduled",
            }
        if self.preemption_mode == "swap":
            return {
                "action": "preempt_swap",
                "victim_request_id": victim["request_id"],
                "blocks_freed": victim["kv_blocks_allocated"],
                "cost": "CPU memory used, swap-in latency later",
            }
        # Unknown preemption mode: fall back to a full restart.
        return {"action": "restart_worker"}

    def _handle_cuda_driver_error(self, context: dict) -> dict:
        """Handle CUDA driver errors (ECC, hardware fault, driver crash).

        These corrupt the CUDA context and are unrecoverable in-process;
        the only option is a worker restart.
        """
        return {
            "action": "restart_worker",
            "reason": "cuda_driver_error",
            "all_requests_failed": True,
            "retry_eligible": True,
        }
CUDA OOM Recovery Strategies
| Strategy | Recovery Time | Data Loss | When Used |
|---|---|---|---|
| Preempt (recompute) | 0 ms | KV cache of 1 request | Memory pressure |
| Preempt (swap to CPU) | 5-50 ms | None (swapped) | High memory pressure |
| Reduce batch size | 0 ms | None | Sustained pressure |
| Worker restart | 5-30 sec | All in-flight requests | Fatal CUDA error |
CUDA OOM is NOT the same as a CUDA driver error. OOM is recoverable by freeing memory (preempting requests). A CUDA driver error (ECC failure, illegal memory access) corrupts the CUDA context and requires a full process restart. vLLM v1 distinguishes between these two cases: OOM triggers preemption, driver errors trigger worker restart.
Preemptive Memory Management
The best OOM handling is preventing OOM from occurring.
class PreemptiveMemoryManager:
    """Prevent OOM by monitoring memory and acting before exhaustion.

    Three watermarks divide utilization into normal / high / critical /
    oom-imminent bands, each with an escalating recommended action.
    """

    def __init__(self, config: dict):
        self.watermark_high = config.get("watermark_high", 0.90)
        self.watermark_critical = config.get("watermark_critical", 0.95)
        self.watermark_oom = config.get("watermark_oom", 0.98)
        # Total GPU memory in GB used for admission headroom estimates.
        # Previously hard-coded to 80 GB (A100/H100); now configurable so
        # the manager works on other GPU sizes. Default preserves old behavior.
        self.gpu_total_gb = config.get("gpu_total_gb", 80)

    def check_memory_pressure(self, gpu_memory_used_gb: float,
                              gpu_memory_total_gb: float) -> dict:
        """Check memory utilization and recommend an action.

        Returns a dict with status band, recommended action, and whether
        new requests may still be admitted.
        """
        utilization = gpu_memory_used_gb / gpu_memory_total_gb
        if utilization < self.watermark_high:
            return {
                "status": "normal",
                "action": "none",
                "can_accept_new_requests": True,
            }
        elif utilization < self.watermark_critical:
            return {
                "status": "high",
                "action": "stop_accepting_new_requests",
                "can_accept_new_requests": False,
                "reason": "Memory above high watermark",
            }
        elif utilization < self.watermark_oom:
            return {
                "status": "critical",
                "action": "preempt_lowest_priority",
                "can_accept_new_requests": False,
                "num_requests_to_preempt": 1,
            }
        else:
            # Above the OOM watermark: preempt aggressively to avoid a crash.
            return {
                "status": "oom_imminent",
                "action": "emergency_preempt",
                "can_accept_new_requests": False,
                "num_requests_to_preempt": 3,
            }

    def admission_control(self, new_request: dict,
                          current_utilization: float) -> dict:
        """Decide whether to admit a new request based on a memory forecast.

        Estimated need = (input_tokens + max_output_tokens) * kv_bytes_per_token,
        compared against current headroom minus a 10% safety margin.
        """
        estimated_input_tokens = new_request.get("input_length", 0)
        estimated_output_tokens = new_request.get("max_tokens", 1024)
        kv_bytes_per_token = new_request.get("kv_bytes_per_token", 80)  # bytes
        estimated_memory_mb = (
            (estimated_input_tokens + estimated_output_tokens) *
            kv_bytes_per_token / 1e6
        )
        # Headroom derived from configured GPU size (GB -> MB).
        headroom_mb = (1 - current_utilization) * self.gpu_total_gb * 1024
        safety_margin_mb = headroom_mb * 0.1  # Keep 10% safety margin
        if estimated_memory_mb < headroom_mb - safety_margin_mb:
            return {"admit": True}
        else:
            return {
                "admit": False,
                "reason": "insufficient_memory",
                "estimated_need_mb": estimated_memory_mb,
                "available_mb": headroom_mb - safety_margin_mb,
                "retry_after_ms": 500,
            }
Memory Watermark Levels
Request Retry Logic
When a request fails due to a recoverable error, vLLM can retry it automatically.
class RequestRetryManager:
    """Decide whether and when failed requests should be retried."""

    def __init__(self, config: dict):
        self.max_retries = config.get("max_retries", 3)
        self.base_delay_ms = config.get("base_delay_ms", 100)
        self.max_delay_ms = config.get("max_delay_ms", 5000)
        # Transient failures worth another attempt.
        self.retryable_errors = {
            "cuda_oom",
            "preempted",
            "worker_restarted",
            "timeout",
        }
        # Failures that will never succeed on retry.
        self.non_retryable_errors = {
            "invalid_input",
            "model_not_found",
            "token_limit_exceeded",
            "cancelled_by_client",
        }

    def should_retry(self, error_type: str, attempt: int) -> dict:
        """Return a retry decision for a failed request.

        Uses exponential backoff (base * 2^attempt, capped) plus up to
        10% random jitter to avoid synchronized retry storms.
        """
        if error_type in self.non_retryable_errors:
            return {"retry": False, "reason": f"Non-retryable error: {error_type}"}
        if attempt >= self.max_retries:
            return {"retry": False, "reason": f"Max retries ({self.max_retries}) exceeded"}
        if error_type not in self.retryable_errors:
            return {"retry": False, "reason": f"Unknown error type: {error_type}"}
        import random
        backoff_ms = min(self.base_delay_ms * (2 ** attempt), self.max_delay_ms)
        backoff_ms += random.uniform(0, backoff_ms * 0.1)
        return {
            "retry": True,
            "delay_ms": backoff_ms,
            "attempt": attempt + 1,
            "max_attempts": self.max_retries,
        }

    def handle_preempted_request(self, request: dict) -> dict:
        """Re-queue a preempted request with a priority boost.

        The boost prevents starvation of requests that were sacrificed
        to relieve memory pressure.
        """
        return {
            "action": "requeue",
            "priority_boost": True,
            "original_request_id": request["request_id"],
            "tokens_generated_before_preempt": request.get("output_tokens", 0),
            "resume_strategy": request.get("preemption_mode", "recompute"),
        }
Retry Configuration by Error Type
| Error Type | Retryable | Max Retries | Backoff Strategy | Typical Recovery |
|---|---|---|---|---|
| CUDA OOM (preempted) | Yes | 3 | Exponential 100ms-5s | 100-500 ms |
| Worker restart | Yes | 2 | Fixed 5s | 5-30 sec |
| Timeout | Yes | 2 | Exponential 1s-10s | 1-10 sec |
| Invalid input | No | - | - | Return error |
| NaN output | Yes | 1 | Fixed 100ms | 100 ms |
Worker Process Recovery
When a GPU worker crashes (CUDA driver error, segfault, Python exception), vLLM must restart it without losing the entire serving deployment.
class WorkerRecoveryManager:
    """Manage worker process lifecycle and recovery.

    Tracks restarts in a sliding one-hour window so the circuit breaker
    genuinely enforces "max restarts per hour". (The previous version
    incremented a counter that never decayed, so after max_restarts_per_hour
    total restarts the breaker latched open forever.)
    """

    def __init__(self, config: dict):
        self.num_workers = config.get("tensor_parallel_size", 1)
        self.restart_timeout_sec = config.get("restart_timeout", 60)
        self.max_restarts_per_hour = config.get("max_restarts_per_hour", 5)
        # Count of restarts within the last hour (kept for compatibility).
        self.restart_count = 0
        # Monotonic timestamps of recent restarts; pruned to the last hour.
        self._restart_times: list = []

    def detect_worker_failure(self, worker_id: int) -> dict:
        """Detect that a worker process has failed.

        Detection methods:
        1. Process exit code != 0
        2. Heartbeat timeout (no response in N seconds)
        3. CUDA error reported via health check
        """
        return {
            "worker_id": worker_id,
            "detection_method": "heartbeat_timeout",
            "failure_type": "cuda_driver_error",
        }

    def restart_worker(self, worker_id: int) -> dict:
        """Restart a failed worker process.

        Steps:
        1. Kill the old process (if still running)
        2. Release GPU resources (CUDA context destroy)
        3. Launch new process with same GPU assignment
        4. Reload model weights
        5. Re-initialize KV cache
        6. Signal readiness to scheduler

        For TP (tensor parallel): ALL workers must be restarted
        because NCCL communicators are invalidated.
        """
        import time
        now = time.monotonic()
        # Sliding window: only restarts from the last hour count against the cap.
        self._restart_times = [t for t in self._restart_times if now - t < 3600.0]
        self.restart_count = len(self._restart_times)
        if self.restart_count >= self.max_restarts_per_hour:
            return {
                "action": "circuit_breaker_open",
                "reason": f"Too many restarts ({self.restart_count}/hr)",
                "recommendation": "Manual investigation required",
            }
        self._restart_times.append(now)
        self.restart_count = len(self._restart_times)
        # For TP > 1: must restart all workers (NCCL requirement)
        workers_to_restart = list(range(self.num_workers))
        # Estimate restart time
        model_load_sec = 30  # Typical for 70B FP16 from disk
        kv_cache_init_sec = 2
        nccl_init_sec = 5
        total_restart_sec = model_load_sec + kv_cache_init_sec + nccl_init_sec
        return {
            "action": "restart_all_workers",
            "workers": workers_to_restart,
            "estimated_downtime_sec": total_restart_sec,
            "requests_affected": "all_in_flight",
            "mitigation": "Requests will be retried after restart",
        }

    def fast_restart_with_checkpointing(self, worker_id: int) -> dict:
        """Fast restart using model weight caching.

        If model weights are cached in shared memory or a tmpfs mount,
        reload is much faster (memory copy vs disk read).
        """
        # Model in shared memory: ~2 sec to remap
        # Model on NVMe: ~10 sec for 70B
        # Model on disk: ~30 sec for 70B
        return {
            "model_source": "shared_memory",
            "model_load_sec": 2,
            "kv_cache_init_sec": 2,
            "nccl_reinit_sec": 5,
            "total_restart_sec": 9,
            "downtime_reduction": "3x faster than disk reload",
        }
Worker Restart Time by Model Loading Strategy
In tensor parallel configurations (TP > 1), a single worker failure requires restarting ALL workers because NCCL communicators are tied to the process group. This means a single GPU error in an 8-GPU TP=8 setup causes a full restart of all 8 workers. This is a known limitation of NCCL and is one reason to prefer pipeline parallelism (PP) for reliability — a PP stage failure only affects that stage.
NaN Detection and Recovery
NaN (Not a Number) outputs can propagate silently through the model, producing garbage text.
class NaNDetector:
    """Detect and handle NaN values in model outputs."""

    def __init__(self, config: dict):
        self.check_frequency = config.get("nan_check_frequency", "every_step")
        self.action_on_nan = config.get("nan_action", "retry_with_different_seed")

    def check_output(self, logits: list, request_id: str) -> dict:
        """Check model output for NaN or inf values.

        Where NaNs appear:
        1. Logits (most common: attention score overflow)
        2. Hidden states (rare: weight corruption)
        3. KV cache (very rare: memory corruption)
        """
        has_nan = any(x != x for x in logits)  # NaN != NaN is True
        # Treats both true infinities and near-overflow magnitudes as "inf";
        # NaN compares False here, so the two checks don't overlap.
        has_inf = any(abs(x) > 1e30 for x in logits)
        if has_nan or has_inf:
            return {
                "detected": True,
                "type": "nan" if has_nan else "inf",
                "request_id": request_id,
                "action": self._determine_action(),
            }
        return {"detected": False}

    def _determine_action(self) -> dict:
        """Determine recovery action for NaN outputs.

        Falls back to abort_request when the configured nan_action is not
        recognized (previously raised KeyError on a typo'd config value).
        """
        actions = {
            "retry_with_different_seed": {
                "description": "Retry the same request with a different random seed",
                "success_rate": "80% (if NaN was caused by numerical edge case)",
                "cost": "One additional forward pass",
            },
            "clamp_and_continue": {
                "description": "Clamp logits to [-100, 100] and sample anyway",
                "success_rate": "60% (output quality may degrade)",
                "cost": "None (in-place fix)",
            },
            "abort_request": {
                "description": "Return error to client",
                "success_rate": "N/A",
                "cost": "Client must retry",
            },
        }
        # Safe fallback: an unknown config value aborts rather than crashes.
        return actions.get(self.action_on_nan, actions["abort_request"])

    def investigate_nan_source(self) -> dict:
        """Common causes and diagnostics for NaN in LLM inference."""
        return {
            "attention_overflow": {
                "cause": "QK^T values too large, softmax overflows",
                "symptoms": "NaN in attention output, often at long contexts",
                "fix": "Use FlashAttention (numerically stable)",
                "vllm_default": "FlashAttention enabled by default",
            },
            "fp16_overflow": {
                "cause": "FP16 max is 65504, intermediate values exceed this",
                "symptoms": "Sporadic NaN, more common with aggressive quantization",
                "fix": "Use BF16 (larger range) or FP32 accumulation",
            },
            "weight_corruption": {
                "cause": "GPU memory error (ECC failure, cosmic ray)",
                "symptoms": "Persistent NaN for all requests after corruption",
                "fix": "Restart worker (reload model weights)",
            },
            "kv_cache_corruption": {
                "cause": "Memory allocation bug or hardware error",
                "symptoms": "NaN only for specific cached requests",
                "fix": "Evict corrupted KV blocks, re-prefill",
            },
        }
NaN Sources and Recovery
| Source | Frequency | Recovery | Success Rate |
|---|---|---|---|
| Attention overflow | Rare (with FlashAttn) | Automatic (FlashAttn) | 99.9% |
| FP16 overflow | Occasional | Retry with BF16 | 95% |
| Weight corruption | Very rare | Worker restart | 100% |
| KV cache corruption | Very rare | Evict + re-prefill | 100% |
| Quantization artifacts | Model-dependent | Higher precision | 90% |
Graceful Degradation Under Load
When the system is overloaded, graceful degradation is better than hard failures.
class GracefulDegradation:
    """Degrade service quality gracefully under extreme load."""

    def __init__(self, config: dict):
        self.max_queue_depth = config.get("max_queue_depth", 1000)
        # Ordered least-to-most severe; matching scans from the severe end.
        self.degradation_levels = [
            {"queue_depth": 100, "action": "reduce_max_tokens", "max_tokens": 2048},
            {"queue_depth": 500, "action": "reduce_max_tokens", "max_tokens": 512},
            {"queue_depth": 800, "action": "reject_low_priority", "priority_threshold": 3},
            {"queue_depth": 1000, "action": "reject_all_new", "reason": "system_overloaded"},
        ]

    def evaluate_degradation(self, current_queue_depth: int,
                             request_priority: int) -> dict:
        """Determine degradation level and action for a new request."""
        # Most severe threshold crossed by the current queue depth, if any.
        level = next(
            (lvl for lvl in reversed(self.degradation_levels)
             if current_queue_depth >= lvl["queue_depth"]),
            None,
        )
        if level is None:
            return {"admit": True, "degraded": False}
        kind = level["action"]
        if kind == "reduce_max_tokens":
            return {
                "admit": True,
                "degraded": True,
                "original_max_tokens": "as_requested",
                "effective_max_tokens": level["max_tokens"],
                "reason": "Queue depth exceeded threshold",
            }
        if kind == "reject_low_priority":
            # Higher priority number = lower priority; reject at/above threshold.
            if request_priority >= level["priority_threshold"]:
                return {"admit": False, "reason": "low_priority_rejected"}
            return {"admit": True, "degraded": False}
        if kind == "reject_all_new":
            return {
                "admit": False,
                "reason": "system_overloaded",
                "retry_after_sec": 10,
                "http_status": 503,
            }
        return {"admit": True, "degraded": False}

    def circuit_breaker(self, error_rate_pct: float,
                        window_sec: int = 60) -> dict:
        """Circuit breaker: stop accepting requests when the error rate spikes."""
        if error_rate_pct > 50:
            return {
                "state": "open",
                "action": "reject_all",
                "reason": f"Error rate {error_rate_pct:.0f}% exceeds 50%",
                "retry_after_sec": 30,
            }
        if error_rate_pct > 20:
            return {
                "state": "half_open",
                "action": "admit_10_pct",
                "reason": f"Error rate {error_rate_pct:.0f}% exceeds 20%",
            }
        return {"state": "closed", "action": "normal"}
Graceful Degradation Levels
| Queue Depth | Action | Impact on New Requests | Existing Requests |
|---|---|---|---|
| 0-100 | Normal operation | None | None |
| 100-500 | Reduce max tokens to 2048 | Shorter responses | Unaffected |
| 500-800 | Reduce max tokens to 512 | Much shorter responses | Unaffected |
| 800-1000 | Reject low priority | Low-priority rejected | Unaffected |
| 1000+ | Reject all new | All new requests 503 | Drain existing |
Health Check and Monitoring
Production health checks must detect problems before they cause user-visible errors.
class HealthCheckSystem:
    """Multi-level health check for vLLM serving."""

    def liveness_check(self) -> dict:
        """Liveness: is the process running?

        Checked by Kubernetes every 10 seconds. Returns the real process
        id (the previous version returned a hard-coded placeholder 12345,
        which is useless for correlating probes with logs).
        """
        import os
        return {"alive": True, "pid": os.getpid()}

    def readiness_check(self, engine_state: dict) -> dict:
        """Readiness: can the system accept new requests?

        All sub-checks must pass; otherwise the probe returns 503 so the
        load balancer stops routing traffic here.
        """
        checks = {
            "model_loaded": engine_state.get("model_loaded", False),
            "workers_ready": engine_state.get("all_workers_ready", False),
            "memory_available": engine_state.get("memory_utilization", 1.0) < 0.98,
            "error_rate_ok": engine_state.get("error_rate_pct", 100) < 50,
        }
        all_ready = all(checks.values())
        return {
            "ready": all_ready,
            "checks": checks,
            "http_status": 200 if all_ready else 503,
        }

    def deep_health_check(self, engine_state: dict) -> dict:
        """Deep health: detailed system state for monitoring.

        Run every 30 seconds, report to Prometheus/Grafana. Missing fields
        default to 0 (except GPU total, which defaults to 80 GB).
        """
        return {
            "gpu_utilization_pct": engine_state.get("gpu_util", 0),
            "gpu_memory_used_gb": engine_state.get("gpu_mem_used", 0),
            "gpu_memory_total_gb": engine_state.get("gpu_mem_total", 80),
            "kv_cache_utilization_pct": engine_state.get("kv_util", 0),
            "running_requests": engine_state.get("running", 0),
            "waiting_requests": engine_state.get("waiting", 0),
            "requests_per_second": engine_state.get("rps", 0),
            "avg_ttft_ms": engine_state.get("avg_ttft", 0),
            "p99_ttft_ms": engine_state.get("p99_ttft", 0),
            "avg_tpot_ms": engine_state.get("avg_tpot", 0),
            "p99_tpot_ms": engine_state.get("p99_tpot", 0),
            "error_count_last_minute": engine_state.get("errors_1m", 0),
            "preemption_count_last_minute": engine_state.get("preemptions_1m", 0),
            "worker_restarts_last_hour": engine_state.get("restarts_1h", 0),
        }
Configure Kubernetes liveness probes with a generous timeout (30 seconds) for vLLM. Model loading and worker initialization can take 30-60 seconds. If the liveness probe timeout is shorter than the startup time, Kubernetes will kill and restart the pod in a loop.
Error Handling Configuration
def recommended_error_handling_config() -> dict:
    """Recommended error handling configuration for production vLLM.

    Built from per-concern groups, then merged into one flat config dict.
    """
    memory = {
        "gpu_memory_utilization": 0.90,   # Leave 10% headroom
        "max_num_seqs": 256,              # Limit concurrent sequences
        "max_model_len": 32768,           # Cap sequence length
    }
    preemption = {
        "preemption_mode": "recompute",   # Cheaper than swap
        "enable_prefix_caching": True,    # Reduces re-prefill cost
    }
    retry = {
        "max_retries": 3,
        "retry_base_delay_ms": 100,
        "retry_max_delay_ms": 5000,
    }
    worker_recovery = {
        "worker_restart_timeout_sec": 60,
        "max_worker_restarts_per_hour": 5,
    }
    health = {
        "liveness_probe_timeout_sec": 30,
        "readiness_probe_timeout_sec": 10,
        "health_check_interval_sec": 10,
    }
    degradation = {
        "max_queue_depth": 1000,
        "circuit_breaker_error_threshold_pct": 50,
    }
    nan_handling = {
        "nan_check": True,
        "nan_action": "retry_with_different_seed",
    }
    return {
        **memory,
        **preemption,
        **retry,
        **worker_recovery,
        **health,
        **degradation,
        **nan_handling,
    }
Error Recovery Overhead by Error Type
Error handling in vLLM v1 follows a clear hierarchy: prevent errors through proactive memory management, handle recoverable errors through preemption and retry, and recover from fatal errors through worker restart. The most critical configuration choices are gpu_memory_utilization (leave headroom to avoid OOM), preemption_mode (recompute is cheaper for most workloads), and model weight caching (shared memory or NVMe for fast restart). In a well-configured deployment, most errors are handled transparently — the client sees a slightly delayed response rather than an error.