“CUDA out of memory” is the most common error in deep learning. But nvidia-smi only shows total usage—not what’s consuming memory or why it’s fragmented. Here’s how to actually diagnose GPU memory issues.
PyTorch Memory Statistics
import torch
def print_memory_stats(stats=None):
    """Print detailed GPU memory statistics.

    Args:
        stats: Optional mapping in the flat key format returned by
            ``torch.cuda.memory_stats()``. When omitted, the statistics of the
            current CUDA device are queried; if CUDA is unavailable the
            function returns without printing anything.
    """
    if stats is None:
        if not torch.cuda.is_available():
            return
        stats = torch.cuda.memory_stats()

    def stat(key):
        # The stats mapping can be empty (or partial) before the allocator has
        # been used, so a missing counter is reported as zero rather than
        # raising KeyError.
        return stats.get(key, 0)

    allocated = stat('allocated_bytes.all.current')
    reserved = stat('reserved_bytes.all.current')

    print("=== Memory Usage ===")
    print(f"Allocated: {allocated / 1e9:.2f} GB")
    print(f"Reserved: {reserved / 1e9:.2f} GB")
    print(f"Free (in reserved): {(reserved - allocated) / 1e9:.2f} GB")

    print("\n=== Allocation Counts ===")
    print(f"Active allocations: {stat('active.all.current')}")
    print(f"Allocation requests: {stat('allocation.all.current')}")

    print("\n=== Fragmentation ===")
    print(f"Inactive split blocks: {stat('inactive_split_bytes.all.current') / 1e6:.1f} MB")
    # 'segment.all.current' is the segment count; 'num_alloc_retries' is a
    # different counter (failed cudaMalloc calls that forced a cache flush and
    # retry), so it is reported on its own line.
    print(f"Number of segments: {stat('segment.all.current')}")
    print(f"Allocation retries: {stat('num_alloc_retries')}")


# Usage
print_memory_stats()
Memory Snapshot Analysis
PyTorch can record memory snapshots for detailed analysis:
import torch
from torch.cuda import memory
def capture_memory_snapshot(filename="memory_snapshot.pickle"):
    """Record CUDA allocator history, snapshot it, and pickle it to disk.

    Args:
        filename: Path of the pickle file the snapshot is written to.

    Returns:
        The object returned by ``torch.cuda.memory._snapshot()``.
    """
    import pickle

    # Start recording
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )
    try:
        # Run your model/code here
        # ...

        # Save snapshot
        snapshot = torch.cuda.memory._snapshot()
        with open(filename, 'wb') as f:
            pickle.dump(snapshot, f)
    finally:
        # Stop recording even when the workload, the snapshot, or the file
        # write fails, so history recording is not left enabled.
        torch.cuda.memory._record_memory_history(enabled=None)
    return snapshot
def analyze_snapshot(snapshot, top_n=10, stack_depth=3):
    """Summarize active allocations in a memory snapshot by allocation site.

    Blocks in state ``'active_allocated'`` that carry stack frames are grouped
    by their first ``stack_depth`` frames (``"filename:line"`` each). Blocks
    without frames are skipped.

    Args:
        snapshot: Mapping with a ``'segments'`` list; each segment has a
            ``'blocks'`` list of dicts with ``'state'``, ``'size'`` and
            optionally ``'frames'``.
        top_n: Number of allocation sites to print.
        stack_depth: Number of leading frames used to identify a site.

    Returns:
        List of ``(site, info)`` pairs for every site, sorted by total size in
        descending order. ``site`` is a tuple of ``"filename:line"`` strings
        and ``info`` is ``{'count': int, 'size': int}``.
    """
    from collections import defaultdict

    # Group allocations by stack trace
    allocation_sites = defaultdict(lambda: {'count': 0, 'size': 0})
    for seg in snapshot['segments']:
        for block in seg['blocks']:
            if block['state'] != 'active_allocated':
                continue
            frames = block.get('frames', [])
            if not frames:
                continue
            key = tuple(f"{f['filename']}:{f['line']}" for f in frames[:stack_depth])
            site_info = allocation_sites[key]
            site_info['count'] += 1
            site_info['size'] += block['size']

    # Print top allocators
    sorted_sites = sorted(
        allocation_sites.items(),
        key=lambda item: item[1]['size'],
        reverse=True,
    )
    print("Top memory allocations by source:")
    for site, info in sorted_sites[:top_n]:
        print(f" {info['size']/1e6:.1f} MB ({info['count']} allocs)")
        for frame in site:
            print(f" {frame}")
    return sorted_sites
Use torch.cuda.memory._dump_snapshot("memory_snapshot.pickle") to write the snapshot to a file, then load that file into the interactive viewer at https://pytorch.org/memory_viz for exploration.
Finding Memory Leaks
class MemoryTracker:
    """Compare CUDA memory usage against a recorded baseline to spot leaks."""

    def __init__(self):
        # History of labelled readings taken via snapshot().
        self.snapshots = []
        # Bytes allocated when mark_baseline() was last called; None until then.
        self.baseline = None

    def mark_baseline(self):
        """Record the currently allocated bytes as the reference point."""
        torch.cuda.synchronize()
        self.baseline = torch.cuda.memory_allocated()

    def check_leak(self, label=""):
        """Report whether allocated memory grew by more than 1 MB.

        Returns True (and prints a message tagged with ``label``) when the
        growth since the baseline exceeds 1e6 bytes; returns False otherwise,
        including when no baseline has been marked.
        """
        torch.cuda.synchronize()
        now_allocated = torch.cuda.memory_allocated()
        if self.baseline is None:
            return False
        growth = now_allocated - self.baseline
        if growth <= 1e6:
            return False
        print(f"[{label}] Memory leak detected: +{growth/1e6:.1f} MB")
        return True

    def snapshot(self, label=""):
        """Append a labelled reading of allocated and reserved bytes."""
        torch.cuda.synchronize()
        reading = {
            'label': label,
            'allocated': torch.cuda.memory_allocated(),
            'reserved': torch.cuda.memory_reserved(),
        }
        self.snapshots.append(reading)
# Usage pattern
# NOTE: `dataloader`, `model`, `criterion`, `optimizer` and `target` are
# expected to be defined by the surrounding training code.
tracker = MemoryTracker()
for i, batch in enumerate(dataloader):
    output = model(batch)  # Process
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if i == 0:
        # Take the baseline after one complete step: the first iteration
        # typically allocates state that legitimately persists (for example
        # optimizer buffers, plus the still-referenced `output` and `loss`),
        # which a pre-loop baseline would report as a leak on every batch.
        tracker.mark_baseline()
    else:
        tracker.check_leak(f"Batch {i}")
Fragmentation Detection
def check_fragmentation(test_sizes=(1e9, 500e6, 100e6, 50e6), threshold=0.3):
    """Estimate fragmentation of the cached CUDA memory pool.

    The largest size in ``test_sizes`` that can be allocated as one contiguous
    float32 tensor is compared with the free space inside the reserved pool
    (reserved minus allocated bytes). This is a heuristic: a probe may be
    satisfied by fresh device memory rather than by the cached pool.

    Args:
        test_sizes: Probe sizes in bytes (1GB, 500MB, etc.). They are tried
            from largest to smallest.
        threshold: Ratio above which a warning is printed.

    Returns:
        Fragmentation ratio in ``[0.0, 1.0]``. ``0.0`` is returned when CUDA
        is unavailable or when the pool holds no free cached memory.

    Raises:
        RuntimeError: If a probe allocation fails for a reason other than
            running out of memory.
    """
    if not torch.cuda.is_available():
        # Nothing to measure without a CUDA device.
        return 0.0

    allocated = torch.cuda.memory_allocated()
    reserved = torch.cuda.memory_reserved()
    free_space = reserved - allocated

    # Try to allocate a large contiguous block, largest candidate first.
    largest_allocatable = 0
    for size in sorted(test_sizes, reverse=True):
        try:
            probe = torch.empty(int(size // 4), dtype=torch.float32, device='cuda')
        except RuntimeError as e:
            # Only an out-of-memory failure means "this size does not fit";
            # anything else is a real error and must not be hidden.
            if "out of memory" not in str(e):
                raise
            continue
        largest_allocatable = size
        del probe
        break

    if free_space > 0:
        # The probe can exceed the cached free space when it is served by
        # newly obtained device memory, which would make the raw ratio
        # negative; clamp it to a meaningful range.
        raw_ratio = 1 - (largest_allocatable / free_space)
        fragmentation_ratio = min(1.0, max(0.0, raw_ratio))
    else:
        # No free cached memory means there is nothing to be fragmented.
        fragmentation_ratio = 0.0

    print(f"Free space: {free_space/1e9:.2f} GB")
    print(f"Largest allocatable: {largest_allocatable/1e9:.2f} GB")
    print(f"Fragmentation ratio: {fragmentation_ratio:.1%}")
    if fragmentation_ratio > threshold:
        print("WARNING: High fragmentation detected!")
        print("Consider: torch.cuda.empty_cache() or restructuring allocations")
    return fragmentation_ratio
Typical Memory Fragmentation Sources (%)
Memory-Efficient Patterns
# Pattern 1: Explicit deletion of large tensors
def process_batch(model, batch):
    """Run one forward/backward pass, dropping tensor references early.

    ``criterion`` is looked up in the enclosing module scope, and the labels
    are read from ``batch.labels``. Nothing is returned.
    """
    predictions = model(batch)
    batch_loss = criterion(predictions, batch.labels)
    # Drop this function's reference to the forward output before backward.
    del predictions
    batch_loss.backward()
    # Drop the loss reference once backward has run.
    del batch_loss
    # Release unoccupied cached memory held by the caching allocator.
    torch.cuda.empty_cache()
# Pattern 2: Context manager for temporary memory
import warnings
from contextlib import contextmanager


@contextmanager
def temporary_memory_scope():
    """Ensure temporary allocations are freed.

    On exit the cached memory is released and a warning is emitted if more
    than 1 MB (1e6 bytes) is still allocated compared with the amount on
    entry. When CUDA is unavailable the scope does nothing besides running
    the wrapped code.
    """
    if not torch.cuda.is_available():
        # No device to measure; just run the body (exceptions still propagate).
        yield
        return

    initial = torch.cuda.memory_allocated()
    try:
        yield
    finally:
        # Wait for pending work before measuring, then release unused cached
        # blocks so the comparison reflects tensors that are still referenced.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        final = torch.cuda.memory_allocated()
        if final > initial + 1e6:
            warnings.warn(f"Memory not fully freed: {(final-initial)/1e6:.1f} MB retained")
# Pattern 3: Pre-allocate reusable buffers
class BufferPool:
    """Reusable buffer pool to avoid fragmentation.

    One tensor is allocated up front and handed out in consecutive slices.
    Returned tensors are views of the shared buffer, so views handed out
    before ``reset()`` alias the memory handed out after it.
    """

    def __init__(self, max_size, dtype=torch.float16, device='cuda'):
        """Allocate the backing buffer of ``max_size`` elements."""
        self.buffer = torch.empty(max_size, dtype=dtype, device=device)
        # Offset (in elements) of the first unused position in the buffer.
        self.allocated = 0

    def allocate(self, size):
        """Return a view of the next ``size`` elements of the buffer.

        Raises:
            ValueError: If ``size`` is negative.
            RuntimeError: If fewer than ``size`` elements remain.
        """
        if size < 0:
            # A negative size would slice from the end of the buffer and move
            # the cursor backwards, silently corrupting the pool state.
            raise ValueError(f"size must be non-negative, got {size}")
        end = self.allocated + size
        if end > len(self.buffer):
            raise RuntimeError("Buffer pool exhausted")
        tensor = self.buffer[self.allocated:end]
        self.allocated = end
        return tensor

    def reset(self):
        """Make the whole buffer available again without freeing it."""
        self.allocated = 0
Debugging OOM Errors
def _batch_fits(model, batch_size, input_shape):
    """Return True if a no-grad forward pass of ``batch_size`` samples avoids OOM."""
    # Running the attempt in its own function means the dummy batch and the
    # output are released when it returns, including after a failed attempt.
    try:
        batch = torch.randn(batch_size, *input_shape, device='cuda')
        with torch.no_grad():
            model(batch)
        torch.cuda.synchronize()
    except RuntimeError as e:
        if "out of memory" in str(e):
            return False
        raise
    return True


def debug_oom(model, batch_size_range=(1, 64), input_shape=None):
    """Binary search for maximum batch size.

    Args:
        model: Callable applied to a random CUDA tensor of shape
            ``(batch_size, *input_shape)`` under ``torch.no_grad()``.
        batch_size_range: Inclusive ``(low, high)`` search bounds.
        input_shape: Per-sample shape. When omitted, a module-level
            ``input_shape`` variable is used for backward compatibility.

    Returns:
        ``(max_working, per_sample)``: the largest batch size that ran
        without an out-of-memory error and the peak bytes allocated above the
        pre-forward baseline divided by that batch size. ``(0, 0.0)`` is
        returned when no batch size in the range fits.

    Raises:
        ValueError: If ``input_shape`` is neither passed nor defined at
            module level.
        RuntimeError: Any runtime error other than out-of-memory.
    """
    if input_shape is None:
        try:
            input_shape = globals()['input_shape']
        except KeyError:
            raise ValueError(
                "input_shape must be passed or defined at module level"
            ) from None

    low, high = batch_size_range
    max_working = 0
    while low <= high:
        mid = (low + high) // 2
        torch.cuda.empty_cache()
        if _batch_fits(model, mid, input_shape):
            max_working = mid
            low = mid + 1
            print(f"Batch size {mid}: OK")
        else:
            high = mid - 1
            print(f"Batch size {mid}: OOM")
    print(f"\nMaximum batch size: {max_working}")

    if max_working == 0:
        # Nothing fit (or the range was empty): there is no batch to measure
        # and dividing by zero below must be avoided.
        return 0, 0.0

    # Estimate memory per sample
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    # Memory already resident (e.g. model weights) is not a per-sample cost.
    baseline = torch.cuda.memory_allocated()
    batch = torch.randn(max_working, *input_shape, device='cuda')
    with torch.no_grad():
        output = model(batch)
    torch.cuda.synchronize()
    peak = torch.cuda.max_memory_allocated()
    del batch, output
    per_sample = (peak - baseline) / max_working
    print(f"Memory per sample: {per_sample/1e6:.1f} MB")
    return max_working, per_sample
Memory Debugging Commands Cheat Sheet
| Command | What It Shows | When to Use |
|---|---|---|
| memory_allocated() | Currently used | Track usage over time |
| memory_reserved() | Pool size | Check fragmentation |
| max_memory_allocated() | Peak usage | Size batch appropriately |
| memory_stats() | Detailed breakdown | Debug allocation patterns |
| empty_cache() | Free unused | Before large allocations |
Conclusion
Effective GPU memory debugging requires:
- Baseline tracking: Know your model’s expected memory footprint
- Snapshot analysis: Identify largest allocations and their sources
- Fragmentation monitoring: Detect when the memory pool becomes fragmented
- Leak detection: Ensure memory is freed after each batch
- Structured allocation: Pre-allocate buffers, delete explicitly
Most OOM errors stem from fragmentation or retained references, not actual memory exhaustion.