“CUDA out of memory” is the most common error in deep learning. But nvidia-smi only shows total usage—not what’s consuming memory or why it’s fragmented. Here’s how to actually diagnose GPU memory issues.

PyTorch Memory Statistics

import torch

def print_memory_stats():
    """Print a summary of the CUDA caching-allocator statistics.

    Reads ``torch.cuda.memory_stats()`` and prints current usage,
    allocation counts and fragmentation indicators. Returns ``None``
    without printing anything when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return

    stats = torch.cuda.memory_stats()

    def stat(key):
        # The stats dict may not contain every key (for example before the
        # allocator has been used), so default to 0 instead of raising
        # KeyError.
        return stats.get(key, 0)

    allocated = stat('allocated_bytes.all.current')
    reserved = stat('reserved_bytes.all.current')

    print("=== Memory Usage ===")
    print(f"Allocated: {allocated / 1e9:.2f} GB")
    print(f"Reserved:  {reserved / 1e9:.2f} GB")
    print(f"Free (in reserved): {(reserved - allocated) / 1e9:.2f} GB")

    print("\n=== Allocation Counts ===")
    print(f"Active allocations: {stat('active.all.current')}")
    print(f"Allocation requests: {stat('allocation.all.current')}")

    print("\n=== Fragmentation ===")
    print(f"Inactive split blocks: {stat('inactive_split_bytes.all.current') / 1e6:.1f} MB")
    # 'segment.all.current' is the segment count; 'num_alloc_retries' (which
    # was previously printed under this label) counts failed allocations that
    # were retried after flushing the cache, so it is reported separately.
    print(f"Number of segments: {stat('segment.all.current')}")
    print(f"Allocation retries: {stat('num_alloc_retries')}")

# Usage: prints the report for the GPU, or nothing at all when CUDA is unavailable
print_memory_stats()

Memory Snapshot Analysis

PyTorch can record memory snapshots for detailed analysis:

import torch
from torch.cuda import memory

def capture_memory_snapshot(filename="memory_snapshot.pickle"):
    """Record allocator history, then save and return a memory snapshot.

    Args:
        filename: Path of the pickle file the snapshot is written to.

    Returns:
        The snapshot object returned by ``torch.cuda.memory._snapshot()``.

    Note:
        The ``_record_memory_history`` / ``_snapshot`` functions are
        underscore-prefixed (private) PyTorch APIs.
    """
    import pickle

    # Start recording
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )

    try:
        # Run your model/code here
        # ...

        # Save snapshot
        snapshot = torch.cuda.memory._snapshot()

        with open(filename, 'wb') as f:
            pickle.dump(snapshot, f)
    finally:
        # Stop recording even when the workload or the file write fails, so
        # history tracking is not left switched on for the rest of the
        # process.
        torch.cuda.memory._record_memory_history(enabled=None)

    return snapshot

def analyze_snapshot(snapshot):
    """Print the ten call sites holding the most live memory in a snapshot.

    Walks every block of every segment in ``snapshot['segments']``, keeps
    the blocks whose state is ``'active_allocated'`` and that carry a
    non-empty ``'frames'`` list, and groups them by their first three
    frames (rendered as ``filename:line``). Groups are printed largest
    first with their total size in MB and their block count.
    """
    totals_by_site = {}

    for segment in snapshot['segments']:
        for blk in segment['blocks']:
            if blk['state'] != 'active_allocated':
                continue

            # Blocks without a recorded stack trace cannot be attributed.
            trace = blk.get('frames', [])
            if not trace:
                continue

            site_key = tuple(
                entry['filename'] + ':' + str(entry['line'])
                for entry in trace[:3]
            )
            tally = totals_by_site.setdefault(site_key, {'count': 0, 'size': 0})
            tally['count'] += 1
            tally['size'] += blk['size']

    # Largest total size first
    ranked = sorted(
        totals_by_site.items(),
        key=lambda item: item[1]['size'],
        reverse=True,
    )

    print("Top memory allocations by source:")
    for site_key, tally in ranked[:10]:
        megabytes = tally['size'] / 1e6
        print(f"  {megabytes:.1f} MB ({tally['count']} allocs)")
        for location in site_key:
            print(f"    {location}")
💡 Visualization

Use torch.cuda.memory._dump_snapshot("memory_snapshot.pickle") to write the snapshot to a file, then drag that file into the viewer at https://pytorch.org/memory_viz for interactive exploration.

Finding Memory Leaks

class MemoryTracker:
    """Record CUDA memory readings to spot growth between checkpoints."""

    def __init__(self):
        # List of dicts appended by snapshot(); baseline is a byte count
        # set by mark_baseline(), or None until then.
        self.snapshots = []
        self.baseline = None

    def mark_baseline(self):
        """Store the currently allocated byte count as the reference point."""
        torch.cuda.synchronize()
        self.baseline = torch.cuda.memory_allocated()

    def check_leak(self, label=""):
        """Report growth of more than 1 MB over the baseline.

        Prints a message tagged with ``label`` and returns True when the
        allocated byte count exceeds the baseline by more than 1e6 bytes.
        Returns False otherwise, including when no baseline has been set.
        """
        torch.cuda.synchronize()
        now_allocated = torch.cuda.memory_allocated()

        if self.baseline is None:
            return False

        growth = now_allocated - self.baseline
        if growth <= 1e6:  # at most 1MB increase: not reported
            return False

        print(f"[{label}] Memory leak detected: +{growth/1e6:.1f} MB")
        return True

    def snapshot(self, label=""):
        """Append the current allocated/reserved byte counts to ``snapshots``."""
        torch.cuda.synchronize()
        allocated_now = torch.cuda.memory_allocated()
        reserved_now = torch.cuda.memory_reserved()
        reading = {
            'label': label,
            'allocated': allocated_now,
            'reserved': reserved_now,
        }
        self.snapshots.append(reading)

# Usage pattern
# NOTE: `dataloader`, `model`, `criterion`, `optimizer` and `target` are
# expected to be provided by the surrounding training script.
tracker = MemoryTracker()

# enumerate() supplies the batch index used in the label below; the loop
# previously referenced an undefined `i`.
for i, batch in enumerate(dataloader):
    output = model(batch)  # Process
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i == 0:
        # Take the baseline after the first full step rather than before the
        # loop: memory that the first step allocates and legitimately keeps
        # (typically optimizer state) would otherwise be reported as a leak
        # on every batch.
        tracker.mark_baseline()
    else:
        tracker.check_leak(f"Batch {i}")

Fragmentation Detection

def check_fragmentation():
    """Estimate fragmentation of the cached (reserved but unused) CUDA memory.

    The ratio compares the largest single free block inside PyTorch's
    reserved memory with the total free space in that reserve:
    ``1 - largest_free_block / (reserved - allocated)``. 0 means the free
    space is one contiguous block (or there is no free cached space at
    all); values near 1 mean it is scattered across many small blocks.

    Prints the figures, plus a warning when the ratio exceeds 0.3.

    Returns:
        float: Fragmentation ratio, clamped to ``[0, 1]``.
    """
    allocated = torch.cuda.memory_allocated()
    reserved = torch.cuda.memory_reserved()
    free_space = reserved - allocated

    largest_allocatable = 0
    fragmentation_ratio = 0.0

    if free_space > 0:
        # Read the allocator's own bookkeeping instead of probing with trial
        # allocations: a trial torch.empty() can be served by fresh memory
        # from the driver, so its success says nothing about the cached pool
        # (and it grows the pool as a side effect).
        largest_allocatable = max(
            (
                block['size']
                for segment in torch.cuda.memory_snapshot()
                for block in segment['blocks']
                if block['state'] == 'inactive'
            ),
            default=0,
        )
        # Clamp so accounting differences between the two sources can never
        # produce a ratio outside [0, 1].
        fragmentation_ratio = min(1.0, max(0.0, 1 - largest_allocatable / free_space))

    print(f"Free space: {free_space/1e9:.2f} GB")
    print(f"Largest allocatable: {largest_allocatable/1e9:.2f} GB")
    print(f"Fragmentation ratio: {fragmentation_ratio:.1%}")

    if fragmentation_ratio > 0.3:
        print("WARNING: High fragmentation detected!")
        print("Consider: torch.cuda.empty_cache() or restructuring allocations")

    return fragmentation_ratio

Typical Memory Fragmentation Sources

Share of fragmentation by source (%):
- Variable batch sizes: 35%
- Gradient checkpointing: 25%
- Dynamic tensor shapes: 20%
- Temporary buffers: 15%
- Other: 5%

Memory-Efficient Patterns

# Pattern 1: Explicit deletion of large tensors
def process_batch(model, batch, *, release_cache=True):
    """Run one forward/backward pass, dropping tensor references early.

    Args:
        model: Callable applied to ``batch`` to produce the output.
        batch: Input passed to ``model``; its ``labels`` attribute is
            passed to ``criterion`` as the target.
        release_cache: When True (the default, matching the original
            behaviour) call ``torch.cuda.empty_cache()`` at the end. Pass
            False in a steady-state training loop to skip the per-batch
            cache flush.

    Relies on a ``criterion`` callable defined by the surrounding script.
    """
    output = model(batch)
    loss = criterion(output, batch.labels)

    # Drop this function's reference to the output. NOTE(review): anything
    # the autograd graph still needs for backward presumably stays alive
    # until loss.backward() has run.
    del output

    loss.backward()

    # Drop the reference to the loss tensor. This does not clear the
    # gradients that backward() stored on the parameters.
    del loss

    if release_cache:
        # Hands unused cached blocks back to the CUDA driver. It does not
        # free memory that is still referenced, and PyTorch then has to
        # request memory from the driver again on the next batch.
        torch.cuda.empty_cache()

# Pattern 2: Context manager for temporary memory
import warnings
from contextlib import contextmanager


@contextmanager
def temporary_memory_scope():
    """Warn when a scope exits with noticeably more CUDA memory allocated.

    Records the allocated byte count on entry. On exit (normal or via an
    exception) it synchronizes, empties the allocator cache, and emits a
    warning if more than 1e6 bytes beyond the entry value are still
    allocated. It does not free tensors itself; it only reports memory
    that is still held.

    When CUDA is unavailable the scope does nothing.
    """
    if not torch.cuda.is_available():
        # Nothing to measure, and torch.cuda.synchronize() would fail.
        yield
        return

    initial = torch.cuda.memory_allocated()
    try:
        yield
    finally:
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        final = torch.cuda.memory_allocated()
        if final > initial + 1e6:
            warnings.warn(f"Memory not fully freed: {(final-initial)/1e6:.1f} MB retained")

# Pattern 3: Pre-allocate reusable buffers
class BufferPool:
    """Hand out consecutive slices of one pre-allocated tensor.

    A single buffer is created up front and ``allocate`` returns slices of
    it one after another, so no new tensor storage is requested per call.
    The returned tensors are slices of ``self.buffer`` and share its
    memory.
    """

    def __init__(self, max_size, dtype=torch.float16, device='cuda'):
        """Allocate the backing buffer.

        Args:
            max_size: Size passed to ``torch.empty`` for the buffer.
            dtype: Element type of the buffer.
            device: Device the buffer lives on.
        """
        self.buffer = torch.empty(max_size, dtype=dtype, device=device)
        # Offset of the first element not yet handed out.
        self.allocated = 0

    def allocate(self, size):
        """Return the next ``size`` elements of the buffer.

        Raises:
            ValueError: If ``size`` is negative.
            RuntimeError: If fewer than ``size`` elements remain.
        """
        if size < 0:
            # A negative size would move the offset backwards and make later
            # allocations overlap slices that are still in use.
            raise ValueError(f"size must be non-negative, got {size}")
        if self.allocated + size > len(self.buffer):
            raise RuntimeError("Buffer pool exhausted")
        start = self.allocated
        self.allocated = start + size
        return self.buffer[start:self.allocated]

    def reset(self):
        """Make the whole buffer available again.

        Slices handed out earlier are not invalidated; allocations made
        after a reset start at offset 0 again and will overlap them.
        """
        self.allocated = 0

Debugging OOM Errors

def debug_oom(model, batch_size_range=(1, 64), input_shape=None):
    """Binary-search the largest batch size whose forward pass fits on the GPU.

    Each candidate batch is random data of shape ``(n, *input_shape)`` run
    through ``model`` under ``torch.no_grad()``, so the result describes a
    forward-only pass, not a training step.

    Args:
        model: Callable applied to the dummy batch.
        batch_size_range: Inclusive ``(low, high)`` bounds of the search.
        input_shape: Shape of one sample, without the batch dimension. When
            omitted, a module-level ``input_shape`` variable is used, which
            is what earlier versions of this function read.

    Returns:
        tuple: ``(max_working, per_sample)``. ``max_working`` is the largest
        batch size that ran (0 if none did). ``per_sample`` is the peak
        allocated bytes during a run at that size divided by the batch
        size, or ``None`` when no batch size fits. The peak counts all
        allocated memory, not only what the batch added.

    Raises:
        NameError: If ``input_shape`` is neither passed nor defined at
            module level.
        RuntimeError: Any runtime error that is not an out-of-memory error.
    """
    if input_shape is None:
        try:
            input_shape = globals()['input_shape']
        except KeyError:
            raise NameError(
                "input_shape is not defined: pass input_shape=... to debug_oom()"
            ) from None

    low, high = batch_size_range
    max_working = 0

    while low <= high:
        mid = (low + high) // 2
        torch.cuda.empty_cache()

        batch = output = None
        try:
            # Create dummy batch
            batch = torch.randn(mid, *input_shape, device='cuda')

            with torch.no_grad():
                output = model(batch)

            torch.cuda.synchronize()
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            high = mid - 1
            print(f"Batch size {mid}: OOM")
        else:
            max_working = mid
            low = mid + 1
            print(f"Batch size {mid}: OK")
        finally:
            # Drop the tensors on the failure path too, so the next attempt
            # does not start with the previous batch still held.
            batch = output = None

    print(f"\nMaximum batch size: {max_working}")

    if max_working == 0:
        # Nothing fit (or the range was empty): there is no run to measure,
        # and dividing by the batch size would raise ZeroDivisionError.
        return max_working, None

    # Estimate memory per sample
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    batch = torch.randn(max_working, *input_shape, device='cuda')
    with torch.no_grad():
        output = model(batch)

    peak = torch.cuda.max_memory_allocated()
    del batch, output

    per_sample = peak / max_working
    print(f"Memory per sample: {per_sample/1e6:.1f} MB")

    return max_working, per_sample
📊

Memory Debugging Commands Cheat Sheet

Command | What It Shows | When to Use
memory_allocated() | Currently used | Track usage over time
memory_reserved() | Pool size | Check fragmentation
max_memory_allocated() | Peak usage | Size batch appropriately
memory_stats() | Detailed breakdown | Debug allocation patterns
empty_cache() | Free unused | Before large allocations

Conclusion

Effective GPU memory debugging requires:

  1. Baseline tracking: Know your model’s expected memory footprint
  2. Snapshot analysis: Identify largest allocations and their sources
  3. Fragmentation monitoring: Detect when memory pool becomes fragmented
  4. Leak detection: Ensure memory is freed after each batch
  5. Structured allocation: Pre-allocate buffers, delete explicitly

Most OOM errors stem from fragmentation or retained references, not actual memory exhaustion.