GPU Memory Profiling: Finding Leaks and Fragmentation

“CUDA out of memory” is the most common error in deep learning. But nvidia-smi only shows total usage—not what’s consuming memory or why it’s fragmented. Here’s how to actually diagnose GPU memory issues.

PyTorch Memory Statistics

import torch

def print_memory_stats():
    """Print detailed GPU memory statistics for the current CUDA device.

    Reads the caching-allocator counters from ``torch.cuda.memory_stats()``
    and prints usage, allocation counts and fragmentation indicators.
    Does nothing (returns ``None``) when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    stats = torch.cuda.memory_stats()

    def stat(key):
        # memory_stats() returns an empty dict until CUDA has been
        # initialized, so fall back to 0 instead of raising KeyError.
        return stats.get(key, 0)

    allocated = stat('allocated_bytes.all.current')
    reserved = stat('reserved_bytes.all.current')

    print(f"=== Memory Usage ===")
    print(f"Allocated: {allocated / 1e9:.2f} GB")
    print(f"Reserved:  {reserved / 1e9:.2f} GB")
    print(f"Free (in reserved): {(reserved - allocated) / 1e9:.2f} GB")

    print(f"\n=== Allocation Counts ===")
    print(f"Active allocations: {stat('active.all.current')}")
    print(f"Allocation requests: {stat('allocation.all.current')}")

    print(f"\n=== Fragmentation ===")
    print(f"Inactive split blocks: {stat('inactive_split_bytes.all.current') / 1e6:.1f} MB")
    # 'segment.all.current' is the number of segments reserved via cudaMalloc;
    # 'num_alloc_retries' (previously printed under this label) counts failed
    # cudaMalloc calls that forced a cache flush and retry.
    print(f"Number of segments: {stat('segment.all.current')}")
    print(f"Allocation retries: {stat('num_alloc_retries')}")

# Usage
print_memory_stats()

Memory Snapshot Analysis

PyTorch can record memory snapshots for detailed analysis:

import torch
from torch.cuda import memory

def capture_memory_snapshot(filename="memory_snapshot.pickle", workload=None):
    """Record allocator history, take a memory snapshot and pickle it.

    Args:
        filename: Path the pickled snapshot is written to.
        workload: Optional zero-argument callable to run while history
            recording is active (e.g. one training step). When ``None``
            nothing is run between enabling recording and taking the
            snapshot, exactly as in the original placeholder version.

    Returns:
        The snapshot object returned by ``torch.cuda.memory._snapshot()``.
    """
    import pickle

    # Start recording
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )

    try:
        # Run your model/code here (or pass it in as ``workload``)
        if workload is not None:
            workload()

        snapshot = torch.cuda.memory._snapshot()
    finally:
        # Always stop recording, even if the workload or the snapshot call
        # raises, so history tracking is not left enabled for the rest of
        # the process.
        torch.cuda.memory._record_memory_history(enabled=None)

    # Save snapshot
    with open(filename, 'wb') as f:
        pickle.dump(snapshot, f)

    return snapshot

def analyze_snapshot(snapshot, top_n=10, stack_depth=3):
    """Summarise live allocations in a memory snapshot by allocation site.

    Walks every block of every segment, keeps the blocks whose state is
    ``'active_allocated'`` and groups them by the first ``stack_depth``
    frames of their recorded stack trace.

    Args:
        snapshot: Mapping with a ``'segments'`` list; each segment has a
            ``'blocks'`` list whose entries carry ``'state'``, ``'size'``
            and optionally ``'frames'`` (dicts with ``'filename'`` and
            ``'line'``).
        top_n: Number of allocation sites to print.
        stack_depth: Number of leading stack frames used as the grouping key.

    Returns:
        List of ``(site, info)`` pairs sorted by total size, largest first.
        ``site`` is a tuple of ``"filename:line"`` strings and ``info`` is
        ``{'count': int, 'size': int}``. The list is not truncated to
        ``top_n``; only the printed report is.
    """
    allocation_sites = {}
    untraced_count = 0
    untraced_bytes = 0

    for seg in snapshot['segments']:
        for block in seg['blocks']:
            if block['state'] != 'active_allocated':
                continue

            frames = block.get('frames') or []
            if not frames:
                # No stack trace on this block; count it so the report
                # does not silently under-state live memory.
                untraced_count += 1
                untraced_bytes += block['size']
                continue

            key = tuple(f"{f['filename']}:{f['line']}" for f in frames[:stack_depth])
            site_info = allocation_sites.setdefault(key, {'count': 0, 'size': 0})
            site_info['count'] += 1
            site_info['size'] += block['size']

    # Largest total size first
    sorted_sites = sorted(allocation_sites.items(), key=lambda x: x[1]['size'], reverse=True)

    print("Top memory allocations by source:")
    for site, info in sorted_sites[:top_n]:
        print(f"  {info['size']/1e6:.1f} MB ({info['count']} allocs)")
        for frame in site:
            print(f"    {frame}")

    if untraced_count:
        print(f"  ({untraced_bytes/1e6:.1f} MB in {untraced_count} allocs without a stack trace)")

    return sorted_sites

💡 Visualization

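Call torch.cuda.memory._dump_snapshot("memory_snapshot.pickle") while history recording is enabled to write the snapshot to a pickle file, then drag that file into https://pytorch.org/memory_viz for interactive exploration.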

Finding Memory Leaks

class MemoryTracker:
    """Track allocated GPU memory against a baseline to find leaks.

    Args:
        threshold_bytes: Growth over the baseline (in bytes) above which
            ``check_leak`` reports a leak. Defaults to 1 MB (``1e6``).
    """

    def __init__(self, threshold_bytes=1e6):
        self.baseline = None  # bytes allocated when mark_baseline() ran
        self.snapshots = []   # dicts appended by snapshot()
        self.threshold_bytes = threshold_bytes

    def mark_baseline(self):
        """Record the currently allocated bytes as the baseline."""
        torch.cuda.synchronize()
        self.baseline = torch.cuda.memory_allocated()

    def check_leak(self, label=""):
        """Report whether allocated memory grew past the threshold.

        Args:
            label: Text included in the printed message.

        Returns:
            ``True`` (after printing a message) when the allocated bytes
            exceed the baseline by more than ``threshold_bytes``; ``False``
            otherwise, including when no baseline has been marked yet.
        """
        torch.cuda.synchronize()
        current = torch.cuda.memory_allocated()

        if self.baseline is None:
            return False

        delta = current - self.baseline
        if delta > self.threshold_bytes:
            print(f"[{label}] Memory leak detected: +{delta/1e6:.1f} MB")
            return True
        return False

    def snapshot(self, label=""):
        """Append the current allocated/reserved byte counts to ``snapshots``."""
        torch.cuda.synchronize()
        self.snapshots.append({
            'label': label,
            'allocated': torch.cuda.memory_allocated(),
            'reserved': torch.cuda.memory_reserved(),
        })

# Usage pattern
# NOTE(review): assumes `dataloader` yields (inputs, target) pairs and that
# `model`, `criterion` and `optimizer` are already defined; adapt to your setup.
tracker = MemoryTracker()

for i, (inputs, target) in enumerate(dataloader):
    output = model(inputs)  # Process
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i == 0:
        # Take the baseline after the first full step: many optimizers
        # (e.g. Adam) create their state tensors on the first step(), and
        # that one-time growth would otherwise be reported as a leak.
        tracker.mark_baseline()
    else:
        tracker.check_leak(f"Batch {i}")

Fragmentation Detection

def check_fragmentation():
    """Estimate fragmentation of PyTorch's cached (reserved) GPU memory.

    Inspects ``torch.cuda.memory_snapshot()`` for the current device and
    compares the largest free (``'inactive'``) block with the total free
    bytes inside the reserved pool. This replaces the earlier
    probe-allocation approach: a probe ``torch.empty`` can be satisfied by a
    brand-new ``cudaMalloc`` segment, so it measured free driver memory
    rather than pool fragmentation, could produce a negative ratio, and
    grew the pool as a side effect.

    Returns:
        Fragmentation ratio in ``[0, 1]``: ``1 - largest_free_block /
        total_free_in_pool``. ``0.0`` when CUDA is unavailable or the pool
        has no free blocks (nothing to fragment).
    """
    if not torch.cuda.is_available():
        return 0.0

    device = torch.cuda.current_device()

    free_space = 0           # total bytes in free blocks inside the pool
    largest_allocatable = 0  # largest single contiguous free block

    # The snapshot covers all devices; keep only the current one.
    for segment in torch.cuda.memory_snapshot():
        if segment.get('device', device) != device:
            continue
        for block in segment['blocks']:
            if block['state'] == 'inactive':
                free_space += block['size']
                largest_allocatable = max(largest_allocatable, block['size'])

    fragmentation_ratio = 1 - (largest_allocatable / free_space) if free_space > 0 else 0.0

    print(f"Free space: {free_space/1e9:.2f} GB")
    print(f"Largest allocatable: {largest_allocatable/1e9:.2f} GB")
    print(f"Fragmentation ratio: {fragmentation_ratio:.1%}")

    if fragmentation_ratio > 0.3:
        print("WARNING: High fragmentation detected!")
        print("Consider: torch.cuda.empty_cache() or restructuring allocations")

    return fragmentation_ratio

Typical Memory Fragmentation Sources

(%)

Variable batch sizes

35 %

Gradient checkpointing

25 %

Dynamic tensor shapes

20 %

Temporary buffers

15 %

Other

5 %

Memory-Efficient Patterns

# Pattern 1: Explicit deletion of large tensors
def process_batch(model, batch, release_cache=True):
    """Run forward and backward for one batch, dropping references early.

    Relies on a module-level ``criterion`` and on ``batch.labels``.

    Args:
        model: Callable applied to ``batch`` to produce the output.
        batch: Input object; must expose a ``labels`` attribute.
        release_cache: When ``True`` (the default, matching the original
            behavior) call ``torch.cuda.empty_cache()`` at the end. Pass
            ``False`` in a training loop: flushing the cache on every batch
            forces the allocator to request the memory from the driver
            again on the next step.
    """
    output = model(batch)
    loss = criterion(output, batch.labels)

    # Drop our own reference to the output. Whatever autograd saved for the
    # backward pass stays alive through the graph attached to `loss`.
    del output

    loss.backward()

    # Drop the loss (and with it the last handle on this step's graph).
    # This does not touch gradients; those live on the parameters.
    del loss
    if release_cache:
        # Releases unused cached blocks back to the CUDA driver. It does not
        # free memory that live tensors still occupy.
        torch.cuda.empty_cache()

# Pattern 2: Context manager for temporary memory
import warnings
from contextlib import contextmanager


@contextmanager
def temporary_memory_scope():
    """Warn when a scope leaves more GPU memory allocated than it started with.

    Records ``torch.cuda.memory_allocated()`` on entry. On exit (also when
    the body raises) it synchronizes, empties the allocator cache and
    emits a warning if allocated memory grew by more than 1 MB.
    """
    initial = torch.cuda.memory_allocated()
    try:
        yield
    finally:
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        final = torch.cuda.memory_allocated()
        if final > initial + 1e6:
            warnings.warn(f"Memory not fully freed: {(final-initial)/1e6:.1f} MB retained")

# Pattern 3: Pre-allocate reusable buffers
class BufferPool:
    """Reusable bump-allocated buffer pool to avoid fragmentation.

    One backing tensor is allocated up front; ``allocate`` hands out
    consecutive slices (views) of it and ``reset`` makes the whole buffer
    available again. Views handed out before a ``reset`` alias the same
    storage as views handed out afterwards.

    Args:
        max_size: Number of elements in the backing buffer.
        dtype: Element type of the backing buffer.
        device: Device the backing buffer is allocated on.
    """

    def __init__(self, max_size, dtype=torch.float16, device='cuda'):
        self.buffer = torch.empty(max_size, dtype=dtype, device=device)
        self.allocated = 0  # number of elements handed out since last reset

    def allocate(self, size):
        """Return a view of the next ``size`` elements of the buffer.

        Raises:
            ValueError: If ``size`` is negative.
            RuntimeError: If fewer than ``size`` elements remain.
        """
        if size < 0:
            # A negative size would move the cursor backwards and hand out
            # memory that overlaps earlier allocations.
            raise ValueError(f"size must be non-negative, got {size}")
        end = self.allocated + size
        if end > len(self.buffer):
            raise RuntimeError("Buffer pool exhausted")
        tensor = self.buffer[self.allocated:end]
        self.allocated = end
        return tensor

    def reset(self):
        """Mark the whole buffer as free again (existing views stay valid)."""
        self.allocated = 0

Debugging OOM Errors

def debug_oom(model, batch_size_range=(1, 64), input_shape=None):
    """Binary search for the largest batch size that runs without CUDA OOM.

    Runs ``model`` under ``torch.no_grad()`` on random input of shape
    ``(batch_size, *input_shape)`` on the ``'cuda'`` device, so the result
    reflects inference memory only (no gradients or optimizer state).

    Args:
        model: Callable taking a single tensor.
        batch_size_range: Inclusive ``(low, high)`` bounds of the search.
        input_shape: Per-sample shape. When ``None``, a module-level
            ``input_shape`` is used if one exists (what the original
            version relied on).

    Returns:
        ``(max_working, per_sample)``: the largest batch size that fit and
        the peak bytes allocated per sample on top of what was already
        allocated before the forward pass. ``(0, 0.0)`` when no batch size
        in the range fit.

    Raises:
        ValueError: If no ``input_shape`` is available.
        RuntimeError: Any non-OOM error raised by the model is re-raised.
    """
    if input_shape is None:
        input_shape = globals().get("input_shape")
        if input_shape is None:
            raise ValueError("input_shape must be provided")

    low, high = batch_size_range
    max_working = 0

    while low <= high:
        mid = (low + high) // 2
        torch.cuda.empty_cache()

        batch = output = None
        try:
            # Create dummy batch
            batch = torch.randn(mid, *input_shape, device='cuda')

            with torch.no_grad():
                output = model(batch)

            torch.cuda.synchronize()
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            fits = False
        else:
            fits = True
        finally:
            # Release the tensors on both paths; otherwise a batch from an
            # OOM iteration stays referenced while the next size is probed.
            del batch, output

        if fits:
            max_working = mid
            low = mid + 1
            print(f"Batch size {mid}: OK")
        else:
            high = mid - 1
            print(f"Batch size {mid}: OOM")

    print(f"\nMaximum batch size: {max_working}")

    if max_working == 0:
        # Nothing fit (or the range was empty): no per-sample figure exists
        # and dividing by zero below must be avoided.
        return 0, 0.0

    # Estimate memory per sample
    torch.cuda.empty_cache()
    # Memory already in use (e.g. model weights) is not per-sample cost.
    baseline = torch.cuda.memory_allocated()
    torch.cuda.reset_peak_memory_stats()

    batch = torch.randn(max_working, *input_shape, device='cuda')
    with torch.no_grad():
        output = model(batch)
    torch.cuda.synchronize()

    peak = torch.cuda.max_memory_allocated()
    del batch, output

    per_sample = (peak - baseline) / max_working
    print(f"Memory per sample: {per_sample/1e6:.1f} MB")

    return max_working, per_sample

📊

Memory Debugging Commands Cheat Sheet

Command	What It Shows	When to Use
memory_allocated()	Currently used	Track usage over time
memory_reserved()	Pool size	Check fragmentation
max_memory_allocated()	Peak usage	Size batch appropriately
memory_stats()	Detailed breakdown	Debug allocation patterns
empty_cache()	Free unused	Before large allocations

Conclusion

Effective GPU memory debugging requires:

Baseline tracking: Know your model’s expected memory footprint
Snapshot analysis: Identify largest allocations and their sources
Fragmentation monitoring: Detect when memory pool becomes fragmented
Leak detection: Ensure memory is freed after each batch
Structured allocation: Pre-allocate buffers, delete explicitly

Most OOM errors stem from fragmentation or retained references, not actual memory exhaustion.

PyTorch Memory Statistics

Memory Snapshot Analysis

Finding Memory Leaks

Fragmentation Detection

Typical Memory Fragmentation Sources

Memory-Efficient Patterns

Debugging OOM Errors

Memory Debugging Commands Cheat Sheet

Conclusion

Fridays with Faraday

Related Posts

Production LLM Profiling with eBPF: Beyond nvidia-smi

Attention Variants Compared: MHA, MQA, GQA, and MLA

CUDA Graphs for Inference: Eliminating CPU Launch Overhead