Let’s optimize a matrix transpose kernel from scratch, applying each optimization technique systematically and measuring the impact.

Naive Implementation

// Version 0: Naive transpose
// Each thread reads one element, writes one element
// Version 0: Naive transpose
// One thread per element: reads in[row][col], writes out[col][row].
// Grid/block: 2D launch covering an N x N float matrix.
__global__ void transpose_naive(float* out, const float* in, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard edge blocks when N is not a multiple of the block size.
    if (col >= N || row >= N) return;

    // Reads are coalesced (col varies across the warp);
    // writes are strided by N — the defect analyzed below.
    out[col * N + row] = in[row * N + col];
}

Problem: Non-coalesced writes. Adjacent threads write to non-adjacent memory locations.

Memory Coalescing Analysis

// Memory access pattern analysis
// Thread 0: reads  in[0], writes out[0]
// Thread 1: reads  in[1], writes out[N]   <- N elements apart!
// Thread 2: reads  in[2], writes out[2*N]
// ...

// Reads are coalesced (adjacent threads read adjacent addresses)
// Writes are strided (adjacent threads write N elements apart)
// Strided writes squander bandwidth: each memory transaction delivers only
// one useful 4-byte element per thread, so write efficiency collapses to a
// few percent of peak (measured ~5% below).
📊

Memory Access Patterns Impact

| Pattern                       | Achieved Bandwidth | Efficiency |
|-------------------------------|--------------------|------------|
| Coalesced read + write        | 1.5 TB/s           | 94%        |
| Coalesced read, strided write | 85 GB/s            | 5%         |
| Strided read + write          | 45 GB/s            | 3%         |

Note: A100-80GB, 4096×4096 matrix

Shared Memory Tile Optimization

// Version 1: Use shared memory to enable coalesced writes
#define TILE_DIM 32
#define BLOCK_ROWS 8

// Stages a TILE_DIM x TILE_DIM tile through shared memory so that both the
// global-memory read and the global-memory write are coalesced.
// Expects a ceil(N/TILE_DIM) x ceil(N/TILE_DIM) grid of
// TILE_DIM x BLOCK_ROWS thread blocks; each thread handles
// TILE_DIM / BLOCK_ROWS elements of the tile.
__global__ void transpose_shared(float* out, const float* in, int N) {
    __shared__ float tile[TILE_DIM][TILE_DIM];
    
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    
    // Coalesced read from global memory into shared memory
    // (threadIdx.x varies fastest across the warp, matching row order of `in`).
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < N && (y + j) < N) {
            tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * N + x];
        }
    }
    
    // All tile elements must be written before any thread reads them back.
    __syncthreads();
    
    // Coalesced write from shared memory to global memory
    // Swapping blockIdx.x/blockIdx.y targets the transposed tile position
    // while keeping threadIdx.x as the fast-varying (coalesced) dimension.
    x = blockIdx.y * TILE_DIM + threadIdx.x;  // Note: swapped x/y
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        // This guard is exactly the load guard of the source element that
        // tile[threadIdx.x][threadIdx.y + j] holds, so edge tiles never read
        // an unwritten shared-memory slot.
        if (x < N && (y + j) < N) {
            out[(y + j) * N + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}
10x Speedup

Shared memory tiling improves bandwidth utilization from 5% to 75%, achieving ~10x speedup.

Bank Conflict Elimination

Shared memory has 32 banks. Accessing same bank from multiple threads causes serialization:

// Version 2: Pad shared memory to avoid bank conflicts
#define TILE_DIM 32
#define TILE_DIM_PADDED 33  // +1 padding

// Version 2: shared-memory tiling + padding to eliminate bank conflicts.
// Same scheme as Version 1 (coalesced read and write via a staged tile),
// but the tile's inner dimension is padded to 33 floats so that a column
// access (tile[threadIdx.x][...]) hits 32 distinct banks instead of one.
// Expects a ceil(N/TILE_DIM) x ceil(N/TILE_DIM) grid of
// TILE_DIM x BLOCK_ROWS thread blocks.
__global__ void transpose_no_conflicts(float* out, const float* in, int N) {
    // Add padding to shift each row by one bank across the 32 banks.
    __shared__ float tile[TILE_DIM][TILE_DIM_PADDED];
    
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    
    // Coalesced load. Bounds guard restored from Version 1: without it,
    // any N not divisible by TILE_DIM reads out of bounds in edge tiles.
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < N && (y + j) < N) {
            tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * N + x];
        }
    }
    
    __syncthreads();
    
    // Swap block coordinates so the store side is also coalesced.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    
    // Store transposed - no bank conflicts due to padding. The guard mirrors
    // the load guard of the corresponding source element, so edge tiles never
    // write out of bounds or read an unwritten shared-memory slot.
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < N && (y + j) < N) {
            out[(y + j) * N + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}

Optimization Progression (4096×4096 FP32)

| Version             | Bandwidth (GB/s) |
|---------------------|------------------|
| Naive               | 85               |
| + Shared Memory     | 1,125            |
| + No Bank Conflicts | 1,380            |
| + Occupancy Tuning  | 1,520            |

Occupancy Analysis

# Analyze kernel occupancy
ncu --metrics sm__warps_active.avg.pct_of_peak_sustained_active \
    ./transpose_benchmark

# Check register and shared memory usage
ncu --metrics launch__registers_per_thread,\
              launch__shared_mem_per_block_static,\
              launch__shared_mem_per_block_dynamic \
    ./transpose_benchmark
// Tune block size for optimal occupancy
// A100 has 108 SMs, 64K registers per SM, 164KB shared memory per SM

// Version 3: Tuned block dimensions
// 32×8 threads = 256 threads per block
// Registers per thread: ~24
// Shared memory: 32×33×4 = 4.2KB per block
// Theoretical occupancy: 100% (2048 threads/SM)

Final Optimized Version

// Version 4: All optimizations combined
// - shared-memory tiling: coalesced global reads AND writes
// - +1 padding on the tile's inner dimension: no shared-memory bank conflicts
// - __restrict__ pointers: lets the compiler assume no aliasing and reorder loads
// - templated tile geometry + #pragma unroll: fully unrolled loops for ILP
// - bounds guards: correct for any N, not only multiples of TILE_DIM
//   (required — the ceil-div grid below launches partial edge tiles)
template<int TILE_DIM, int BLOCK_ROWS>
__global__ void transpose_optimized(float* __restrict__ out,
                                     const float* __restrict__ in,
                                     int N) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];  // +1 for bank conflict avoidance
    
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    
    // Coalesced read of one tile, BLOCK_ROWS rows per unrolled iteration.
    #pragma unroll
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (xIndex < N && (yIndex + j) < N) {
            tile[threadIdx.y + j][threadIdx.x] = in[(yIndex + j) * N + xIndex];
        }
    }
    
    __syncthreads();
    
    // Swap block coordinates: target the transposed tile while keeping
    // threadIdx.x as the fast-varying (coalesced) dimension on the write side.
    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    
    #pragma unroll
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        // Guard mirrors the load guard of the corresponding source element,
        // so edge tiles never read an unwritten shared-memory slot.
        if (xIndex < N && (yIndex + j) < N) {
            out[(yIndex + j) * N + xIndex] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}

// Launch configuration: ceil-div grid covers N not divisible by TILE_DIM.
dim3 dimGrid((N + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
dim3 dimBlock(TILE_DIM, BLOCK_ROWS);
transpose_optimized<32, 8><<<dimGrid, dimBlock>>>(d_out, d_in, N);
// Check for launch-configuration errors (kernel launches don't return them):
// cudaGetLastError() immediately after the <<<>>> launch.
📊

Final Performance Summary

| Version             | Bandwidth | Speedup | % Peak |
|---------------------|-----------|---------|--------|
| Naive               | 85 GB/s   | 1.0x    | 5%     |
| Shared Memory       | 1125 GB/s | 13.2x   | 70%    |
| + Bank Conflict Fix | 1380 GB/s | 16.2x   | 86%    |
| + Occupancy Tuning  | 1520 GB/s | 17.9x   | 95%    |

Note: A100-80GB, theoretical peak: 1600 GB/s for copy

Key Takeaways

  1. Memory coalescing is the single biggest factor (~10x)
  2. Shared memory enables reordering access patterns
  3. Bank conflict avoidance gains ~15-20%
  4. Occupancy tuning extracts final ~10%

For compute-bound kernels, focus shifts to instruction-level parallelism and reducing register pressure.