A transformer forward pass with serial execution spends 8 ms on H2D copy, 12 ms on compute, and 8 ms on D2H copy — 28 ms total. With proper stream pipelining, H2D and compute overlap, compute and D2H overlap, and the wall-clock time drops to 14 ms. That 2x speedup costs zero additional hardware and requires changing three lines of code to create streams and specify them in kernel launches. The GPU has three independent engines — SMs for compute, two DMA engines for bidirectional PCIe transfer — but the default stream forces them to run serially. Streams unlock the concurrency the hardware already provides.
All measurements target NVIDIA Ampere (A100-80GB SXM, SM 8.0) unless stated otherwise.
Stream Fundamentals
#include <cuda_runtime.h>
#include <cstdio>
void stream_basics() {
    // Demonstrates stream creation, ordering guarantees, and async copies.
    //
    // Ordering rules:
    //   - Operations enqueued in the SAME stream execute in FIFO order.
    //   - Operations in DIFFERENT streams may execute concurrently.

    // The default stream (stream 0 / NULL stream): all CUDA calls without an
    // explicit stream argument use this.
    cudaStream_t default_stream = 0;
    (void)default_stream;

    // Create explicit streams.
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    float *d_a, *d_b;
    cudaMalloc(&d_a, 1024 * sizeof(float));
    cudaMalloc(&d_b, 1024 * sizeof(float));

    // FIX: the host buffer must be PINNED for cudaMemcpyAsync to actually be
    // asynchronous. The original used a stack array (pageable memory), so the
    // "async" copies silently fell back to synchronous transfers — the exact
    // pitfall this article warns about later.
    float* h_data;
    cudaMallocHost(&h_data, 1024 * sizeof(float));

    // These two copies can overlap: different streams AND opposite PCIe
    // directions, so each can use a separate DMA engine.
    cudaMemcpyAsync(d_a, h_data, 1024 * sizeof(float),
                    cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(h_data, d_b, 1024 * sizeof(float),
                    cudaMemcpyDeviceToHost, stream2);

    // Block the host until all work queued in stream1 has finished.
    cudaStreamSynchronize(stream1);
    // FIX: also wait for stream2 — its D2H copy reads d_b and writes h_data,
    // so freeing those buffers before it completes would be a use-after-free.
    cudaStreamSynchronize(stream2);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFreeHost(h_data);
}
Default Stream Semantics
// The default stream has special synchronization behavior:
// LEGACY behavior (default): stream 0 synchronizes with ALL other streams
// PER-THREAD behavior: each host thread gets its own default stream
// Compile with --default-stream per-thread for per-thread default streams
// Or use cudaStreamPerThread constant
void default_stream_legacy() {
    // Legacy default-stream semantics: stream 0 acts as a device-wide barrier.
    cudaStream_t s1;
    cudaStreamCreate(&s1);

    float* d_ptr;
    cudaMalloc(&d_ptr, 1024 * sizeof(float));

    // Under legacy semantics a launch on the default stream waits for all
    // previously issued work in every other stream, and work issued to those
    // streams afterwards waits for it in turn — so these two serialize.
    kernel_a<<<1, 256>>>(d_ptr);        // default stream (legacy: blocking)
    kernel_b<<<1, 256, 0, s1>>>(d_ptr); // s1: serialized behind kernel_a

    cudaStreamDestroy(s1);
    cudaFree(d_ptr);
}
void default_stream_nonblocking() {
    // Opt out of the legacy barrier: a cudaStreamNonBlocking stream never
    // synchronizes implicitly with the default stream.
    cudaStream_t s1;
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);

    float* d_ptr;
    cudaMalloc(&d_ptr, 1024 * sizeof(float));

    // Because s1 is non-blocking, these two launches are free to overlap.
    kernel_a<<<1, 256>>>(d_ptr);        // default stream
    kernel_b<<<1, 256, 0, s1>>>(d_ptr); // s1: may run concurrently!

    cudaStreamDestroy(s1);
    cudaFree(d_ptr);
}
The legacy default stream (stream 0) implicitly synchronizes with all other streams. This is the most common cause of unintentional serialization. If you see kernels running serially in Nsight Systems despite being in different streams, check whether any operations use the default stream. Use cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) or compile with --default-stream per-thread.
CUDA Events: Timing and Synchronization
Events are markers in a stream’s execution timeline. They serve two purposes: timing and inter-stream synchronization.
#include <cuda_runtime.h>
#include <cstdio>
// Purpose 1: Timing
void event_timing() {
    // Measure kernel wall time by bracketing the launch with two events.
    // NOTE: grid, block, n, and my_kernel are assumed to be defined elsewhere.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    float* d_data;
    cudaMalloc(&d_data, 1 << 26);

    cudaEventRecord(start);                // enqueued in the default stream
    my_kernel<<<grid, block>>>(d_data, n); // kernel under test
    cudaEventRecord(stop);

    // Host blocks until the 'stop' event has actually fired on the GPU.
    cudaEventSynchronize(stop);

    float ms;
    cudaEventElapsedTime(&ms, start, stop); // GPU timestamps, millisecond units
    printf("Kernel time: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_data);
}
// Purpose 2: Inter-stream synchronization
void event_sync() {
    // Inter-stream dependency via an event: the transfer stream's copy is
    // gated (on-device, no host blocking) on the compute stream's kernel.
    // NOTE: N, grid, block, and compute_kernel are defined elsewhere.
    cudaStream_t stream_compute, stream_transfer;
    cudaStreamCreate(&stream_compute);
    cudaStreamCreate(&stream_transfer);

    cudaEvent_t data_ready;
    cudaEventCreate(&data_ready);

    float *d_data, *h_data;
    cudaMalloc(&d_data, N * sizeof(float));
    cudaMallocHost(&h_data, N * sizeof(float)); // pinned: required for async copy

    // Stream 1: compute, then mark completion with the event.
    compute_kernel<<<grid, block, 0, stream_compute>>>(d_data, N);
    cudaEventRecord(data_ready, stream_compute);

    // Stream 2: the wait is enqueued on the GPU — the host does not block here.
    cudaStreamWaitEvent(stream_transfer, data_ready);
    cudaMemcpyAsync(h_data, d_data, N * sizeof(float),
                    cudaMemcpyDeviceToHost, stream_transfer);
    cudaStreamSynchronize(stream_transfer);

    cudaEventDestroy(data_ready);
    cudaStreamDestroy(stream_compute);
    cudaStreamDestroy(stream_transfer);
    // FIX: the original leaked both buffers.
    cudaFree(d_data);
    cudaFreeHost(h_data);
}
Event Flags
// Default events record synchronization + timing
cudaEvent_t event_default;
cudaEventCreate(&event_default);
// Disable timing for lower overhead (use when you only need sync)
// Timing-enabled events force extra bookkeeping per record; sync-only events
// are cheaper and are the right choice for cudaStreamWaitEvent chains.
cudaEvent_t event_notiming;
cudaEventCreateWithFlags(&event_notiming, cudaEventDisableTiming);
// Blocking sync: cudaEventSynchronize will yield the CPU thread
// instead of busy-waiting (reduces CPU usage at cost of latency)
cudaEvent_t event_blocking;
cudaEventCreateWithFlags(&event_blocking, cudaEventBlockingSync);
// IPC event: can be shared across processes
// Note: cudaEventInterprocess must be combined with cudaEventDisableTiming,
// as done here.
cudaEvent_t event_ipc;
cudaEventCreateWithFlags(&event_ipc, cudaEventInterprocess | cudaEventDisableTiming);
Compute-Transfer Overlap: The Pipeline Pattern
The most impactful use of streams is overlapping data transfer with computation. This requires:
- Pinned (page-locked) host memory
- At least two streams
- Independent work in each stream
#include <cuda_runtime.h>
#include <cstdio>
// In-place elementwise transform: x -> x^2 + sqrt(|x|).
// Launch with any 1D grid covering n elements; the guard handles the tail.
__global__ void process_chunk(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float v = data[i];
    data[i] = v * v + sqrtf(fabsf(v));
}
// Splits the input into chunks and pipelines H2D -> compute -> D2H across
// double-buffered streams so transfers overlap with compute.
//
// Preconditions: h_input/h_output must be PINNED (cudaMallocHost), otherwise
// the async copies serialize; num_chunks >= 1.
//
// FIX: the original silently dropped total_n % num_chunks trailing elements
// whenever total_n was not evenly divisible; the final chunk now absorbs the
// remainder. Also guards against num_chunks <= 0 (division by zero).
void pipeline_overlap(float* h_input, float* h_output, int total_n,
                      int num_chunks) {
    if (total_n <= 0 || num_chunks <= 0) return;

    int base_chunk = total_n / num_chunks;
    int remainder = total_n % num_chunks;
    // The last chunk is the largest; size the device buffers for it.
    size_t max_bytes = (size_t)(base_chunk + remainder) * sizeof(float);

    // Double-buffering: chunk i uses buffer/stream (i % 2), so chunk i+1's
    // H2D copy can start while chunk i is still computing.
    const int NUM_BUFFERS = 2;
    float* d_buffer[NUM_BUFFERS];
    cudaStream_t streams[NUM_BUFFERS];
    for (int i = 0; i < NUM_BUFFERS; i++) {
        cudaMalloc(&d_buffer[i], max_bytes);
        cudaStreamCreate(&streams[i]);
    }

    int block_size = 256;
    for (int chunk = 0; chunk < num_chunks; chunk++) {
        int buf = chunk % NUM_BUFFERS;
        int offset = chunk * base_chunk;
        // Last chunk picks up the leftover elements.
        int this_n = (chunk == num_chunks - 1) ? base_chunk + remainder
                                               : base_chunk;
        size_t this_bytes = (size_t)this_n * sizeof(float);
        int grid_size = (this_n + block_size - 1) / block_size;

        // H2D copy for this chunk.
        cudaMemcpyAsync(d_buffer[buf], h_input + offset, this_bytes,
                        cudaMemcpyHostToDevice, streams[buf]);
        // Compute on this chunk (same stream => ordered after the copy).
        process_chunk<<<grid_size, block_size, 0, streams[buf]>>>(
            d_buffer[buf], this_n);
        // D2H copy of this chunk.
        cudaMemcpyAsync(h_output + offset, d_buffer[buf], this_bytes,
                        cudaMemcpyDeviceToHost, streams[buf]);
    }

    cudaDeviceSynchronize();
    for (int i = 0; i < NUM_BUFFERS; i++) {
        cudaStreamDestroy(streams[i]);
        cudaFree(d_buffer[i]);
    }
}
int main() {
    const int n = 1 << 24; // 16M elements
    float *h_in = nullptr, *h_out = nullptr;

    // Pinned host memory is mandatory for the async copies inside
    // pipeline_overlap to actually overlap with compute.
    cudaMallocHost(&h_in, n * sizeof(float));
    cudaMallocHost(&h_out, n * sizeof(float));

    for (int i = 0; i < n; ++i) {
        h_in[i] = (float)i;
    }

    pipeline_overlap(h_in, h_out, n, 8); // 8 chunks

    cudaFreeHost(h_in);
    cudaFreeHost(h_out);
    return 0;
}
Pipeline Stages: Serial vs 2-Stream vs 4-Stream (A100, 16M floats, PCIe Gen4)
| Approach | H2D (ms) | Compute (ms) | D2H (ms) | Total (ms) | Overlap |
|---|---|---|---|---|---|
| Serial | 5.2 | 3.1 | 5.2 | 13.5 | None |
| 2 streams | 5.2 | 3.1 | 5.2 | 7.8 | Partial |
| 4 streams | 5.2 | 3.1 | 5.2 | 6.4 | Good |
| 8 streams | 5.2 | 3.1 | 5.2 | 6.2 | Near optimal |
Total Execution Time by Number of Streams (ms)
cudaMemcpyAsync with pageable memory silently falls back to synchronous transfer. You MUST use cudaMallocHost or cudaHostAlloc for the host buffer. Without pinned memory, all streams serialize at the memcpy and you get zero overlap.
Concurrent Kernel Execution
Multiple kernels can execute concurrently on the same GPU if they are in different streams and the GPU has enough resources:
#include <cuda_runtime.h>
#include <cstdio>
// Doubles each element in place; guarded against the partial final block.
__global__ void kernel_small(float* data, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        data[i] = data[i] * 2.0f;
    }
}
void concurrent_kernels() {
    // Launch several small kernels in separate streams; with enough spare SM
    // resources they execute concurrently on the device.
    const int NUM_KERNELS = 4;
    const int N = 1 << 16; // small grids so all four kernels fit at once

    float* d_data[NUM_KERNELS];
    cudaStream_t streams[NUM_KERNELS];
    for (int i = 0; i < NUM_KERNELS; ++i) {
        cudaMalloc(&d_data[i], N * sizeof(float));
        cudaStreamCreate(&streams[i]);
    }

    const int block = 256;
    const int grid = (N + block - 1) / block;
    // One launch per stream — independent data, so no false dependencies.
    for (int i = 0; i < NUM_KERNELS; ++i) {
        kernel_small<<<grid, block, 0, streams[i]>>>(d_data[i], N);
    }
    cudaDeviceSynchronize();

    for (int i = 0; i < NUM_KERNELS; ++i) {
        cudaStreamDestroy(streams[i]);
        cudaFree(d_data[i]);
    }
}
When Kernels Actually Run Concurrently
Concurrent execution requires:
- Different streams (non-default, or created with cudaStreamNonBlocking)
- Sufficient SM resources (warps, registers, shared memory)
- No implicit serialization (default stream operations, same-direction memcpy)
// These will NOT run concurrently:
// (two reasons: the grid saturates every SM, and the first launch is on the
// legacy default stream, which serializes with all other streams anyway)
kernel_a<<<108 * 8, 256>>>(d_a, n); // Fills the entire GPU (108 SMs * 8 blocks)
kernel_b<<<108 * 8, 256, 0, stream1>>>(d_b, n); // No SMs available
// These WILL run concurrently:
// (both in non-default streams, and each grid leaves most SMs free)
kernel_a<<<4, 256, 0, stream1>>>(d_a, n); // Uses 4 SMs
kernel_b<<<4, 256, 0, stream2>>>(d_b, n); // Uses 4 more SMs
// Both fit on 108-SM GPU simultaneously
Concurrent Kernel Execution: Resource-Limited (A100)
| Configuration | Kernel A Blocks | Kernel B Blocks | Concurrent? | Total Time |
|---|---|---|---|---|
| Full GPU each | 864 | 864 | No (serial) | 2T |
| Half GPU each | 432 | 432 | Yes | ~T (overlap) |
| Quarter GPU each | 216 | 216 | Yes | ~T |
| Tiny kernels | 4 | 4 | Yes | ~T |
Stream Priorities
CUDA supports stream priorities since compute capability 3.5:
void stream_priorities() {
    // Query the device's priority range. Counter-intuitively, a LOWER number
    // means a HIGHER priority; on most GPUs the range is [-1, 0].
    int least, greatest;
    cudaDeviceGetStreamPriorityRange(&least, &greatest);
    printf("Priority range: [%d (highest), %d (lowest)]\n",
           greatest, least);

    // Create one stream at each end of the priority range.
    cudaStream_t stream_high, stream_low;
    cudaStreamCreateWithPriority(&stream_high, cudaStreamNonBlocking, greatest);
    cudaStreamCreateWithPriority(&stream_low, cudaStreamNonBlocking, least);

    // Priorities influence BLOCK scheduling: when an SM drains a block from
    // the low-priority kernel, it prefers a pending high-priority block next.
    // Typical use in inference serving:
    //   high priority -> latency-sensitive single-request path
    //   low priority  -> background batch processing
    inference_kernel<<<grid, block, 0, stream_high>>>(single_request);
    batch_kernel<<<large_grid, block, 0, stream_low>>>(batch_data);

    cudaStreamDestroy(stream_high);
    cudaStreamDestroy(stream_low);
}
Stream priorities affect block scheduling, not instruction scheduling. A running block will not be preempted mid-execution. When an SM finishes a block, it preferentially schedules blocks from higher-priority streams. This means priorities are most effective when kernels have many small blocks (fine-grained scheduling opportunities).
Multi-Stream Patterns for LLM Inference
#include <cuda_runtime.h>
// Pattern: Overlap attention computation with KV cache transfer
// Bundle of streams/events for one pipelined inference step: compute, KV-cache
// prefetch, and output drain each get their own queue, chained with events.
struct InferenceStreams {
cudaStream_t compute;          // attention / matmul kernels
cudaStream_t kv_transfer;      // H2D prefetch of the next KV cache
cudaStream_t output_transfer;  // D2H drain of results
cudaEvent_t kv_ready;          // fires when the KV upload completes
cudaEvent_t compute_done;      // fires when the compute kernel completes
};
// Creates the three work queues and the two sync-only events used to chain
// them. Events are created with timing disabled for lower record overhead.
void init_streams(InferenceStreams* s) {
    cudaStreamCreate(&s->compute);
    cudaStreamCreate(&s->kv_transfer);
    cudaStreamCreate(&s->output_transfer);
    cudaEventCreateWithFlags(&s->kv_ready, cudaEventDisableTiming);
    cudaEventCreateWithFlags(&s->compute_done, cudaEventDisableTiming);
}
// One pipelined decoder step: KV prefetch, attention, and output drain each
// run in their own stream, chained with on-device event waits so the host
// never blocks inside this function.
// NOTE: grid, block, and attention_kernel are defined elsewhere.
void inference_step(InferenceStreams* s,
                    float* d_input, float* d_kv_cache,
                    float* d_output, float* h_output,
                    float* h_next_kv, int seq_len, int hidden_dim) {
    // kv_transfer stream: stage the next layer's KV cache onto the device,
    // then mark completion with an event.
    cudaMemcpyAsync(d_kv_cache, h_next_kv,
                    seq_len * hidden_dim * sizeof(float),
                    cudaMemcpyHostToDevice, s->kv_transfer);
    cudaEventRecord(s->kv_ready, s->kv_transfer);

    // compute stream: gate the attention kernel on the KV upload (the wait is
    // enqueued on the GPU, not a host sync), then mark completion.
    cudaStreamWaitEvent(s->compute, s->kv_ready);
    attention_kernel<<<grid, block, 0, s->compute>>>(
        d_input, d_kv_cache, d_output, seq_len, hidden_dim);
    cudaEventRecord(s->compute_done, s->compute);

    // output_transfer stream: drain the result once compute has finished.
    cudaStreamWaitEvent(s->output_transfer, s->compute_done);
    cudaMemcpyAsync(h_output, d_output,
                    hidden_dim * sizeof(float),
                    cudaMemcpyDeviceToHost, s->output_transfer);
}
Pattern: Batch Processing with Double-Buffering
// Double-buffered batch pipeline: while batch b computes in one stream, batch
// b+1's H2D copy proceeds in the other. FIFO ordering within each stream makes
// it safe to reuse a buffer every NUM_BUFFERS batches.
// NOTE: h_inputs/h_outputs should point at pinned memory for real overlap.
void batched_inference(float** h_inputs, float** h_outputs,
                       int batch_count, int n) {
    const int NUM_BUFFERS = 2;
    float* d_in[NUM_BUFFERS];
    float* d_out[NUM_BUFFERS];
    cudaStream_t streams[NUM_BUFFERS];

    for (int i = 0; i < NUM_BUFFERS; ++i) {
        cudaMalloc(&d_in[i], n * sizeof(float));
        cudaMalloc(&d_out[i], n * sizeof(float));
        cudaStreamCreate(&streams[i]);
    }

    for (int batch = 0; batch < batch_count; ++batch) {
        const int buf = batch % NUM_BUFFERS;
        // H2D -> compute -> D2H, all queued in this buffer's stream.
        cudaMemcpyAsync(d_in[buf], h_inputs[batch], n * sizeof(float),
                        cudaMemcpyHostToDevice, streams[buf]);
        inference_kernel<<<(n + 255) / 256, 256, 0, streams[buf]>>>(
            d_in[buf], d_out[buf], n);
        cudaMemcpyAsync(h_outputs[batch], d_out[buf], n * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[buf]);
    }
    cudaDeviceSynchronize();

    for (int i = 0; i < NUM_BUFFERS; ++i) {
        cudaStreamDestroy(streams[i]);
        cudaFree(d_in[i]);
        cudaFree(d_out[i]);
    }
}
Stream Callbacks
Stream callbacks execute a host function when all preceding operations in the stream complete:
#include <cuda_runtime.h>
#include <cstdio>
// Host-side completion hook; 'data' is the int* supplied at enqueue time.
// Runs on a CUDA runtime thread once all prior work in the stream is done.
void CUDART_CB my_callback(void* data) {
    int* task_id = static_cast<int*>(data);
    printf("Task %d completed on GPU\n", *task_id);
    // Safe here: signal a semaphore/condvar, push to a host-side queue, etc.
    // NEVER call into the CUDA API from a callback — it deadlocks.
}
void callback_demo() {
    // NOTE: grid, block, d_data, n, my_kernel, another_kernel come from
    // elsewhere in the file.
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Outlives the callback: cudaStreamSynchronize below guarantees the
    // callback has fired before this frame is torn down.
    int task_id = 42;

    my_kernel<<<grid, block, 0, stream>>>(d_data, n);
    // The host function fires after the kernel above completes; later work in
    // the same stream waits for the callback in turn.
    cudaLaunchHostFunc(stream, my_callback, &task_id);
    another_kernel<<<grid, block, 0, stream>>>(d_data, n);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
Host callbacks (cudaLaunchHostFunc) must not call any CUDA API functions. Doing so causes deadlock because the callback runs on a CUDA runtime thread that holds internal locks. Use callbacks only for signaling mechanisms (semaphores, condition variables, atomics) or lightweight host-side bookkeeping.
Measuring Stream Overlap with Events
#include <cuda_runtime.h>
#include <cstdio>
// Per-stream event bookends for the three pipeline stages, so each stage's
// duration can be read back with cudaEventElapsedTime.
struct StreamTimeline {
cudaEvent_t h2d_start, h2d_end;          // bracket the H2D copy
cudaEvent_t compute_start, compute_end;  // bracket the kernel
cudaEvent_t d2h_start, d2h_end;          // bracket the D2H copy
};
// Runs an N-stream copy/compute/copy pipeline with events bracketing every
// stage, then prints per-stage durations and the overall wall time.
// NOTE: process_kernel is defined elsewhere in the file.
void measure_overlap() {
const int NUM_STREAMS = 4;
StreamTimeline tl[NUM_STREAMS];
cudaStream_t streams[NUM_STREAMS];
// One stream plus six stage-boundary events per pipeline lane.
for (int i = 0; i < NUM_STREAMS; i++) {
cudaStreamCreate(&streams[i]);
cudaEventCreate(&tl[i].h2d_start);
cudaEventCreate(&tl[i].h2d_end);
cudaEventCreate(&tl[i].compute_start);
cudaEventCreate(&tl[i].compute_end);
cudaEventCreate(&tl[i].d2h_start);
cudaEventCreate(&tl[i].d2h_end);
}
float *h_data, *d_data[NUM_STREAMS];
int chunk_size = 1 << 22;
size_t chunk_bytes = chunk_size * sizeof(float);
// Pinned host buffer — required for the async copies below to overlap.
cudaMallocHost(&h_data, chunk_bytes * NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cudaMalloc(&d_data[i], chunk_bytes);
}
// Launch pipelined work with timing events. Each event is recorded in the
// same stream as the operation it brackets, so the elapsed time between a
// start/end pair measures exactly that operation.
for (int i = 0; i < NUM_STREAMS; i++) {
cudaEventRecord(tl[i].h2d_start, streams[i]);
cudaMemcpyAsync(d_data[i], h_data + i * chunk_size, chunk_bytes,
cudaMemcpyHostToDevice, streams[i]);
cudaEventRecord(tl[i].h2d_end, streams[i]);
cudaEventRecord(tl[i].compute_start, streams[i]);
process_kernel<<<(chunk_size + 255) / 256, 256, 0, streams[i]>>>(
d_data[i], chunk_size);
cudaEventRecord(tl[i].compute_end, streams[i]);
cudaEventRecord(tl[i].d2h_start, streams[i]);
cudaMemcpyAsync(h_data + i * chunk_size, d_data[i], chunk_bytes,
cudaMemcpyDeviceToHost, streams[i]);
cudaEventRecord(tl[i].d2h_end, streams[i]);
}
// All events must have fired before elapsed times can be read.
cudaDeviceSynchronize();
// Print per-stage timeline for each stream.
for (int i = 0; i < NUM_STREAMS; i++) {
float h2d_ms, compute_ms, d2h_ms;
cudaEventElapsedTime(&h2d_ms, tl[i].h2d_start, tl[i].h2d_end);
cudaEventElapsedTime(&compute_ms, tl[i].compute_start, tl[i].compute_end);
cudaEventElapsedTime(&d2h_ms, tl[i].d2h_start, tl[i].d2h_end);
printf("Stream %d: H2D=%.2f ms, Compute=%.2f ms, D2H=%.2f ms\n",
i, h2d_ms, compute_ms, d2h_ms);
}
// Total time from first event to last event. The two events live in
// different streams; elapsed time still works because events carry GPU
// timestamps.
float total_ms;
cudaEventElapsedTime(&total_ms, tl[0].h2d_start,
tl[NUM_STREAMS - 1].d2h_end);
printf("Total wall time: %.2f ms\n", total_ms);
// Cleanup
for (int i = 0; i < NUM_STREAMS; i++) {
cudaEventDestroy(tl[i].h2d_start);
cudaEventDestroy(tl[i].h2d_end);
cudaEventDestroy(tl[i].compute_start);
cudaEventDestroy(tl[i].compute_end);
cudaEventDestroy(tl[i].d2h_start);
cudaEventDestroy(tl[i].d2h_end);
cudaStreamDestroy(streams[i]);
cudaFree(d_data[i]);
}
cudaFreeHost(h_data);
}
Stream-Ordered Memory Allocation (CUDA 11.2+)
#include <cuda_runtime.h>
void stream_ordered_alloc() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Stream-ordered allocation (CUDA 11.2+): the alloc and free are commands
    // in the stream, so the actual deallocation is deferred until the kernel
    // that uses d_temp has completed.
    float* d_temp;
    cudaMallocAsync(&d_temp, 1024 * sizeof(float), stream);
    my_kernel<<<4, 256, 0, stream>>>(d_temp, 1024);
    cudaFreeAsync(d_temp, stream);

    // Freed blocks return to a per-device memory pool; the release threshold
    // controls how much stays cached for fast reuse.
    cudaMemPool_t pool;
    cudaDeviceGetDefaultMemPool(&pool, 0);
    uint64_t threshold = 256 * 1024 * 1024; // keep up to 256 MB cached
    cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

    cudaStreamDestroy(stream);
}
Memory Allocation: cudaMalloc vs cudaMallocAsync (A100)
| Method | Latency (first call) | Latency (cached) | Stream-ordered |
|---|---|---|---|
| cudaMalloc | ~50 us | ~50 us | No (synchronizing) |
| cudaMallocAsync | ~50 us | ~1 us | Yes (non-blocking) |
Nsight Systems: Visualizing Stream Concurrency
# Profile with Nsight Systems to visualize stream timelines
nsys profile -o timeline ./my_app
# Open in Nsight Systems GUI:
# - Each stream shown as a horizontal timeline
# - Kernel bars show execution on SMs
# - Memory copy bars show DMA engine usage
# - Overlapping bars = concurrent execution
# CLI summary of stream usage:
nsys stats timeline.nsys-rep
# Look for:
# Gaps between operations in the same stream (underutilization)
# Serial execution across streams (unintended synchronization)
# H2D/D2H overlap with compute (desired)
# Default stream barriers (unintended)
Common Stream Pitfalls
Pitfall 1: Pageable Memory Blocks Async Transfers
// BAD: pageable memory — cudaMemcpyAsync is actually synchronous
// (the runtime must stage through an internal pinned buffer, so the call
// cannot return until the copy finishes)
float* h_pageable = (float*)malloc(n * sizeof(float));
cudaMemcpyAsync(d_ptr, h_pageable, n * sizeof(float),
cudaMemcpyHostToDevice, stream); // BLOCKS until complete
// GOOD: pinned memory — truly asynchronous
// (remember: release with cudaFreeHost, not free)
float* h_pinned;
cudaMallocHost(&h_pinned, n * sizeof(float));
cudaMemcpyAsync(d_ptr, h_pinned, n * sizeof(float),
cudaMemcpyHostToDevice, stream); // Returns immediately
Pitfall 2: Over-Synchronizing
// BAD: synchronizing after every operation
// (each sync drains the stream and stalls the host, so launch overhead is
// paid serially and no back-to-back kernels ever queue up)
for (int i = 0; i < 100; i++) {
kernel<<<grid, block, 0, stream>>>(d_data, n);
cudaStreamSynchronize(stream); // Kills all pipelining
}
// GOOD: synchronize once at the end
// (launches are async, so the host enqueues all 100 kernels immediately)
for (int i = 0; i < 100; i++) {
kernel<<<grid, block, 0, stream>>>(d_data, n);
}
cudaStreamSynchronize(stream); // Wait for all 100 kernels
Pitfall 3: False Dependencies from Event Recording Order
// Issue order on host matters for stream scheduling
// The GPU processes commands in each stream in FIFO order
// Pattern to maximize overlap:
// Issue ALL H2D copies first, THEN all kernels, THEN all D2H copies
// This lets the DMA engines start early
// GOOD: breadth-first issuing
for (int i = 0; i < N; i++)
cudaMemcpyAsync(d_in[i], h_in[i], bytes, cudaMemcpyHostToDevice, streams[i]);
for (int i = 0; i < N; i++)
kernel<<<grid, block, 0, streams[i]>>>(d_in[i], d_out[i]);
for (int i = 0; i < N; i++)
cudaMemcpyAsync(h_out[i], d_out[i], bytes, cudaMemcpyDeviceToHost, streams[i]);
// LESS OPTIMAL: depth-first issuing
// (each iteration enqueues a full H2D -> kernel -> D2H chain before the next
// stream gets any work, which can delay the later streams' copies)
for (int i = 0; i < N; i++) {
cudaMemcpyAsync(d_in[i], h_in[i], bytes, cudaMemcpyHostToDevice, streams[i]);
kernel<<<grid, block, 0, streams[i]>>>(d_in[i], d_out[i]);
cudaMemcpyAsync(h_out[i], d_out[i], bytes, cudaMemcpyDeviceToHost, streams[i]);
}
// Depth-first CAN work but may have slightly less overlap depending on hardware
On modern GPUs with multiple hardware queues, the difference between breadth-first and depth-first issuing is smaller than on older architectures. However, breadth-first still guarantees maximum overlap because it fills all DMA queues before compute queues. When in doubt, use breadth-first.
Putting It Together: Production Stream Manager
#include <cuda_runtime.h>
#include <vector>
// Owns a pool of non-blocking streams (optionally all at the device's highest
// priority) plus one sync-only event per stream for cross-stream dependencies.
// All accessors index round-robin, so any non-negative idx is valid.
class StreamManager {
public:
    // num_streams: pool size. high_priority: create every stream at the
    // device's greatest priority (the numerically smallest value).
    StreamManager(int num_streams, bool high_priority = false) {
        int least, greatest;
        cudaDeviceGetStreamPriorityRange(&least, &greatest);
        int priority = high_priority ? greatest : least;
        streams_.resize(num_streams);
        events_.resize(num_streams);
        for (int i = 0; i < num_streams; i++) {
            cudaStreamCreateWithPriority(&streams_[i],
                                         cudaStreamNonBlocking, priority);
            cudaEventCreateWithFlags(&events_[i], cudaEventDisableTiming);
        }
    }

    // FIX: non-copyable. The original was implicitly copyable; copying would
    // make two owners destroy the same streams/events (double-free of CUDA
    // handles in the destructor).
    StreamManager(const StreamManager&) = delete;
    StreamManager& operator=(const StreamManager&) = delete;

    ~StreamManager() {
        for (auto& s : streams_) cudaStreamDestroy(s);
        for (auto& e : events_) cudaEventDestroy(e);
    }

    // Round-robin stream accessor.
    cudaStream_t get(int idx) { return streams_[idx % streams_.size()]; }

    // Record stream idx's event (marks "all work issued so far is done").
    void record(int idx) {
        cudaEventRecord(events_[idx % events_.size()],
                        streams_[idx % streams_.size()]);
    }

    // Enqueue an on-device wait: stream wait_stream_idx blocks until event
    // event_idx fires. The host does not block.
    void wait(int wait_stream_idx, int event_idx) {
        cudaStreamWaitEvent(streams_[wait_stream_idx % streams_.size()],
                            events_[event_idx % events_.size()]);
    }

    // Block the host until every stream in the pool has drained.
    void sync_all() {
        for (auto& s : streams_) cudaStreamSynchronize(s);
    }

    // FIX: explicit cast — the original returned size_t with a silent
    // narrowing conversion.
    int count() const { return static_cast<int>(streams_.size()); }

private:
    std::vector<cudaStream_t> streams_;
    std::vector<cudaEvent_t> events_;
};
// Usage:
// Round-robins each batch onto one of the manager's streams so transfers and
// compute overlap across in-flight batches; blocks only at the end.
// NOTE: process_kernel is defined elsewhere; h_data should be pinned.
void run_pipeline(StreamManager& sm, float** h_data, float** d_data,
                  int n, int batches) {
    const int block = 256;
    const int grid = (n + block - 1) / block;
    const size_t bytes = n * sizeof(float);

    for (int b = 0; b < batches; ++b) {
        const int s = b % sm.count();
        cudaStream_t st = sm.get(s);
        cudaMemcpyAsync(d_data[s], h_data[b], bytes,
                        cudaMemcpyHostToDevice, st);
        process_kernel<<<grid, block, 0, st>>>(d_data[s], n);
        cudaMemcpyAsync(h_data[b], d_data[s], bytes,
                        cudaMemcpyDeviceToHost, st);
    }
    sm.sync_all();
}
Stream Configuration Impact on Pipeline Throughput (GB/s effective)
Summary
Streams enable concurrent execution on the GPU by separating independent operations into parallel command queues. The three hardware engines (compute SMs, H2D DMA, D2H DMA) operate independently, so work in different streams can overlap. Events provide inter-stream synchronization (cudaStreamWaitEvent) and timing (cudaEventElapsedTime). The key requirements for overlap are: pinned host memory for async transfers, non-default streams (or cudaStreamNonBlocking), and independent data between streams. Stream-ordered memory allocation (cudaMallocAsync) avoids the synchronizing cost of cudaMalloc. For most applications, 2-4 streams with double-buffering provides near-optimal overlap. Profile with Nsight Systems to verify actual concurrency.