Pre-CUDA 9, performing a reduction on the first 16 threads of a warp required implicit warp-lockstep assumptions, manual mask management, and code that broke on Volta when independent thread scheduling was introduced. Cooperative Groups provides tiled_partition<16> which creates a typed sub-warp tile that synchronizes correctly across all architectures. The tile supports .sync(), .shfl_down(), and .reduce() operations. The same API extends to grid-wide synchronization: grid_group provides a typed handle for synchronizing all thread blocks in a kernel, eliminating the need for multi-kernel launches when a global barrier is required. Cooperative Groups unifies synchronization from 1 thread to 10,000 threads under a single type-safe API.
All measurements target NVIDIA Ampere (A100-80GB SXM, SM 8.0) unless stated otherwise.
The Cooperative Groups Type Hierarchy
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void demonstrate_groups() {
// Thread block group (replaces __syncthreads scope)
cg::thread_block block = cg::this_thread_block();
// Warp-level tile (compile-time size, must be power of 2, max 32)
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
cg::thread_block_tile<16> half_warp = cg::tiled_partition<16>(block);
cg::thread_block_tile<8> quarter_warp = cg::tiled_partition<8>(block);
cg::thread_block_tile<4> mini_tile = cg::tiled_partition<4>(block);
// Coalesced group: only the active threads in a divergent branch
if (threadIdx.x % 2 == 0) {
cg::coalesced_group active = cg::coalesced_threads();
// active.size() == blockDim.x / 2
// active.thread_rank() == contiguous rank among active threads
}
// Properties available on all group types:
// group.size() — number of threads in the group
// group.thread_rank() — this thread's index within the group (0-based)
// group.sync() — synchronize all threads in the group
}
Grid-Level and Multi-Grid Groups
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Grid group requires cooperative launch
__global__ void grid_cooperative_kernel(float* data, int n) {
cg::grid_group grid = cg::this_grid();
// grid.size() — total threads in the grid
// grid.thread_rank() — global thread index
// grid.is_valid() — true if launched cooperatively
// grid.sync() — barrier across ALL blocks in the grid
int idx = grid.thread_rank();
if (idx < n) {
data[idx] *= 2.0f;
}
grid.sync(); // All blocks must reach this point
if (idx < n) {
data[idx] += 1.0f;
}
}
// Launch must use cooperative API
void launch_cooperative() {
int block_size = 256;
int num_blocks;
// Query max blocks that can be co-resident for grid.sync() to work
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks, grid_cooperative_kernel, block_size, 0);
int num_sms;
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0);
// Total blocks must fit on the GPU simultaneously
int grid_size = num_blocks * num_sms;
void* args[] = { &data, &n };
cudaLaunchCooperativeKernel(
(void*)grid_cooperative_kernel, grid_size, block_size, args);
}
cudaLaunchCooperativeKernel requires that ALL blocks in the grid can be resident on the GPU simultaneously. This means gridDim * blockDim must not exceed maxActiveBlocks * numSMs * threadsPerBlock. If the grid is too large, the launch fails. Check device property cooperativeLaunch for support.
Thread Block Tiles: Sub-Warp Programming
The thread_block_tile template provides a clean API for sub-warp operations. The tile size is a compile-time constant and must be a power of 2:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Reduction within a tile of arbitrary power-of-2 size
template <int TILE_SIZE>
__device__ float tile_reduce_sum(cg::thread_block_tile<TILE_SIZE> tile, float val) {
for (int offset = TILE_SIZE / 2; offset > 0; offset >>= 1) {
val += tile.shfl_down(val, offset);
}
return val; // Only thread 0 of the tile has the final sum
}
// Broadcast from one tile member to all others
template <int TILE_SIZE>
__device__ float tile_broadcast(cg::thread_block_tile<TILE_SIZE> tile,
float val, int src_rank) {
return tile.shfl(val, src_rank);
}
// Ballot: which tile members satisfy a predicate
template <int TILE_SIZE>
__device__ unsigned tile_ballot(cg::thread_block_tile<TILE_SIZE> tile, bool predicate) {
return tile.ballot(predicate); // Returns a TILE_SIZE-bit mask
}
__global__ void tile_demo(const float* input, float* output, int n) {
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<8> tile8 = cg::tiled_partition<8>(block);
int global_idx = threadIdx.x + blockIdx.x * blockDim.x;
if (global_idx >= n) return;
float val = input[global_idx];
// Each group of 8 threads computes its own sum
float tile_sum = tile_reduce_sum(tile8, val);
// Thread 0 of each tile writes the result
if (tile8.thread_rank() == 0) {
int tile_idx = global_idx / 8;
output[tile_idx] = tile_sum;
}
}
Available Operations on thread_block_tile
template <int Size>
__device__ void tile_operations_demo(cg::thread_block_tile<Size> tile) {
float val = (float)tile.thread_rank();
// Shuffle operations (same as __shfl_*_sync but scoped to tile)
float from_0 = tile.shfl(val, 0); // Broadcast from rank 0
float shifted = tile.shfl_down(val, 1); // Shift down by 1
float shifted_up = tile.shfl_up(val, 1); // Shift up by 1
float xor_swap = tile.shfl_xor(val, 1); // XOR shuffle
// Collective operations
unsigned ballot = tile.ballot(val > 2.0f); // Predicate vote
bool all_true = tile.all(val >= 0.0f); // Universal quantifier
bool any_true = tile.any(val > 10.0f); // Existential quantifier
// Match operation (which threads have the same value)
unsigned match = tile.match_any(val); // Bitmask of matching threads
// Sync (usually not needed for power-of-2 tiles within a warp,
// but required for correctness guarantees post-Volta)
tile.sync();
// Meta information
int rank = tile.thread_rank(); // 0 to Size-1
int size = tile.size(); // = Size
unsigned mask = tile.meta_group_size(); // Number of tiles in parent
unsigned idx = tile.meta_group_rank(); // This tile's index in parent
}
Practical Example: Tile-Based Reduction
Compare three approaches to block-level reduction:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Approach 1: Classic shared memory reduction
__global__ void reduce_classic(const float* input, float* output, int n) {
__shared__ float sdata[256];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (idx < n) ? input[idx] : 0.0f;
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) output[blockIdx.x] = sdata[0];
}
// Approach 2: Warp-tile reduction (no shared memory for final warp)
__global__ void reduce_warp_tile(const float* input, float* output, int n) {
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
__shared__ float warp_results[32]; // One per warp (max 32 warps per block)
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float val = (idx < n) ? input[idx] : 0.0f;
// Step 1: Reduce within each warp using tile shuffle
for (int offset = warp.size() / 2; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
// Step 2: Warp leaders write to shared memory
int warp_id = tid / 32;
if (warp.thread_rank() == 0) {
warp_results[warp_id] = val;
}
block.sync();
// Step 3: First warp reduces the warp results
int num_warps = blockDim.x / 32;
if (warp_id == 0) {
val = (tid < num_warps) ? warp_results[tid] : 0.0f;
for (int offset = warp.size() / 2; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
if (tid == 0) output[blockIdx.x] = val;
}
}
// Approach 3: Hierarchical tile reduction (4-thread tiles -> 32-thread warps -> block)
__global__ void reduce_hierarchical_tiles(const float* input, float* output, int n) {
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<4> tile4 = cg::tiled_partition<4>(block);
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float val = (idx < n) ? input[idx] : 0.0f;
// Level 1: Reduce within 4-thread tiles
for (int offset = 2; offset > 0; offset >>= 1) {
val += tile4.shfl_down(val, offset);
}
// Level 2: Only tile leaders participate in warp reduction
// 32/4 = 8 tile leaders per warp
if (tile4.thread_rank() == 0) {
// Use warp shuffle across the 8 leaders
// Leaders are at warp lanes 0, 4, 8, 12, 16, 20, 24, 28
for (int offset = 16; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
}
__shared__ float warp_results[32];
int warp_id = threadIdx.x / 32;
if (threadIdx.x % 32 == 0) {
warp_results[warp_id] = val;
}
block.sync();
if (warp_id == 0 && threadIdx.x < blockDim.x / 32) {
val = warp_results[threadIdx.x];
for (int offset = warp.size() / 2; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
if (threadIdx.x == 0) output[blockIdx.x] = val;
}
}
Block Reduction: Classic vs Cooperative Groups (A100, 256 threads/block, 16M elements)
| Approach | Time (us) | Bandwidth (GB/s) | Shared Mem |
|---|---|---|---|
| Classic (shared memory tree) | 52 | 1230 | 1 KB |
| Warp-tile + shared | 43 | 1490 | 128 B |
| Hierarchical tiles | 45 | 1420 | 128 B |
Coalesced Groups: Handling Divergence
When threads diverge (take different branches), coalesced_threads() creates a group containing only the active threads in the current branch:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void divergent_work(const int* flags, float* data, float* results, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= n) return;
if (flags[idx]) {
// Only threads where flags[idx] != 0 are active here
cg::coalesced_group active = cg::coalesced_threads();
float val = data[idx];
// Reduce across ONLY the active threads
for (int offset = active.size() / 2; offset > 0; offset >>= 1) {
val += active.shfl_down(val, offset);
}
// active.thread_rank() gives contiguous ranks 0..active.size()-1
if (active.thread_rank() == 0) {
// This thread has the sum of all active threads' values
int warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
atomicAdd(&results[warp_id], val);
}
}
}
Labeled Partitions
Create groups based on a label — all threads with the same label form a group:
__global__ void labeled_partition_demo(const int* categories,
const float* values,
float* category_sums,
int n, int num_categories) {
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) return;
int category = categories[idx];
float val = values[idx];
// Create a partition where threads with the same category are grouped
cg::coalesced_group same_category = cg::labeled_partition(warp, category);
// Reduce within each category group
for (int offset = same_category.size() / 2; offset > 0; offset >>= 1) {
val += same_category.shfl_down(val, offset);
}
// Leader of each category group atomically adds to global
if (same_category.thread_rank() == 0) {
atomicAdd(&category_sums[category], val);
}
}
coalesced_threads() and labeled_partition() have non-trivial overhead because they must determine the active mask at runtime. For performance-critical inner loops where the active mask is known at compile time, prefer thread_block_tile with explicit masking. Use coalesced groups when the divergence pattern is data-dependent and cannot be predicted.
Grid-Level Synchronization
Grid sync allows all blocks to synchronize mid-kernel, eliminating the need for multi-kernel launches:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Single-kernel two-pass reduction using grid.sync()
__global__ void grid_reduce(const float* input, float* block_sums,
float* result, int n) {
cg::grid_group grid = cg::this_grid();
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
__shared__ float sdata[32]; // One per warp
// Phase 1: Each block reduces its chunk
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float val = (idx < n) ? input[idx] : 0.0f;
// Warp-level reduction
for (int offset = 16; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
int warp_id = threadIdx.x / 32;
if (warp.thread_rank() == 0) sdata[warp_id] = val;
block.sync();
// First warp reduces warp results
if (warp_id == 0) {
int num_warps = blockDim.x / 32;
val = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : 0.0f;
for (int offset = 16; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
if (threadIdx.x == 0) block_sums[blockIdx.x] = val;
}
// Synchronize ALL blocks — no thread proceeds until all blocks complete Phase 1
grid.sync();
// Phase 2: Block 0 reduces the block sums
if (blockIdx.x == 0) {
int num_blocks = gridDim.x;
val = 0.0f;
for (int i = threadIdx.x; i < num_blocks; i += blockDim.x) {
val += block_sums[i];
}
for (int offset = 16; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
if (warp_id == 0) sdata[warp_id] = val;
block.sync();
if (threadIdx.x == 0) {
float total = 0.0f;
int num_warps = blockDim.x / 32;
for (int i = 0; i < num_warps; i++) total += sdata[i];
*result = total;
}
}
}
void launch_grid_reduce(const float* d_input, float* d_block_sums,
float* d_result, int n) {
int block_size = 256;
int max_blocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks, grid_reduce, block_size, 0);
int num_sms;
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0);
int grid_size = max_blocks * num_sms;
// Ensure we don't launch more blocks than needed
int needed = (n + block_size - 1) / block_size;
if (grid_size > needed) grid_size = needed;
void* args[] = { (void*)&d_input, (void*)&d_block_sums, (void*)&d_result, &n };
cudaLaunchCooperativeKernel(
(void*)grid_reduce, grid_size, block_size, args);
}
Grid Sync vs Two-Kernel Approach
// Alternative: two separate kernel launches (no cooperative groups needed)
__global__ void pass1_block_reduce(const float* input, float* block_sums, int n) {
// Same as Phase 1 above
// ...
}
__global__ void pass2_final_reduce(const float* block_sums, float* result,
int num_blocks) {
// Same as Phase 2 above
// ...
}
void launch_two_pass(const float* d_input, float* d_block_sums,
float* d_result, int n) {
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
pass1_block_reduce<<<grid_size, block_size>>>(d_input, d_block_sums, n);
pass2_final_reduce<<<1, 256>>>(d_block_sums, d_result, grid_size);
}
Grid Reduction: Cooperative vs Two-Kernel (A100, 16M elements)
| Approach | Time (us) | Notes |
|---|---|---|
| Two-kernel launch | 48 | Includes kernel launch overhead (~5 us) |
| Cooperative grid.sync() | 44 | Single launch, limited grid size |
| atomicAdd (baseline) | 82 | All threads atomic to single address |
Grid-Level Cooperative Algorithms: Jacobi Iteration
Grid sync shines in iterative algorithms that alternate between computation and global synchronization:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Jacobi iteration for 2D Laplace equation
// Without cooperative groups: need 2 kernel launches per iteration
// With cooperative groups: single kernel, grid.sync() between iterations
__global__ void jacobi_cooperative(float* u, float* u_new,
float* diff, int nx, int ny,
int num_iterations) {
cg::grid_group grid = cg::this_grid();
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
int idx = grid.thread_rank();
int total_threads = grid.size();
for (int iter = 0; iter < num_iterations; iter++) {
// Each thread updates one or more grid points
for (int i = idx; i < nx * ny; i += total_threads) {
int x = i % nx;
int y = i / nx;
if (x > 0 && x < nx - 1 && y > 0 && y < ny - 1) {
float new_val = 0.25f * (
u[y * nx + (x - 1)] +
u[y * nx + (x + 1)] +
u[(y - 1) * nx + x] +
u[(y + 1) * nx + x]
);
u_new[i] = new_val;
}
}
grid.sync(); // All threads must finish before swapping
// Swap pointers (all threads do this identically)
float* temp = u;
u = u_new;
u_new = temp;
grid.sync(); // Ensure swap is visible before next iteration
}
}
Jacobi Iteration: Cooperative vs Multi-Kernel (A100, 4096x4096 grid)
(ms per 100 iterations)Multi-Grid Groups (Multi-GPU)
For multi-GPU cooperative launches, use multi_grid_group:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void multi_gpu_kernel(float* local_data, float* peer_data, int n) {
cg::multi_grid_group multi_grid = cg::this_multi_grid();
int device_rank = multi_grid.grid_rank(); // Which GPU (0, 1, ...)
int num_devices = multi_grid.num_grids(); // Total GPU count
// Local computation
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
local_data[idx] *= 2.0f;
}
// Synchronize across ALL GPUs
multi_grid.sync();
// Now safe to read peer data (via NVLink/PCIe P2P)
if (idx < n) {
local_data[idx] += peer_data[idx];
}
}
// Launch requires cudaLaunchCooperativeKernelMultiDevice
void launch_multi_gpu(float** d_local, float** d_peer, int n, int num_gpus) {
cudaLaunchParams params[8]; // Max 8 GPUs
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
for (int i = 0; i < num_gpus; i++) {
params[i].func = (void*)multi_gpu_kernel;
params[i].gridDim = grid_size;
params[i].blockDim = block_size;
params[i].sharedMem = 0;
params[i].stream = nullptr;
void* args[] = { &d_local[i], &d_peer[i], &n };
params[i].args = args;
}
cudaLaunchCooperativeKernelMultiDevice(params, num_gpus);
}
Multi-grid cooperative launches require all GPUs to be the same model, support P2P access, and be connected via NVLink or PCIe. The grid size on each GPU is constrained by co-residency requirements. This feature is deprecated in CUDA 12.x in favor of cudaLaunchKernelEx with cluster support on Hopper.
Thread Block Clusters (Hopper, SM 9.0)
On Hopper GPUs (SM 9.0), cooperative groups integrate with Thread Block Clusters — groups of thread blocks that can synchronize and share distributed shared memory:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Requires SM 9.0+ and CUDA 12.0+
__global__ void __cluster_dims__(2, 1, 1) // Cluster of 2 blocks
cluster_kernel(float* data, int n) {
cg::cluster_group cluster = cg::this_cluster();
cg::thread_block block = cg::this_thread_block();
__shared__ float local_smem[256];
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n) {
local_smem[threadIdx.x] = data[idx];
}
block.sync();
// Synchronize all blocks in the cluster
cluster.sync();
// Access distributed shared memory from peer block in the cluster
unsigned int peer_block_rank = (cluster.block_rank() + 1) % cluster.num_blocks();
float* peer_smem = cluster.map_shared_rank(local_smem, peer_block_rank);
// Now peer_smem points to the other block's shared memory
if (idx < n) {
data[idx] = local_smem[threadIdx.x] + peer_smem[threadIdx.x];
}
}
Performance Patterns and Best Practices
Pattern: Persistent Kernel with Grid Sync
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Process a stream of work items with a single persistent kernel
__global__ void persistent_worker(float* work_queue, float* results,
int* queue_head, int total_items) {
cg::grid_group grid = cg::this_grid();
cg::thread_block block = cg::this_thread_block();
int tid = grid.thread_rank();
int grid_size = grid.size();
// Each thread processes items in a strided loop
for (int base = 0; base < total_items; base += grid_size) {
int item_idx = base + tid;
if (item_idx < total_items) {
// Process work item
float val = work_queue[item_idx];
results[item_idx] = val * val + 1.0f;
}
// Optional: grid sync if subsequent items depend on previous results
grid.sync();
}
}
Pattern: Cooperative Scan (Prefix Sum)
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void cooperative_scan(float* data, float* block_sums, int n) {
cg::grid_group grid = cg::this_grid();
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
extern __shared__ float sdata[];
// Phase 1: Block-level inclusive scan
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float val = (idx < n) ? data[idx] : 0.0f;
// Warp-level scan
for (int offset = 1; offset < 32; offset <<= 1) {
float received = warp.shfl_up(val, offset);
if (warp.thread_rank() >= offset) val += received;
}
int warp_id = threadIdx.x / 32;
int lane = threadIdx.x % 32;
// Store warp results
if (lane == 31) sdata[warp_id] = val;
block.sync();
// Scan warp sums (first warp only)
if (warp_id == 0 && threadIdx.x < blockDim.x / 32) {
float warp_sum = sdata[threadIdx.x];
for (int offset = 1; offset < blockDim.x / 32; offset <<= 1) {
float received = warp.shfl_up(warp_sum, offset);
if (warp.thread_rank() >= offset) warp_sum += received;
}
sdata[threadIdx.x] = warp_sum;
}
block.sync();
// Add warp prefix to each element
if (warp_id > 0) val += sdata[warp_id - 1];
// Store block result and block sum
if (idx < n) data[idx] = val;
if (threadIdx.x == blockDim.x - 1) block_sums[blockIdx.x] = val;
grid.sync();
// Phase 2: Scan block sums (block 0 only)
if (blockIdx.x == 0) {
for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x) {
// Simple serial scan for block sums (few enough blocks)
}
}
grid.sync();
// Phase 3: Add block prefix to all elements
if (blockIdx.x > 0 && idx < n) {
data[idx] += block_sums[blockIdx.x - 1];
}
}
Compilation and Linking Requirements
# Cooperative groups require separate compilation for grid-level features
# and -rdc=true (relocatable device code) for grid.sync()
# Basic cooperative groups (tile, block):
nvcc -arch=sm_80 kernel.cu -o kernel
# Grid-level cooperative launch:
nvcc -arch=sm_80 -rdc=true kernel.cu -o kernel -lcudadevrt
# Multi-grid (deprecated):
nvcc -arch=sm_80 -rdc=true kernel.cu -o kernel -lcudadevrt
# Cluster (Hopper):
nvcc -arch=sm_90 -rdc=true kernel.cu -o kernel -lcudadevrt
thread_block_tile: Always prefer over raw __shfl_*_sync — same performance, better readability, compile-time tile size. coalesced_group: Use for data-dependent divergence where you need to operate on active threads. grid_group: Use for iterative algorithms (Jacobi, PageRank, BFS) that need global barriers without multi-kernel overhead. cluster_group: Hopper-only, use for distributed shared memory across nearby blocks.
Synchronization Overhead by Scope (A100)
(nanoseconds)Summary
Cooperative Groups provides a type-safe hierarchy for thread synchronization in CUDA. thread_block_tile replaces raw warp intrinsics with compile-time-sized tiles that support shuffle, ballot, and reduction operations. coalesced_group handles data-dependent divergence by grouping only active threads. grid_group enables grid-wide synchronization in a single kernel launch, eliminating multi-kernel overhead for iterative algorithms. The API composes naturally — tiles within warps within blocks within grids — and the compiler generates the same hardware instructions as hand-written intrinsics. For new CUDA code, cooperative groups should be the default synchronization mechanism.