Dynamo’s 2026 roadmap centers on three hardware advances shipping this year. CXL 3.0 memory expanders add a new tier between GPU HBM (3.35 TB/s) and host DRAM (50 GB/s via PCIe): CXL memory at 100-400 GB/s lets you extend KV cache from 80GB to 1TB without hitting DRAM’s latency wall. NVLink Switch connects 576 GPUs in a single coherent fabric, eliminating InfiniBand for multi-node all-reduce—reducing TP=16 communication latency from 80μs (IB) to 4μs (NVLink). Fully disaggregated prefill uses dedicated NVIDIA Grace CPUs for prompt processing, freeing GPU SMs for pure decode. Each unlocks 2-4x capacity gains.
CXL Memory for KV Cache Tiering
CXL 3.0 memory expanders provide a new tier between HBM and host DRAM: at 100-400 GB/s they deliver roughly 8-30x lower bandwidth than HBM (3.35+ TB/s) but 2-8x higher bandwidth than PCIe-attached host DRAM (~50 GB/s).
class CXLMemoryTier:
    """
    Model of a tiered memory hierarchy for KV cache overflow.

    2026 memory hierarchy for KV cache:
        Tier 0: GPU HBM (3.35 TB/s on H100, ~5 TB/s on B200)
        Tier 1: CXL-attached memory (100-400 GB/s)
        Tier 2: Host DRAM via PCIe (32-64 GB/s)
        Tier 3: NVMe SSD (7 GB/s)
    """

    def __init__(self, config: dict):
        """Build tier descriptors, honoring capacity/bandwidth overrides in *config*."""
        get = config.get
        self.tiers = {
            "hbm": {
                "capacity_gb": get("hbm_gb", 80),
                "bandwidth_tb_s": get("hbm_bw", 3.35),
                "latency_ns": 100,
                "cost_per_gb_month": 10.0,  # relative cost
            },
            "cxl": {
                "capacity_gb": get("cxl_gb", 512),
                "bandwidth_tb_s": get("cxl_bw", 0.2),
                "latency_ns": 300,
                "cost_per_gb_month": 1.5,
            },
            "dram": {
                "capacity_gb": get("dram_gb", 1024),
                "bandwidth_tb_s": get("dram_bw", 0.05),
                "latency_ns": 1000,
                "cost_per_gb_month": 0.5,
            },
        }

    def optimal_kv_placement(
        self,
        total_kv_gb: float,
        attention_pattern: str = "causal"
    ) -> dict:
        """
        Choose a placement of *total_kv_gb* of KV cache across tiers.

        Strategy: recently accessed KV blocks stay in HBM, older blocks
        spill to CXL, and the coldest overflow lands in DRAM.
        *attention_pattern* is currently unused (reserved for
        pattern-aware policies).
        """
        # Only half of HBM is budgeted for KV; the rest holds weights/activations.
        hbm_budget = self.tiers["hbm"]["capacity_gb"] * 0.5
        cxl_budget = self.tiers["cxl"]["capacity_gb"]
        dram_budget = self.tiers["dram"]["capacity_gb"]

        if total_kv_gb <= hbm_budget:
            return {
                "hbm_gb": total_kv_gb,
                "cxl_gb": 0,
                "dram_gb": 0,
                "strategy": "all_hbm",
                "latency_impact": "none"
            }

        if total_kv_gb <= hbm_budget + cxl_budget:
            # Recent tokens in HBM, older tokens overflow into CXL.
            return {
                "hbm_gb": hbm_budget,
                "cxl_gb": total_kv_gb - hbm_budget,
                "dram_gb": 0,
                "strategy": "hbm_cxl",
                "latency_impact": "moderate (CXL attention has 3x latency)"
            }

        overflow_gb = total_kv_gb - hbm_budget - cxl_budget
        return {
            "hbm_gb": hbm_budget,
            "cxl_gb": cxl_budget,
            # Clamp to DRAM capacity; anything beyond that is not placed.
            "dram_gb": min(overflow_gb, dram_budget),
            "strategy": "hbm_cxl_dram",
            "latency_impact": "high (DRAM attention has 10x+ latency)"
        }

    def attention_latency_with_tiers(
        self,
        seq_len: int,
        hbm_tokens: int,
        cxl_tokens: int,
        dram_tokens: int,
        kv_bytes_per_token: int
    ) -> dict:
        """
        Estimate attention latency when the KV cache spans multiple tiers.

        Attention must stream every KV entry of the sequence, so each
        tier contributes a bandwidth-bound read time of
        data_size / bandwidth. *seq_len* is accepted for interface
        compatibility; the estimate is driven by the per-tier token counts.
        """
        def stream_ms(tokens: int, tier: str) -> float:
            # Bandwidth-bound read time for one tier, in milliseconds.
            gb = tokens * kv_bytes_per_token / 1e9
            return gb / self.tiers[tier]["bandwidth_tb_s"] * 1000

        hbm_time_ms = stream_ms(hbm_tokens, "hbm")
        cxl_time_ms = stream_ms(cxl_tokens, "cxl")
        dram_time_ms = stream_ms(dram_tokens, "dram")

        # With prefetching, ~30% of the CXL read time hides behind HBM reads.
        total_ms = hbm_time_ms + cxl_time_ms * 0.7 + dram_time_ms

        return {
            "hbm_time_ms": hbm_time_ms,
            "cxl_time_ms": cxl_time_ms,
            "dram_time_ms": dram_time_ms,
            "total_attention_ms": total_ms,
            # Hypothetical latency if all tokens had lived in HBM.
            "vs_all_hbm_ms": stream_ms(hbm_tokens + cxl_tokens + dram_tokens, "hbm"),
        }
KV Cache Memory Tier Comparison
| Tier | Bandwidth | Latency | Capacity | Cost/GB/Month |
|---|---|---|---|---|
| GPU HBM3e | 5.0 TB/s | 100 ns | 80-192 GB | $10.00 |
| CXL 3.0 | 100-400 GB/s | 300 ns | 256-2048 GB | $1.50 |
| Host DRAM (DDR5) | 50-100 GB/s | 1,000 ns | 512-4096 GB | $0.50 |
| NVMe SSD | 7-14 GB/s | 10,000 ns | 4-32 TB | $0.05 |
KV Cache Cost per GB per Month
CXL fills the critical gap between HBM and DRAM. For long-context serving (32K-128K tokens), KV cache can exceed available HBM. CXL provides 6-7x the cost efficiency of HBM with only 3x the latency, making 128K context windows economically viable for high-throughput serving.
NVLink Switch: Multi-Node GPU Fabric
NVLink Switch (NVSwitch 4.0 and beyond) extends NVLink connectivity beyond a single node, creating a unified GPU fabric across racks.
class NVLinkSwitchFabric:
    """
    NVLink Switch fabric for multi-node GPU interconnect.

    Current (2025): NVLink within node only (8 GPUs, 900 GB/s per GPU)
    NVLink Switch: NVLink across nodes (32-256 GPUs, 900 GB/s)
    """

    def __init__(self, config: dict):
        """
        Args:
            config: optional overrides:
                gpus_per_node (default 8), nodes (default 4),
                intra_nvlink_bw / inter_node_bw in GB/s (default 900),
                ib_bw: InfiniBand baseline bandwidth in GB/s used for
                comparisons (default 50, i.e. 400 Gb/s).
        """
        self.gpus_per_node = config.get("gpus_per_node", 8)
        self.nodes = config.get("nodes", 4)
        self.total_gpus = self.gpus_per_node * self.nodes
        self.intra_node_bw_gb_s = config.get("intra_nvlink_bw", 900)
        # NVLink Switch: inter-node bandwidth matches intra-node.
        self.inter_node_bw_gb_s = config.get("inter_node_bw", 900)
        # InfiniBand baseline (400 Gb/s = 50 GB/s), previously hard-coded
        # in two methods; now a single configurable attribute.
        self.ib_bw_gb_s = config.get("ib_bw", 50)

    def allreduce_comparison(self, data_size_gb: float) -> dict:
        """
        Compare ring-allreduce latency: InfiniBand vs NVLink Switch.

        With InfiniBand (current):
            intra-node NVLink (900 GB/s), inter-node IB (400 Gb/s = 50 GB/s);
            the inter-node link is the bottleneck.
        With NVLink Switch:
            every link runs at NVLink speed — no inter-node bottleneck.

        Ring allreduce moves 2 * (N-1)/N * data_size over the slowest link.
        """
        N = self.total_gpus
        ring_factor = 2 * (N - 1) / N
        # InfiniBand path: effective bandwidth is the IB link rate.
        ib_time_ms = ring_factor * data_size_gb / self.ib_bw_gb_s * 1000
        # NVLink Switch path: uniform bandwidth on every hop.
        nvswitch_time_ms = ring_factor * data_size_gb / self.inter_node_bw_gb_s * 1000
        return {
            "ib_allreduce_ms": ib_time_ms,
            "nvswitch_allreduce_ms": nvswitch_time_ms,
            "speedup": ib_time_ms / nvswitch_time_ms,
            "total_gpus": N,
            "data_size_gb": data_size_gb,
        }

    def expert_parallelism_improvement(
        self,
        num_experts: int,
        tokens_per_batch: int,
        expert_param_gb: float,
        top_k: int = 8,
        hidden_dim: int = 4096,
    ) -> dict:
        """
        Estimate MoE all-to-all speedup from NVLink Switch.

        Each GPU routes its tokens to top-k experts spread across all
        GPUs. With InfiniBand the inter-node all-to-all is the
        bottleneck; with NVLink Switch all-to-all bandwidth is uniform.

        Args:
            num_experts: total expert count (currently unused by this
                volume model; reserved for capacity-aware estimates).
            tokens_per_batch: tokens in the global batch.
            expert_param_gb: per-expert parameter size in GB (currently
                unused; reserved for weight-movement modeling).
            top_k: experts each token routes to (default 8, DeepSeek-V3).
            hidden_dim: model hidden size; activations assumed FP16 (2 bytes).
        """
        tokens_per_gpu = tokens_per_batch / self.total_gpus
        # Per-GPU all-to-all payload: tokens x top_k x hidden_dim x 2 bytes.
        data_per_gpu_gb = tokens_per_gpu * top_k * hidden_dim * 2 / 1e9
        peers = self.total_gpus - 1
        ib_time_ms = data_per_gpu_gb * peers / self.ib_bw_gb_s * 1000
        nv_time_ms = data_per_gpu_gb * peers / self.inter_node_bw_gb_s * 1000
        return {
            "ib_all_to_all_ms": ib_time_ms,
            "nvswitch_all_to_all_ms": nv_time_ms,
            "speedup": ib_time_ms / nv_time_ms,
            "impact": "Critical for MoE models across nodes"
        }
NVLink Switch vs InfiniBand (32 GPUs, 4 Nodes)
| Operation | InfiniBand 400G | NVLink Switch | Speedup |
|---|---|---|---|
| AllReduce 1GB | 40 ms | 2.2 ms | 18x |
| AllReduce 100MB | 4 ms | 0.22 ms | 18x |
| All-to-All (MoE) | 25 ms | 1.4 ms | 18x |
| Point-to-Point 1GB | 20 ms | 1.1 ms | 18x |
NVLink Switch provides 18x the bandwidth of InfiniBand 400G for inter-node communication. This is transformative for MoE models where all-to-all expert routing currently bottlenecks on inter-node bandwidth. DeepSeek-V3 on NVLink Switch fabric could use full expert parallelism across 32+ GPUs without communication overhead dominating compute.
Disaggregated Prefill-Decode Architecture
Dynamo’s 2026 architecture fully separates prefill and decode onto different GPU pools optimized for each phase.
class DisaggregatedArchitecture:
    """
    Disaggregated prefill-decode serving architecture.

    Prefill is compute-bound (matrix-multiply heavy) while decode is
    memory-bandwidth-bound (KV cache reads). Colocating both phases on
    one GPU is suboptimal for each; this model assigns a specialized
    GPU pool to each phase.
    """

    def __init__(self):
        # Compute-optimized pool handles prompt processing only.
        self.prefill_pool = {
            "gpu_type": "H100 (compute-optimized)",
            "optimization": "Large batch prefill, high SM utilization",
            "kv_cache_strategy": "Generate KV, transfer to decode pool",
        }
        # Memory-optimized pool handles token generation only.
        self.decode_pool = {
            "gpu_type": "H200 or CXL-extended (memory-optimized)",
            "optimization": "Maximum KV cache capacity, bandwidth-optimized",
            "kv_cache_strategy": "Receive KV from prefill, serve decode",
        }

    def kv_transfer_analysis(
        self,
        seq_len: int,
        num_layers: int,
        kv_heads: int,
        head_dim: int,
        transfer_bandwidth_gb_s: float
    ) -> dict:
        """
        Estimate the cost of shipping one sequence's KV cache from the
        prefill pool to the decode pool.
        """
        # K and V planes, FP16 (2 bytes) per element.
        per_token_bytes = 2 * num_layers * kv_heads * head_dim * 2
        size_gb = seq_len * per_token_bytes / 1e9
        ship_ms = size_gb / transfer_bandwidth_gb_s * 1000
        return {
            "kv_size_gb": size_gb,
            "transfer_time_ms": ship_ms,
            "transfer_bandwidth": f"{transfer_bandwidth_gb_s} GB/s",
            # Amortized over ~100 output tokens per request.
            "amortized_per_output_token_ms": ship_ms / 100,
        }

    def utilization_improvement(self) -> dict:
        """Quantify the GPU utilization gain from disaggregation."""
        colocated = {
            "prefill_utilization": 85,  # high while prefilling
            "decode_utilization": 30,   # memory-bound decode leaves SMs idle
            "average_utilization": 45,  # weighted average over both phases
            "problem": "During decode, compute units are ~70% idle"
        }
        disaggregated = {
            "prefill_pool_utilization": 80,  # pool is always prefilling
            "decode_pool_utilization": 70,   # memory bandwidth saturated
            "average_utilization": 75,
            "improvement": "66% higher average utilization"
        }
        return {"colocated": colocated, "disaggregated": disaggregated}

    def cost_comparison(
        self,
        requests_per_second: float,
        avg_input_tokens: int,
        avg_output_tokens: int
    ) -> dict:
        """
        Compare hourly cost of colocated vs disaggregated serving.

        Throughput assumptions: 5 rps per colocated H100, 15 rps per
        prefill-only H100, 8 rps per decode-only H200; $3/hr for H100
        and $4/hr for H200. avg_input_tokens / avg_output_tokens are
        accepted for interface compatibility but not used by this
        simple throughput model.
        """
        # Colocated: each GPU handles both phases (~45% avg utilization).
        colocated_gpus_needed = requests_per_second / 5
        colocated_cost = colocated_gpus_needed * 3.00
        # Disaggregated: specialized pools, each sized independently.
        prefill_gpus = requests_per_second / 15
        decode_gpus = requests_per_second / 8
        disagg_cost = prefill_gpus * 3.00 + decode_gpus * 4.00
        return {
            "colocated_gpus": colocated_gpus_needed,
            "colocated_cost_per_hour": colocated_cost,
            "disagg_prefill_gpus": prefill_gpus,
            "disagg_decode_gpus": decode_gpus,
            "disagg_cost_per_hour": disagg_cost,
            "cost_savings_pct": (1 - disagg_cost / colocated_cost) * 100
        }
Colocated vs Disaggregated Serving (100 rps, Llama 70B)
| Metric | Colocated | Disaggregated | Improvement |
|---|---|---|---|
| Total GPUs | 20 | 7 prefill + 13 decode | 0 (same) |
| Avg GPU Utilization | 45% | 75% | +67% |
| Hourly Cost | $60.00 | $73.00 | +22% cost (mixed GPU fleet, offset by throughput) |
| Cost/1M tokens | $1.67 | $1.01 | -39% |
| P99 TTFT | 800 ms | 400 ms | -50% |
GPU Utilization: Colocated vs Disaggregated
KV Cache Transfer Protocol
The critical new component in disaggregated serving: transferring KV cache from prefill to decode GPUs.
class KVTransferProtocol:
    """
    Protocol model for moving KV cache between GPU pools.

    Transfers run over RDMA via NVLink Switch, InfiniBand, or RoCE.
    """

    def __init__(self, transport: str = "nvlink_switch"):
        # Known transports: peak bandwidth (GB/s) and setup latency (us).
        self.transports = {
            "nvlink_switch": {"bandwidth_gb_s": 900, "latency_us": 1},
            "infiniband_400g": {"bandwidth_gb_s": 50, "latency_us": 5},
            "rdma_roce": {"bandwidth_gb_s": 25, "latency_us": 10},
        }
        # Raises KeyError for an unknown transport name (dict semantics).
        self.transport = self.transports[transport]

    def transfer_kv_cache(
        self,
        kv_size_bytes: int,
        src_gpu: int,
        dst_gpu: int,
        pipeline_chunks: int = 4
    ) -> dict:
        """
        Model a chunked, pipelined KV cache transfer.

        Instead of shipping the whole cache before decode starts, the
        cache is split into *pipeline_chunks* pieces so decode can begin
        once the first chunk lands; later chunks overlap with compute.
        src_gpu / dst_gpu identify the endpoints but do not affect the
        time estimate.
        """
        bw = self.transport["bandwidth_gb_s"]
        total_gb = kv_size_bytes / 1e9
        chunk_gb = total_gb / pipeline_chunks
        # Wire time for one chunk plus the one-time RDMA setup latency.
        per_chunk_ms = chunk_gb / bw * 1000
        setup_ms = self.transport["latency_us"] / 1000
        # Baseline: one serial transfer of the entire cache.
        serial_ms = total_gb / bw * 1000
        # Pipelined: decode starts after setup + the first chunk arrives.
        first_chunk_ms = per_chunk_ms + setup_ms
        # Remaining chunks are ~70% hidden behind decode compute.
        tail_ms = (pipeline_chunks - 1) * per_chunk_ms * 0.3
        return {
            "serial_transfer_ms": serial_ms,
            "pipelined_first_chunk_ms": first_chunk_ms,
            "total_with_pipeline_ms": first_chunk_ms + tail_ms,
            "bandwidth_used": f"{bw} GB/s",
            "chunks": pipeline_chunks,
        }
# Compare transports for a 4K-context Llama 70B KV cache.
# Size: 4096 tokens x 80 layers x (K+V) x 8 KV heads x 128 head_dim x 2 bytes (FP16).
kv_size = 4096 * 80 * 2 * 8 * 128 * 2
for transport in ("nvlink_switch", "infiniband_400g", "rdma_roce"):
    proto = KVTransferProtocol(transport)
    result = proto.transfer_kv_cache(kv_size, 0, 1)
    print(
        f"{transport}: serial={result['serial_transfer_ms']:.2f}ms, "
        f"pipelined={result['pipelined_first_chunk_ms']:.2f}ms"
    )
KV Cache Transfer Time (4K Context, 70B Model)
| Transport | Bandwidth | Serial Transfer | Pipelined (First Chunk) |
|---|---|---|---|
| NVLink Switch | 900 GB/s | 1.49 ms | 0.37 ms |
| InfiniBand 400G | 50 GB/s | 26.84 ms | 6.72 ms |
| RoCE v2 | 25 GB/s | 53.69 ms | 13.43 ms |
Smart Routing with Workload Classification
Future Dynamo routing classifies requests to direct them to the optimal GPU pool.
class WorkloadClassifierRouter:
"""
Classifies incoming requests and routes to optimal pool.
"""
def classify_request(self, request: dict) -> dict:
"""
Classify request by compute profile.
Short input + long output: decode-heavy (memory pool)
Long input + short output: prefill-heavy (compute pool)
Long input + long output: both phases significant
"""
input_tokens = request.get("estimated_input_tokens", 100)
max_output_tokens = request.get("max_tokens", 1024)
prefill_ratio = input_tokens / (input_tokens + max_output_tokens)
if prefill_ratio > 0.7:
profile = "prefill_heavy"
optimal_pool = "compute_optimized"
elif prefill_ratio < 0.3:
profile = "decode_heavy"
optimal_pool = "memory_optimized"
else:
profile = "balanced"
optimal_pool = "general"
return {
"profile": profile,
"optimal_pool": optimal_pool,
"prefill_ratio": prefill_ratio,
"estimated_prefill_ms": input_tokens * 0.5,
"estimated_decode_ms": max_output_tokens * 25,
}
def adaptive_pool_sizing(self, traffic_history: list) -> dict:
"""
Dynamically resize prefill and decode pools based on traffic.
"""
prefill_heavy_pct = sum(
1 for r in traffic_history if r["profile"] == "prefill_heavy"
) / len(traffic_history) * 100
decode_heavy_pct = sum(
1 for r in traffic_history if r["profile"] == "decode_heavy"
) / len(traffic_history) * 100
balanced_pct = 100 - prefill_heavy_pct - decode_heavy_pct
return {
"prefill_heavy_pct": prefill_heavy_pct,
"decode_heavy_pct": decode_heavy_pct,
"balanced_pct": balanced_pct,
"recommended_prefill_pool_fraction": prefill_heavy_pct / 100 * 0.4 + 0.3,
"recommended_decode_pool_fraction": decode_heavy_pct / 100 * 0.4 + 0.3,
}
Speculative Execution with CXL-Backed KV Cache
CXL memory enables speculative execution patterns that are impossible with HBM-only serving.
def speculative_kv_with_cxl() -> dict:
    """
    Describe speculative KV caching backed by CXL memory.

    Idea: pre-compute KV cache for likely follow-up queries, park the
    speculative entries in cheap, capacious CXL memory, and promote
    them to HBM on a cache hit.
    """
    concept = (
        "Pre-compute and cache KV for common system prompts, "
        "frequent user prefixes, and predicted follow-ups in CXL memory. "
        "On cache hit, promote to HBM for fast decode."
    )
    economics = {
        "hbm_kv_cost": "10x base",
        "cxl_kv_cost": "1.5x base",
        "hit_rate_needed_for_breakeven": "15%",
        "typical_hit_rate_chatbot": "40-70%",
        "typical_hit_rate_code": "20-40%",
    }
    latency = {
        "cxl_to_hbm_promote_ms": 0.5,  # promoting one 4KB block
        "vs_full_prefill_ms": 50,      # recomputing a 500-token prefill
        "speedup_on_hit": "100x TTFT improvement",
    }
    capacity = {
        "hbm_only_cached_prefixes": 100,       # within an 80GB HBM budget
        "cxl_extended_cached_prefixes": 5000,  # within 512GB of CXL
        "50x_more_cached_prefixes": True,
    }
    return {
        "concept": concept,
        "economics": economics,
        "latency": latency,
        "capacity": capacity,
    }
CXL-backed speculative KV caching can store 50x more prefix caches than HBM alone. For chatbot workloads with 40-70% prefix cache hit rates, this translates to 100x TTFT improvement on cache hits (0.5ms promotion vs 50ms prefill). The cost of CXL memory is 6.7x lower per GB than HBM, making this economically viable.
Timeline and Hardware Dependencies
def roadmap_timeline() -> dict:
    """
    Dynamo feature roadmap keyed by timeframe.

    Each entry lists planned features, the hardware they depend on,
    and a maturity status.
    """
    def _phase(features: list, hardware: str, status: str) -> dict:
        # Uniform shape for every roadmap entry.
        return {"features": features, "hardware": hardware, "status": status}

    return {
        "2025_h2": _phase(
            [
                "Disaggregated prefill-decode (InfiniBand)",
                "Basic CXL KV offloading (CXL 2.0)",
                "Multi-model serving (router + workers)",
            ],
            "H100/H200 + InfiniBand HDR/NDR",
            "In development / early preview",
        ),
        "2026_h1": _phase(
            [
                "NVLink Switch multi-node fabric",
                "CXL 3.0 KV cache tiering",
                "Workload-classified routing",
                "Speculative KV caching with CXL",
            ],
            "B200/B300 + NVLink Switch + CXL 3.0 memory expanders",
            "Planned",
        ),
        "2026_h2": _phase(
            [
                "Fully disaggregated architecture (prefill + decode + KV store)",
                "Cross-datacenter KV cache replication",
                "Heterogeneous GPU pool management (mix H100/H200/B200)",
            ],
            "Mixed GPU fleet + CXL 3.0 + NVLink Switch",
            "Roadmap",
        ),
        "2027": _phase(
            [
                "CXL memory pooling (shared CXL fabric)",
                "Compute-storage disaggregation (KV cache as service)",
                "Autonomous workload optimization (ML-driven routing)",
            ],
            "CXL 3.1 fabric + next-gen NVLink",
            "Vision",
        ),
    }
Dynamo Roadmap Timeline
| Timeframe | Key Feature | Hardware Dependency | Impact |
|---|---|---|---|
| 2025 H2 | Disaggregated prefill-decode | H100 + InfiniBand | 30-40% cost reduction |
| 2026 H1 | NVLink Switch multi-node | B200 + NVLink Switch | 18x inter-node bandwidth |
| 2026 H1 | CXL 3.0 KV tiering | CXL 3.0 expanders | 50x more cached prefixes |
| 2026 H2 | Fully disaggregated | Mixed fleet + CXL | 2x overall efficiency |
| 2027 | KV cache as a service | CXL 3.1 fabric | Stateless GPU serving |
Projected Cost per 1M Tokens (70B Model) Over Time
The trajectory is clear: each architectural improvement roughly halves cost per token. CXL makes KV cache economically abundant, NVLink Switch eliminates the inter-node communication bottleneck, and disaggregated serving matches GPU types to workload phases. By 2027, stateless GPU serving with KV cache as a separate service will fundamentally change how LLM inference infrastructure is designed — GPUs become pure compute engines, and memory becomes a separately managed, tiered resource.