Shared GPU clusters serve multiple tenants on the same hardware. Each tenant expects data privacy: their prompts, KV cache entries, model adapters, and outputs must not leak to other tenants. NVIDIA Dynamo implements multi-tenant isolation at the request routing layer, the KV cache manager, the scheduling engine, and the metrics pipeline. This post covers each isolation mechanism, the performance overhead of isolation, and the architectural trade-offs between strict isolation and resource efficiency.
Tenant Model: Namespaces and Identity
Dynamo organizes tenants into namespaces, each with its own resource quotas, model configurations, and access credentials.
from dataclasses import dataclass, field
import hashlib
import secrets
@dataclass
class TenantNamespace:
    """
    A tenant namespace in Dynamo.

    Each namespace is an isolation boundary: routing, KV-cache quotas,
    scheduling priority, and network settings are all keyed off it.
    """
    tenant_id: str                         # Stable unique tenant identifier
    namespace: str                         # Isolation-boundary name (e.g. "ns-<tenant_id>")
    api_key_hash: str # SHA-256 of API key (plaintext key is never stored)
    model_id: str # Which model this tenant uses
    max_concurrent_requests: int = 64      # Hard cap on in-flight requests
    max_tokens_per_minute: int = 100_000   # Per-minute token-rate budget
    max_kv_cache_gb: float = 10.0          # Per-tenant KV cache quota (GB)
    priority: int = 1 # 1=highest, 5=lowest
    lora_adapter_id: str = "" # Optional per-tenant LoRA
    allowed_gpu_pools: list = field(default_factory=list)  # Pool ids this tenant may use; empty = shared pool
    # Isolation settings
    dedicated_gpu_pool: bool = False # True = no GPU sharing
    encrypt_kv_cache: bool = False # True = AES-256 encrypted KV
def create_tenant(tenant_id: str, model_id: str, tier: str) -> "TenantNamespace":
    """
    Create a new tenant with tier-appropriate settings.

    Args:
        tenant_id: Stable unique identifier for the tenant.
        model_id: Model this tenant is served from.
        tier: One of "free", "standard", "enterprise".

    Returns:
        A TenantNamespace configured with the tier's quotas.

    Raises:
        ValueError: If *tier* is not a recognized tier name (previously a
            bare KeyError with no context).
    """
    tier_configs = {
        "free": {
            "max_concurrent_requests": 4,
            "max_tokens_per_minute": 10_000,
            "max_kv_cache_gb": 1.0,
            "priority": 5,
            "dedicated_gpu_pool": False,
            "encrypt_kv_cache": False,
        },
        "standard": {
            "max_concurrent_requests": 64,
            "max_tokens_per_minute": 100_000,
            "max_kv_cache_gb": 10.0,
            "priority": 3,
            "dedicated_gpu_pool": False,
            "encrypt_kv_cache": False,
        },
        "enterprise": {
            "max_concurrent_requests": 256,
            "max_tokens_per_minute": 1_000_000,
            "max_kv_cache_gb": 80.0,
            "priority": 1,
            "dedicated_gpu_pool": True,
            "encrypt_kv_cache": True,
        },
    }
    if tier not in tier_configs:
        raise ValueError(
            f"Unknown tier {tier!r}; expected one of {sorted(tier_configs)}"
        )
    # NOTE(review): the plaintext api_key is generated here but never returned
    # or stored — only its hash survives, making the key irrecoverable. The
    # caller presumably needs the key delivered out-of-band; confirm and plumb
    # it through if so.
    api_key = secrets.token_hex(32)
    api_key_hash = hashlib.sha256(api_key.encode()).hexdigest()
    config = tier_configs[tier]
    return TenantNamespace(
        tenant_id=tenant_id,
        namespace=f"ns-{tenant_id}",
        api_key_hash=api_key_hash,
        model_id=model_id,
        **config
    )
Tenant Tier Configuration
| Feature | Free | Standard | Enterprise |
|---|---|---|---|
| Concurrent Requests | 4 | 64 | 256 |
| Tokens/Minute | 10K | 100K | 1M |
| KV Cache Quota | 1 GB | 10 GB | 80 GB |
| Dedicated GPU Pool | No | No | Yes |
| Encrypted KV Cache | No | No | Yes |
| Priority | 5 (lowest) | 3 | 1 (highest) |
Request Routing with Tenant Affinity
The router must direct requests to appropriate GPU workers while maintaining tenant isolation.
class TenantAwareRouter:
    """
    Routes requests to GPU workers with tenant isolation.

    Pool layout and tenant settings are supplied at construction time; the
    router additionally tracks which workers have served each tenant so
    follow-up requests can hit warm KV cache (tenant affinity).
    """

    def __init__(self, gpu_pools: dict, tenant_registry: dict):
        self.gpu_pools = gpu_pools              # pool_id -> list of worker_ids
        self.tenant_registry = tenant_registry  # tenant_id -> TenantNamespace
        self.tenant_worker_map = {}             # tenant_id -> set of worker_ids

    def route_request(self, request: dict) -> dict:
        """
        Route a request to the appropriate worker.

        Isolation rules:
          1. Enterprise tenants: route to dedicated pool only
          2. Standard tenants: route to shared pool, track worker assignment
          3. Free tenants: route to shared pool, lowest priority

        Returns a routing descriptor on success, or {"error": ...} when the
        tenant is unknown or no eligible worker exists.
        """
        tenant_id = request["tenant_id"]
        tenant = self.tenant_registry.get(tenant_id)
        if tenant is None:
            # Fix: previously raised an uncaught KeyError for unknown tenants.
            return {"error": f"Unknown tenant {tenant_id}"}
        # Step 1: Determine eligible workers
        if tenant.dedicated_gpu_pool:
            eligible_workers = self.gpu_pools.get(
                f"dedicated-{tenant_id}", []
            )
            if not eligible_workers:
                return {"error": "No dedicated workers available"}
        elif tenant.allowed_gpu_pools:
            eligible_workers = []
            for pool_id in tenant.allowed_gpu_pools:
                eligible_workers.extend(self.gpu_pools.get(pool_id, []))
        else:
            eligible_workers = self.gpu_pools.get("shared", [])
        if not eligible_workers:
            # Fix: previously fell through, "routed" to worker_id "" and
            # recorded the empty id in the affinity map.
            return {"error": "No eligible workers available"}
        # Step 2: Prefer workers that already hold this tenant's KV cache
        # (tenant affinity for prefix-cache reuse — optimization, not a
        # security boundary).
        affinity_workers = self.tenant_worker_map.get(tenant_id, set())
        preferred = [w for w in eligible_workers if w in affinity_workers]
        selected = self._select_by_capacity(
            preferred or eligible_workers, tenant.priority
        )
        if not selected:
            return {"error": "No eligible workers available"}
        # Step 3: Track the tenant -> worker assignment for future affinity.
        self.tenant_worker_map.setdefault(tenant_id, set()).add(selected)
        return {
            "worker_id": selected,
            "tenant_id": tenant_id,
            "namespace": tenant.namespace,
            "priority": tenant.priority,
            "kv_cache_quota_gb": tenant.max_kv_cache_gb,
        }

    def _select_by_capacity(self, workers: list, priority: int) -> str:
        """Select worker with most available capacity at given priority."""
        # In practice: query each worker's load via gRPC health check.
        # Here simplified to first available.
        return workers[0] if workers else ""
Tenant affinity routing is a performance optimization, not a security boundary. When a worker already has tenant A’s KV cache prefix cached, routing subsequent tenant A requests to the same worker avoids redundant prefill computation. This can reduce TTFT by 50-80% for repeated system prompts.
KV Cache Isolation
The KV cache is the primary data privacy concern: it contains the full content of prompts and generated text in compressed form.
class SecurityError(RuntimeError):
    """Raised when a tenant touches a KV block it does not own.

    Fix: the original code raised `SecurityError` without ever defining it,
    so ownership violations surfaced as NameError instead.
    """


class IsolatedKVCacheManager:
    """
    KV cache manager with per-tenant isolation.

    Each tenant's KV cache blocks are tracked separately, quota-enforced on
    allocation, and zeroed on release so residual data never leaks across
    tenants.
    """

    def __init__(self, total_gpu_memory_gb: float, block_size_tokens: int = 16):
        self.block_size = block_size_tokens
        # Bytes per block = tokens * 2 (K and V) * 80 layers * 128 head_dim
        # * 2 bytes (FP16). Model geometry is hard-coded for a Llama-70B-like
        # stack.
        self.total_blocks = int(
            (total_gpu_memory_gb * 1e9) / (block_size_tokens * 2 * 80 * 128 * 2)
        )
        # Per-tenant block tracking
        self.tenant_blocks = {}   # tenant_id -> set of block_ids
        self.tenant_quotas = {}   # tenant_id -> max_blocks
        self.free_blocks = set(range(self.total_blocks))
        # Block metadata: which tenant owns each block
        self.block_owner = {}     # block_id -> tenant_id

    def allocate_blocks(self, tenant_id: str, num_blocks: int) -> list:
        """
        Allocate KV cache blocks for a specific tenant, enforcing its quota.

        Raises:
            MemoryError: If the tenant quota would be exceeded, or if free
                memory is still insufficient after best-effort eviction.
        """
        # Check quota (tenants without an explicit quota get the whole pool).
        current_usage = len(self.tenant_blocks.get(tenant_id, set()))
        max_allowed = self.tenant_quotas.get(tenant_id, self.total_blocks)
        if current_usage + num_blocks > max_allowed:
            raise MemoryError(
                f"Tenant {tenant_id} quota exceeded: "
                f"{current_usage + num_blocks} > {max_allowed} blocks"
            )
        if len(self.free_blocks) < num_blocks:
            # Evict blocks from lowest-priority tenants (best-effort).
            self._evict_lowest_priority(num_blocks - len(self.free_blocks))
            # Fix: without this re-check, the pop() loop below raised a bare
            # KeyError mid-allocation and left bookkeeping half-updated.
            if len(self.free_blocks) < num_blocks:
                raise MemoryError(
                    f"Insufficient free KV blocks for tenant {tenant_id}: "
                    f"need {num_blocks}, have {len(self.free_blocks)}"
                )
        owned = self.tenant_blocks.setdefault(tenant_id, set())
        allocated = []
        for _ in range(num_blocks):
            block_id = self.free_blocks.pop()
            allocated.append(block_id)
            self.block_owner[block_id] = tenant_id
            owned.add(block_id)
        return allocated

    def free_blocks_for_request(self, tenant_id: str, block_ids: list):
        """
        Free blocks when a request completes.

        CRITICAL: memory is zeroed before a block returns to the pool.

        Raises:
            SecurityError: If any block is not owned by *tenant_id*.
        """
        for block_id in block_ids:
            # Verify ownership before releasing anything.
            if self.block_owner.get(block_id) != tenant_id:
                raise SecurityError(
                    f"Block {block_id} not owned by tenant {tenant_id}"
                )
            # ZERO the GPU memory before returning to the free pool —
            # this prevents data leakage between tenants.
            self._zero_block_memory(block_id)
            self.tenant_blocks[tenant_id].discard(block_id)
            del self.block_owner[block_id]
            self.free_blocks.add(block_id)

    def _zero_block_memory(self, block_id: int):
        """
        Zero out GPU memory for a block — the critical isolation primitive.

        In CUDA: cudaMemsetAsync(block_ptr, 0, block_size_bytes, stream).
        Cost: ~0.01ms per block on A100.
        """
        pass  # CUDA memset in actual implementation

    def _evict_lowest_priority(self, num_blocks_needed: int):
        """Evict blocks from lowest-priority tenants (best-effort stub)."""
        pass  # Priority-based eviction

    def get_tenant_usage(self, tenant_id: str) -> dict:
        """Report KV cache usage for a tenant."""
        used = len(self.tenant_blocks.get(tenant_id, set()))
        quota = self.tenant_quotas.get(tenant_id, self.total_blocks)
        return {
            "blocks_used": used,
            "blocks_quota": quota,
            "utilization_pct": (used / quota * 100) if quota > 0 else 0,
            # Same FP16/80-layer/128-head-dim geometry as __init__.
            "memory_used_gb": used * self.block_size * 2 * 80 * 128 * 2 / 1e9,
        }
Memory zeroing is non-negotiable for multi-tenant isolation. Without zeroing, a newly allocated KV cache block may contain residual data from a previous tenant’s request. The per-token KV footprint is 2 (K and V) * 80 layers * 128 head_dim * 2 bytes = 40KB, so a 16-token block is 640KB; on an A100, zeroing one such block takes approximately 0.01ms. For a typical request allocating 256 blocks, the total zeroing overhead is about 2.5ms — a small price for data privacy.
KV Cache Zeroing Overhead per Request
Scheduling with Tenant Fairness
The scheduler must balance throughput with per-tenant fairness guarantees.
class TenantFairScheduler:
    """
    Scheduler that enforces per-tenant rate limits and fairness.

    Uses weighted fair queuing across tenants: each tenant's share of a
    batch is proportional to 1 / priority.
    """

    def __init__(self, tenant_registry: dict = None):
        # tenant_registry: optional tenant_id -> TenantNamespace map used by
        # select_next_batch() to look up priorities. Fix: _get_tenant() was an
        # unimplemented stub returning None, so select_next_batch() crashed on
        # `tenant.priority` whenever any queue was non-empty.
        self.tenant_registry = tenant_registry if tenant_registry is not None else {}
        self.tenant_queues = {}           # tenant_id -> list of pending requests
        self.tenant_tokens_used = {}      # tenant_id -> tokens used this minute
        self.tenant_active_requests = {}  # tenant_id -> count

    def enqueue(self, request: dict, tenant: "TenantNamespace") -> dict:
        """
        Add a request to its tenant queue, enforcing rate limits.

        Returns a status dict: "queued" on success, "rejected" when the
        concurrent-request cap is hit, "rate_limited" when the per-minute
        token budget would be exceeded.
        """
        tenant_id = tenant.tenant_id
        # Check concurrent request limit.
        active = self.tenant_active_requests.get(tenant_id, 0)
        if active >= tenant.max_concurrent_requests:
            return {"status": "rejected", "reason": "concurrent_limit",
                    "retry_after_ms": 100}
        # Check token rate limit.
        # NOTE(review): tenant_tokens_used is never incremented in this class;
        # presumably accounting happens on request completion elsewhere —
        # confirm, otherwise this limit never triggers.
        tokens_used = self.tenant_tokens_used.get(tenant_id, 0)
        estimated_tokens = request.get("max_tokens", 1024)
        if tokens_used + estimated_tokens > tenant.max_tokens_per_minute:
            return {"status": "rate_limited", "reason": "token_limit",
                    "retry_after_ms": 1000}
        # Enqueue; queue position is 1-based.
        self.tenant_queues.setdefault(tenant_id, []).append(request)
        return {"status": "queued", "position": len(self.tenant_queues[tenant_id])}

    def select_next_batch(self, max_batch_size: int) -> list:
        """
        Select the next batch of requests using weighted fair queuing.

        Weight = 1 / priority (higher priority = higher weight); each tenant
        receives a weight-proportional share of batch slots, minimum one.
        Tenants missing from the registry are skipped instead of crashing.
        """
        batch = []
        tenant_weights = {}
        for tenant_id, queue in self.tenant_queues.items():
            if not queue:
                continue
            tenant = self._get_tenant(tenant_id)
            if tenant is None:
                continue  # unregistered tenant: leave its requests queued
            tenant_weights[tenant_id] = 1.0 / tenant.priority
        total_weight = sum(tenant_weights.values())
        if total_weight == 0:
            return batch
        # Allocate batch slots proportional to weight, highest weight first.
        for tenant_id, weight in sorted(
            tenant_weights.items(),
            key=lambda item: -item[1]
        ):
            slots = max(1, int(max_batch_size * weight / total_weight))
            queue = self.tenant_queues[tenant_id]
            for _ in range(min(slots, len(queue))):
                if len(batch) >= max_batch_size:
                    break
                request = queue.pop(0)  # FIFO within a tenant
                request["_tenant_id"] = tenant_id
                batch.append(request)
        return batch

    def _get_tenant(self, tenant_id: str) -> "TenantNamespace":
        """Look up tenant configuration; None if unregistered."""
        return self.tenant_registry.get(tenant_id)
Batch Composition Example (max_batch=32, 3 Tenants)
| Tenant | Priority | Weight | Batch Slots | Queue Depth |
|---|---|---|---|---|
| Enterprise A | 1 | 1.0 | 16 | 50 |
| Standard B | 3 | 0.33 | 10 | 120 |
| Free C | 5 | 0.20 | 6 | 200 |
Encrypted KV Cache for Enterprise Tenants
Enterprise tenants can opt into encrypted KV cache, where all cached key-value pairs are encrypted at rest on the GPU.
import struct
class EncryptedKVCache:
    """
    AES-256-CTR encrypted KV cache (host-side model of the GPU path).

    Each tenant holds a unique 256-bit key; a monotonically increasing
    counter supplies the CTR nonce. The real cipher runs on the GPU — the
    encrypt/decrypt methods here are structural placeholders that return
    zeroed buffers of the correct length.
    """

    def __init__(self, tenant_key: bytes):
        # tenant_key must be a full 256-bit (32-byte) key.
        assert len(tenant_key) == 32
        self.key = tenant_key
        self.counter = 0  # CTR-mode nonce counter, one value per block

    def encrypt_kv_block(self, kv_data: bytes) -> tuple:
        """
        Encrypt one KV cache block; returns (ciphertext, nonce).

        In practice this runs in-place on GPU memory via a CUDA AES kernel.
        AES-256-CTR on A100 reaches ~400 GB/s, so a 40KB block encrypts in
        ~0.0001ms.
        """
        nonce = struct.pack(">Q", self.counter)
        self.counter += 1
        # Placeholder for the CUDA AES-256-CTR kernel (or cuTLS library):
        # emit a zeroed buffer of matching length.
        ciphertext = bytearray(len(kv_data))
        return bytes(ciphertext), nonce

    def decrypt_kv_block(self, encrypted_data: bytes, nonce: bytes) -> bytes:
        """
        Decrypt one KV block for attention computation.

        Must run before every attention operation that reads the block.
        """
        # Placeholder for AES-256-CTR decryption.
        plaintext = bytearray(len(encrypted_data))
        return bytes(plaintext)

    def overhead_estimate(self, num_blocks: int, block_size_bytes: int) -> dict:
        """
        Estimate encryption overhead at ~400 GB/s AES-256-CTR throughput
        (A100). Decryption repeats once per layer (80 layers) on every
        attention pass, e.g. a 2048-token sequence = 128 blocks * 80 layers
        = 10240 block decryptions.
        """
        total_bytes = num_blocks * block_size_bytes
        per_pass_ms = total_bytes / (400 * 1e9) * 1000
        layerwise_ms = per_pass_ms * 80  # one decrypt sweep per layer
        return {
            "encrypt_per_step_ms": per_pass_ms,
            "decrypt_per_attention_ms": per_pass_ms,  # symmetric cipher cost
            "total_attention_overhead_ms": layerwise_ms,
            # Rough ratio: assumes a 10ms attention budget.
            "throughput_reduction_pct": layerwise_ms / 10 * 100
        }
# Example: 2048 tokens, 40KB per block, 128 blocks
# (Demo only — the all-zero key is a placeholder; production keys are
# per-tenant and come from a key-management service.)
enc = EncryptedKVCache(b'\x00' * 32)
overhead = enc.overhead_estimate(128, 40960)
Encrypted KV Cache Performance Impact
| Metric | Unencrypted | AES-256-CTR | Overhead |
|---|---|---|---|
| KV Write (per step) | 0 ms | 0.01 ms | +0.01 ms |
| KV Read (per attention) | 0 ms | 0.01 ms | +0.01 ms |
| Full Forward Pass (80L) | 10 ms | 10.8 ms | +8% |
| E2E Latency (2K tokens) | 500 ms | 540 ms | +8% |
| Throughput | 1000 tok/s | 925 tok/s | -7.5% |
AES-256-CTR encryption on modern GPUs (A100, H100) runs at 400+ GB/s, which makes per-block encryption nearly free. The overhead comes from the fact that KV cache must be decrypted before every attention computation across all 80 layers. For a typical 70B model, encrypted KV cache adds approximately 8% to forward pass latency.
Audit Logging and Compliance
Multi-tenant systems require comprehensive audit logging for compliance (SOC 2, HIPAA, GDPR).
class TenantAuditLogger:
    """
    Audit logger for multi-tenant compliance (SOC 2 / HIPAA / GDPR).

    Only metadata is ever recorded — prompt and response content never
    enters the audit trail; an optional SHA-256 content hash stands in
    for the payload.
    """

    def log_request(self, event: dict) -> dict:
        """
        Build an audit record for one request event.
        NEVER logs prompt or response content — metadata only.
        """
        get = event.get
        record = {
            "timestamp_utc": event["timestamp"],
            "tenant_id": event["tenant_id"],
            "request_id": event["request_id"],
            "event_type": event["type"],  # "request_start", "request_end", etc.
            # Metadata only — no content
            "input_tokens": get("input_tokens", 0),
            "output_tokens": get("output_tokens", 0),
            "latency_ms": get("latency_ms", 0),
            "worker_id": get("worker_id", ""),
            "model_id": get("model_id", ""),
            # Isolation verification
            "kv_blocks_allocated": get("kv_blocks", 0),
            "kv_blocks_zeroed": get("kv_blocks_zeroed", 0),
            "encryption_enabled": get("encrypted", False),
            # Rate limit state
            "tokens_remaining_this_minute": get("tokens_remaining", 0),
            "concurrent_requests": get("concurrent", 0),
        }
        # The content hash proves what was processed without storing it.
        if "content_hash" in event:
            record["content_sha256"] = event["content_hash"]
        return record

    def log_isolation_violation(self, event: dict) -> dict:
        """
        Record an isolation-boundary violation attempt.
        These are high-severity security events; the request is rejected.
        """
        record = {"severity": "CRITICAL"}
        record["timestamp_utc"] = event["timestamp"]
        record["tenant_id"] = event["tenant_id"]
        record["violation_type"] = event["type"]
        record["details"] = event.get("details", "")
        record["action_taken"] = "request_rejected"
        return record
Network Isolation: gRPC Channel Separation
At the network layer, tenant requests flow through isolated gRPC channels.
class TenantNetworkIsolation:
    """
    Network-level isolation between tenants.
    Each tenant gets logically separate gRPC channels.
    """

    def __init__(self):
        self.tenant_channels = {}  # tenant_id -> channel config

    def configure_tenant_channel(self, tenant_id: str, tier: str) -> dict:
        """
        Build the network-isolation config for a tenant's channel.

        Any tier other than "enterprise" or "standard" falls back to the
        free-tier settings (matching the original if/elif/else chain).
        """
        tier_templates = {
            "enterprise": {
                "channel_type": "dedicated_tls",
                "mtls_enabled": True,
                "client_cert_required": True,
                "encryption": "TLS 1.3",
                "bandwidth_limit_mbps": None,  # Unlimited
                "connection_pool_size": 32,
                "keepalive_ms": 10000,
            },
            "standard": {
                "channel_type": "shared_tls",
                "mtls_enabled": False,
                "client_cert_required": False,
                "encryption": "TLS 1.3",
                "bandwidth_limit_mbps": 1000,
                "connection_pool_size": 8,
                "keepalive_ms": 30000,
            },
            "free": {
                "channel_type": "shared_tls",
                "mtls_enabled": False,
                "client_cert_required": False,
                "encryption": "TLS 1.3",
                "bandwidth_limit_mbps": 100,
                "connection_pool_size": 2,
                "keepalive_ms": 60000,
            },
        }
        template = tier_templates.get(tier, tier_templates["free"])
        # Return a fresh dict so callers cannot mutate the template.
        return dict(template)
Network Bandwidth Allocation by Tier
LoRA Adapter Isolation
Per-tenant LoRA adapters add another isolation dimension: each tenant’s fine-tuned weights must be kept separate.
class LoRAAdapterManager:
    """
    Manage per-tenant LoRA adapters with isolation.

    Each tenant's fine-tuned weights are tracked separately; batched
    multi-adapter inference (S-LoRA / Punica style) applies the correct
    adapter to each request in a batch.
    """

    def __init__(self, base_model_params: int):
        self.base_params = base_model_params
        self.loaded_adapters = {}   # tenant_id -> adapter source (path/handle)
        self.adapter_metadata = {}  # tenant_id -> metadata

    def load_adapter(self, tenant_id: str, adapter_path: str,
                     rank: int, alpha: float,
                     hidden_dim: int = 8192, num_layers: int = 80) -> dict:
        """
        Register a tenant-specific LoRA adapter.

        Adapter weights live in tenant-isolated GPU memory in the real
        implementation; this records sizing metadata.

        Args:
            tenant_id: Owning tenant.
            adapter_path: Where the adapter weights live.
            rank: LoRA rank (r).
            alpha: LoRA scaling factor.
            hidden_dim: Model hidden dimension. Generalized from the
                previously hard-coded 8192 (Llama-70B); default unchanged.
            num_layers: Transformer layer count. Generalized from the
                previously hard-coded 80; default unchanged.

        Returns:
            Size/overhead summary for the adapter.
        """
        # LoRA params per layer: A and B matrices (2 * rank * d) for each of
        # the 4 attention projections -> 4 * 2 * rank * d per layer.
        adapter_params = 4 * 2 * rank * hidden_dim * num_layers
        adapter_memory_gb = adapter_params * 2 / 1e9  # FP16: 2 bytes/param
        # Fix: loaded_adapters was declared but never written; record the
        # source so the registry reflects what is resident.
        self.loaded_adapters[tenant_id] = adapter_path
        self.adapter_metadata[tenant_id] = {
            "rank": rank,
            "alpha": alpha,
            "params": adapter_params,
            "memory_gb": adapter_memory_gb,
            "loaded": True,
        }
        return {
            "tenant_id": tenant_id,
            "adapter_params": adapter_params,
            "adapter_memory_gb": adapter_memory_gb,
            "base_model_params": self.base_params,
            "overhead_pct": adapter_memory_gb / (self.base_params * 2 / 1e9) * 100,
        }

    def forward_with_adapter(self, tenant_id: str, hidden_states: list,
                             layer_idx: int) -> list:
        """
        Apply the tenant's LoRA during the forward pass:
            h' = h + (alpha/rank) * B @ A @ h

        CRITICAL: only the owning tenant's adapter may be applied. In
        batched inference, different requests in the same batch may use
        different adapters (S-LoRA / Punica custom CUDA gather kernels).
        """
        pass

    def batched_multi_adapter_overhead(self, num_tenants_in_batch: int,
                                       rank: int,
                                       num_layers: int = 80) -> dict:
        """
        Estimate the overhead of serving multiple adapters in one batch.

        S-LoRA-style overhead is roughly proportional to the number of
        unique adapters: the custom gather kernel costs ~0.1ms per adapter
        per layer. `num_layers` generalizes the previously hard-coded 80.
        """
        overhead_per_layer_ms = 0.1 * num_tenants_in_batch
        total_overhead_ms = overhead_per_layer_ms * num_layers
        return {
            "per_layer_overhead_ms": overhead_per_layer_ms,
            "total_overhead_ms": total_overhead_ms,
            # Rough ratio: assumes a 10ms forward-pass budget.
            "pct_of_forward_pass": total_overhead_ms / 10 * 100,
            "recommendation": "Batch requests by adapter when possible"
        }
Multi-Adapter Serving Overhead (70B Model, Rank 16)
| Adapters in Batch | Per-Layer Overhead | Total Overhead | Throughput Impact |
|---|---|---|---|
| 1 (single tenant) | 0.1 ms | 8 ms | -3% |
| 4 tenants | 0.4 ms | 32 ms | -12% |
| 8 tenants | 0.8 ms | 64 ms | -22% |
| 16 tenants | 1.6 ms | 128 ms | -38% |
Multi-adapter batching has significant overhead when many unique adapters are active in the same batch. Dynamo mitigates this by grouping requests by adapter in the scheduler: requests using the same LoRA adapter are batched together, and adapter switches happen between batches rather than within them.
Metrics and Observability Isolation
Per-tenant metrics must be tracked separately without cross-contamination.
class TenantMetricsCollector:
    """
    Per-tenant metrics collection with isolation.

    Tenants can read only their own metrics; operators see aggregates and
    per-tenant request counts, never request content.
    """

    def __init__(self):
        self.metrics = {}  # tenant_id -> metrics dict

    def record_request(self, tenant_id: str, request_metrics: dict):
        """Accumulate counters for one completed request of a tenant."""
        stats = self.metrics.setdefault(tenant_id, {
            "total_requests": 0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "total_latency_ms": 0,
            "p50_ttft_ms": [],
            "p99_ttft_ms": [],
            "rejected_requests": 0,
            "rate_limited_requests": 0,
            "kv_cache_peak_gb": 0,
        })
        stats["total_requests"] += 1
        for dest, src in (("total_input_tokens", "input_tokens"),
                          ("total_output_tokens", "output_tokens"),
                          ("total_latency_ms", "latency_ms")):
            stats[dest] += request_metrics.get(src, 0)

    def get_tenant_dashboard(self, tenant_id: str) -> dict:
        """
        Return the metrics visible to one tenant.
        NEVER includes other tenants' data.
        """
        stats = self.metrics.get(tenant_id, {})
        if not stats:
            return {"error": "No metrics available"}
        served = stats["total_requests"]
        avg_latency = stats["total_latency_ms"] / max(served, 1)
        return {
            "tenant_id": tenant_id,
            "requests_served": served,
            "tokens_processed": stats["total_input_tokens"] + stats["total_output_tokens"],
            "avg_latency_ms": avg_latency,
            "rejected_requests": stats["rejected_requests"],
            "rate_limited_requests": stats["rate_limited_requests"],
            # NO cross-tenant data exposed
        }

    def get_operator_dashboard(self) -> dict:
        """
        Operator view: aggregates across all tenants.
        Individual tenant content/prompts are never visible here.
        """
        total_requests = 0
        total_tokens = 0
        per_tenant = {}
        for tid, stats in self.metrics.items():
            total_requests += stats["total_requests"]
            total_tokens += stats["total_input_tokens"] + stats["total_output_tokens"]
            per_tenant[tid] = {"requests": stats["total_requests"]}
        return {
            "total_tenants": len(self.metrics),
            "total_requests": total_requests,
            "total_tokens": total_tokens,
            "per_tenant_summary": per_tenant,
        }
Performance Summary: Isolation Overhead
def total_isolation_overhead() -> dict:
    """
    Summarize every isolation mechanism and its per-request cost.

    Returns a dict mapping mechanism name to {latency_ms, where,
    can_be_async}, plus per-tier totals.
    """
    def mechanism(latency_ms: float, where: str, can_be_async: bool) -> dict:
        # Small helper keeps every mechanism entry structurally identical.
        return {
            "latency_ms": latency_ms,
            "where": where,
            "can_be_async": can_be_async,
        }

    summary = {
        # Per request (256 blocks) on deallocation.
        "kv_cache_zeroing": mechanism(2.5, "On block deallocation", True),
        # Per forward pass; enterprise tier only.
        "kv_cache_encryption": mechanism(4.0, "Every attention computation", False),
        "tenant_routing": mechanism(0.1, "Request ingress", False),
        "rate_limiting": mechanism(0.05, "Request ingress", False),
        # Per forward pass, when multiple adapters share a batch.
        "lora_adapter_switch": mechanism(8.0, "Forward pass", False),
        "audit_logging": mechanism(0.5, "Request completion", True),
    }
    summary["total_standard_tier"] = {
        "overhead_ms": 3.15,
        "overhead_pct": "0.5-3% (depends on request length)",
    }
    summary["total_enterprise_tier"] = {
        "overhead_ms": 15.15,
        "overhead_pct": "2-8% (depends on request length)",
    }
    return summary
Isolation Overhead by Component (ms per request)
Multi-tenant isolation in Dynamo is a layered defense: namespace separation at the API layer, KV cache isolation at the memory layer, tenant-aware scheduling at the compute layer, and audit logging at the compliance layer. The total overhead ranges from 0.5% (standard tier, long requests) to 8% (enterprise tier with encryption, short requests). The key design principle is that isolation primitives are always-on and cannot be bypassed by application logic — they are enforced at the infrastructure layer, not the model layer.