After three generations of dense transformers, Meta admitted defeat: Llama 4 is a MoE model. The economics forced the pivot — training a 405B dense model costs over 20x more FLOPs per token than a 400B MoE with 17B active parameters, and even Meta’s 600K H100 cluster has limits. Llama 4 Scout (109B total, 17B active) and Maverick (400B total, 17B active) prove that Meta can no longer afford to ignore MoE’s training efficiency, regardless of serving complexity.
The Llama Trajectory
From Dense to MoE
def llama_evolution():
    """Summarize the Llama family's architectural progression.

    Returns:
        dict: Maps a generation key (e.g. 'llama_3') to a dict of release
        date, model sizes, and architectural attributes.
    """
    models = {
        'llama_1': {
            'release': 'February 2023',
            'sizes': ['7B', '13B', '33B', '65B'],
            'architecture': 'Dense transformer',
            'attention': 'Multi-head attention (no GQA)',
            # Fix: the LLaMA paper uses SwiGLU (a gated SiLU FFN),
            # not plain SiLU — now consistent with the llama_3 entry.
            'activation': 'SwiGLU',
            'positional': 'RoPE',
            'training_tokens': '1.0-1.4T',
            'multimodal': False,
            'moe': False,
        },
        'llama_2': {
            'release': 'July 2023',
            'sizes': ['7B', '13B', '70B'],
            'architecture': 'Dense transformer',
            'attention': 'GQA (70B only)',
            # Fix: Llama 2 likewise uses SwiGLU, same FFN as Llama 1.
            'activation': 'SwiGLU',
            'positional': 'RoPE',
            'training_tokens': '2T',
            'multimodal': False,
            'moe': False,
        },
        'llama_3': {
            'release': 'April 2024',
            'sizes': ['8B', '70B'],
            'architecture': 'Dense transformer',
            'attention': 'GQA (all sizes)',
            'activation': 'SwiGLU',
            'positional': 'RoPE',
            'training_tokens': '15T',
            'multimodal': False,
            'moe': False,
        },
        'llama_3_1': {
            'release': 'July 2024',
            'sizes': ['8B', '70B', '405B'],
            'architecture': 'Dense transformer',
            'attention': 'GQA',
            'activation': 'SwiGLU',
            'positional': 'RoPE (extended to 128K)',
            'training_tokens': '15T+',
            'multimodal': False,
            'moe': False,
        },
        'llama_4': {
            'release': 'April 2025',
            'sizes': ['Scout (17B active)', 'Maverick (17B active)', 'Behemoth (288B active)'],
            'architecture': 'Mixture-of-Experts',
            'attention': 'GQA + architectural innovations',
            'activation': 'SwiGLU',
            'positional': 'RoPE',
            'training_tokens': 'Unknown (likely 20T+)',
            'multimodal': True,
            'moe': True,
        },
    }
    # Key shift: Llama 4 is BOTH multimodal AND MoE
    # Previous Llama models were neither
    return models
Why Meta Adopted MoE
The Efficiency Argument
def moe_efficiency_argument():
    """Lay out the dense-vs-MoE training-efficiency comparison.

    Meta has more GPUs than almost anyone (24,576 H100s in one cluster),
    yet adopted MoE anyway: the per-token compute gap is too large to ignore.

    Returns:
        dict: 'dense_405B' and 'moe_equivalent' profiles with parameter
        counts, per-token FLOPs, and estimated training cost.
    """
    dense_profile = {
        'total_params': 405e9,
        'active_params': 405e9,
        'flops_per_token': 810e9,  # ~2 * params
        'training_cost_estimated': '$100M+',
        'training_tokens': '15T',
        'kv_cache_per_token': '2048 bytes (GQA-8)',
    }
    sparse_profile = {
        'total_params': '~2T (estimated)',
        'active_params': '~50B per token',
        'flops_per_token': '~100e9',  # 8x less compute per token
        'training_cost_estimated': '$20-30M (at same quality)',
        'training_tokens': '15T+',
        'kv_cache_per_token': 'Depends on attention design',
    }
    # The math is compelling: dense 405B burns 810 GFLOP per token while an
    # MoE at 2T total / 50B active needs only ~100 GFLOP per token.
    # Same quality at ~8x less compute — even Meta cannot ignore that.
    return {
        'dense_405B': dense_profile,
        'moe_equivalent': sparse_profile,
    }
def meta_gpu_resources():
    """Describe Meta's training-infrastructure context, 2024-2025.

    Returns:
        dict: Year -> cluster facts (GPU counts, networking, storage).
    """
    infrastructure = {
        '2024': {
            'h100_count': 24576,
            'cluster': 'Grand Teton',
            'network': '400 Gbps RoCE',
            'storage': 'Tectonic distributed filesystem',
            'training_capacity': '~3.5e24 FLOPs per day',
        },
        '2025': {
            'h100_count': '~100,000+ (including B200/H200)',
            'cluster': 'Multiple clusters',
            'note': 'Even with this scale, MoE saves months of training time',
        },
    }
    # Fix: per the Llama 3 paper, the 405B model was trained on 16,384 H100s
    # (a slice of the 24K cluster) for ~54 days — not the full 24K.
    # An MoE model at same quality would take ~7-10 days
    # Meta can train MORE models per year with MoE
    return infrastructure
Meta’s adoption of MoE is not about saving money — it is about training speed. With MoE, Meta can iterate faster: train a frontier model in weeks instead of months. This lets them explore more architectural variants, data mixes, and training recipes per year. The competitive advantage shifts from “who has the most GPUs” to “who can run the most experiments.”
Llama 4 Architecture: What We Know
Scout and Maverick
class Llama4ScoutConfig:
    """Llama 4 Scout: the smaller MoE model.

    Values are based on announced and published details.
    """

    def __init__(self):
        # Base transformer dimensions.
        self.d_model = 5120
        self.num_layers = 48
        self.num_heads = 40
        self.head_dim = 128
        self.num_kv_heads = 8  # GQA: 8 KV heads serve 40 query heads
        # MoE configuration: 16 experts with the most aggressive routing —
        # a single expert per token.
        self.num_experts = 16
        self.top_k = 1
        self.expert_d_ff = 8192
        # Parameter budget: sparse, so total far exceeds active.
        self.total_params = 109e9  # 109B stored
        self.active_params = 17e9  # 17B exercised per token
        # Context window — an order of magnitude beyond prior models.
        self.max_context = 10_000_000  # 10M tokens
        # Multimodality is built in rather than adapter-based.
        self.multimodal = True
        self.modalities = ['text', 'image']
        self.vision_encoder = 'Native (not bolted-on ViT)'
class Llama4MaverickConfig:
    """Llama 4 Maverick: the larger MoE model.

    Shares Scout's base transformer and per-token compute (17B active),
    but carries 8x the expert count (128 vs 16) for more total capacity.
    """

    def __init__(self):
        # Base transformer — identical to Scout.
        self.d_model = 5120
        self.num_layers = 48
        self.num_heads = 40
        self.head_dim = 128
        self.num_kv_heads = 8
        # MoE configuration.
        # Fix: the original comment said "same expert count as Scout", but
        # Maverick has 128 experts vs Scout's 16. What the two models share
        # is the ACTIVE parameter count per token, not the expert count.
        self.num_experts = 128
        self.top_k = 1
        self.expert_d_ff = 8192
        # Total vs active parameters.
        self.total_params = 402e9  # 402B total
        self.active_params = 17e9  # 17B active (same as Scout)
        # Context.
        self.max_context = 1_000_000  # 1M token context
        # Multimodal.
        self.multimodal = True
class Llama4BehemothConfig:
    """Llama 4 Behemoth: the flagship model (still in training as of early 2025)."""

    def __init__(self):
        # Significantly larger than Scout and Maverick.
        self.total_params = 2000e9  # ~2T total (estimated)
        self.active_params = 288e9  # 288B active
        # Fix: the original comment named "Claude 3.5 Opus", a model that
        # was never released; the frontier comparison points are GPT-4o and
        # Claude 3 Opus-class models.
        self.target_quality = 'Frontier (GPT-4o+ level)'
        self.status = 'In training'
Architectural Surprises
def llama4_architectural_analysis():
    """Analyze the notable design decisions in Llama 4.

    Returns:
        dict: Decision name -> dict describing what the decision is, why it
        is surprising, and its tradeoffs or implications.
    """
    analysis = {
        'top_1_routing': {
            'what': 'Scout uses top-1 expert routing (only 1 expert per token)',
            'why_surprising': 'DeepSeek V3 uses top-8. Mixtral uses top-2. '
                              'Top-1 is the most aggressive sparsity.',
            'tradeoff': 'Maximum parameter efficiency, but each token '
                        'gets less expert capacity. Offset by more total experts.',
            # Fix: the Switch Transformer paper is from January 2021
            # (arXiv 2101.03961), not 2022.
            'precedent': 'Switch Transformer (2021) also used top-1.',
        },
        'massive_context': {
            'what': 'Scout supports 10M token context',
            'why_surprising': 'Previous max was 128K-1M for most models. '
                              '10M is an order of magnitude increase.',
            'implementation': 'Likely requires specialized attention (not full O(N^2)). '
                              'Ring attention, chunked prefill, or sparse attention.',
        },
        'native_multimodal': {
            'what': 'Image understanding built into the model natively, '
                    'not bolted on as a separate ViT adapter',
            'why_surprising': 'All previous open-source multimodal models '
                              '(LLaVA, InternVL) use a separate vision encoder.',
            'advantage': 'Tighter integration, potentially better cross-modal reasoning',
        },
        'same_active_params': {
            'what': 'Scout and Maverick have the same 17B active params '
                    'despite 109B vs 402B total params',
            'implication': 'Maverick has 128 experts vs Scout 16, '
                           'but the same per-token compute. '
                           'Quality improvement comes entirely from '
                           'more expert specialization, not more compute.',
        },
    }
    return analysis
Llama 4 Scout Early Benchmark Results
| Benchmark | Scout (17B active) | Llama 3.1 8B | Gemma 3 12B | GPT-4o-mini |
|---|---|---|---|---|
| MMLU (0-shot) | 79.6% | 73.0% | 78.5% | 82.0% |
| GPQA Diamond | 44.5% | 32.8% | 42.1% | 39.4% |
| IFEval | 87.1% | 80.4% | 83.5% | 84.8% |
| Coding (MBPP EvalPlus) | 68.0% | 63.4% | 67.2% | 72.1% |
| Active Params | 17B | 8B | 12B | ~8B (est.) |
Early Llama 4 results show competitive but not dominant performance. Scout’s 17B active parameters compete well against 8-12B dense models but do not clearly surpass them. The true value proposition lies in the larger Maverick and Behemoth models, where the MoE architecture provides much higher total knowledge capacity per inference FLOP.
Multimodal Architecture
Native vs Adapter-Based Multimodal
def multimodal_approaches():
    """Contrast the two approaches to multimodal LLMs.

    1. Adapter-based: train a text LLM, then bolt on a vision encoder.
    2. Native: train with multimodal data from the start.

    Returns:
        tuple[dict, dict]: (adapter_based, native_multimodal) descriptions.
    """
    bolted_on = {
        'examples': ['LLaVA', 'InternVL', 'Qwen-VL'],
        'architecture': (
            'Pretrained ViT -> Linear projection -> Pretrained LLM'
        ),
        'training': [
            'Step 1: Pretrain text LLM on text data',
            'Step 2: Freeze ViT and LLM, train projection layer on image-text pairs',
            'Step 3: Unfreeze LLM, fine-tune end-to-end on image-text data',
        ],
        'pros': [
            'Can leverage existing strong text LLMs',
            'Modular: swap ViT or LLM independently',
            'Cheaper: text pretraining is done once, shared across variants',
        ],
        'cons': [
            'Vision encoder is a separate component with its own distribution',
            'Projection layer is a bottleneck',
            'Limited cross-modal understanding in early layers',
        ],
    }
    end_to_end = {
        'examples': ['Llama 4', 'Gemini', 'Fuyu'],
        'architecture': (
            'Single transformer processes interleaved text and image tokens'
        ),
        'training': [
            'Step 1: Tokenize images into patches directly (no separate ViT)',
            'Step 2: Train on interleaved text + image data from the start',
            'All layers see both modalities throughout training',
        ],
        'pros': [
            'Deeper cross-modal integration',
            'No projection bottleneck',
            'MoE experts can specialize per modality naturally',
        ],
        'cons': [
            'Cannot reuse existing text-only pretraining',
            'More expensive: must train from scratch with multimodal data',
            'Image tokenization quality depends on patch size',
        ],
    }
    return bolted_on, end_to_end
def llama4_vision_processing():
    """Describe how Llama 4 processes images (based on available information).

    Returns:
        dict: 'tokenization', 'integration', and 'moe_interaction' sections.
    """
    patch = 14
    # A 336x336 image cut into 14px patches yields 24x24 = 576 tokens.
    tokenization = {
        'method': 'Patch-based image tokenization',
        'patch_size': patch,
        'image_resolution': '336x336 (resizable)',
        'tokens_per_image': (336 // patch) ** 2,
        'projection': 'Linear projection from patch features to d_model',
    }
    integration = {
        'method': 'Interleaved text and image tokens in the same sequence',
        'format': '[TEXT_TOKENS] [IMAGE_PATCH_TOKENS] [TEXT_TOKENS]',
        'attention': 'Image patches attend to text and vice versa',
    }
    moe_interaction = {
        'routing': 'Image tokens routed through same MoE layers as text',
        'specialization': 'Some experts expected to specialize for visual features',
        'shared_expert': 'Likely handles universal features across modalities',
    }
    return {
        'tokenization': tokenization,
        'integration': integration,
        'moe_interaction': moe_interaction,
    }
Training Strategy Predictions
def training_strategy_analysis():
    """Predict Llama 4's training strategy from Meta's published research
    and infrastructure capabilities.

    Returns:
        dict: Three training stages (pretraining, long-context extension,
        post-training) with predicted data, hardware, and methods.
    """
    # Stage 1: multimodal pretraining at massive token scale.
    pretraining = {
        'data_volume': '20-30T tokens (text + image)',
        'data_composition': {
            # Fractions sum to 1.0.
            'web_text': 0.50,
            'code': 0.15,
            'image_text_pairs': 0.10,
            'interleaved_multimodal': 0.05,
            'math': 0.08,
            'multilingual': 0.07,
            'books_academic': 0.05,
        },
        'hardware': '~50,000+ GPUs (H100 + next-gen)',
        'training_time': '2-3 months (estimated)',
        'precision': 'BF16 with FP8 expert computation',
    }
    # Stage 2: progressive context extension up to 1M/10M tokens.
    long_context = {
        'method': 'Progressive context extension',
        'steps': [
            '4K -> 32K (standard RoPE extension)',
            '32K -> 256K (NTK-aware RoPE scaling)',
            '256K -> 1M/10M (ring attention + chunked prefill)',
        ],
        'data': 'Long documents, books, codebases',
    }
    # Stage 3: instruction tuning and alignment.
    post_training = {
        'sft': 'Instruction following + chat + multimodal QA',
        'alignment': 'RLHF or DPO for helpfulness and safety',
        'reasoning': 'Possible GRPO-style reasoning training (following R1)',
    }
    return {
        'stage_1_pretraining': pretraining,
        'stage_2_long_context': long_context,
        'stage_3_post_training': post_training,
    }
def parallelism_strategy():
    """Describe the distributed-training strategy for Llama 4.

    Returns:
        dict: One entry per parallelism axis (expert, tensor, pipeline,
        data) plus estimated total GPU counts per model.
    """
    # Expert parallelism dominates MoE training: experts live on
    # different devices and tokens are dispatched across the fabric.
    experts = {
        'what': 'Distribute experts across GPUs',
        'for_scout_16e': '16 experts on 8 GPUs (2 per GPU) or 16 GPUs (1 per GPU)',
        'for_maverick_128e': '128 experts on 16-32 GPUs (4-8 per GPU)',
        'for_behemoth': 'Hundreds of GPUs for expert parallelism',
    }
    tensor = {
        'what': 'Split attention heads across GPUs',
        'typical': '8-way tensor parallelism',
    }
    pipeline = {
        'what': 'Split layers across GPU groups',
        'typical': '4-8 pipeline stages',
    }
    data = {
        'what': 'Replicate model across GPU groups for different data',
        'typical': 'FSDP (Fully Sharded Data Parallel)',
    }
    totals = {
        'scout': '~4,000-8,000 GPUs',
        'maverick': '~8,000-16,000 GPUs',
        'behemoth': '~30,000-50,000 GPUs',
    }
    return {
        'expert_parallelism': experts,
        'tensor_parallelism': tensor,
        'pipeline_parallelism': pipeline,
        'data_parallelism': data,
        'total_gpu_count': totals,
    }
What Llama 4 Signals for the Ecosystem
def ecosystem_implications():
    """Llama 4 as MoE sends important signals to the open-weight ecosystem.

    Returns:
        dict: Signal name -> dict with 'signal', 'implication', and
        'prediction' entries.
    """
    moe_default = {
        'signal': 'Meta, the largest open-weight contributor, has adopted MoE. '
                  'This validates MoE as the standard architecture.',
        'implication': 'Inference frameworks, fine-tuning tools, and deployment '
                       'infrastructure will optimize for MoE first.',
        'prediction': 'By late 2025, MoE support will be a baseline requirement '
                      'for any serious inference framework.',
    }
    dense_legacy = {
        'signal': 'Llama 3 was likely the last major dense-only release from Meta.',
        'implication': 'New dense models will focus on small scales (sub-10B) '
                       'where MoE overhead is not justified.',
        'prediction': 'Dense models above 70B will become rare by 2026.',
    }
    multimodal_standard = {
        'signal': 'Llama 4 is natively multimodal. Text-only is no longer sufficient.',
        'implication': 'Open-source community needs multimodal training data, '
                       'evaluation benchmarks, and fine-tuning tools.',
        'prediction': 'All frontier models in 2026 will be multimodal.',
    }
    open_moe_race = {
        'signal': 'Llama 4 competes directly with DeepSeek V3 in the open MoE space.',
        'implication': 'Users choose between Meta (ecosystem, integration) '
                       'and DeepSeek (efficiency, innovation).',
        'prediction': 'This competition drives rapid improvement in both.',
    }
    return {
        'moe_is_now_the_default': moe_default,
        'dense_models_are_legacy': dense_legacy,
        'multimodal_is_standard': multimodal_standard,
        'open_weight_moe_competition': open_moe_race,
    }
Serving Infrastructure Changes
def serving_infrastructure_impact():
    """Llama 4 MoE changes how the community serves models.

    Returns:
        dict: Before/after impact on vLLM, consumer GPUs, and cloud APIs.
    """
    framework_impact = {
        'before': 'vLLM MoE support was experimental (Mixtral)',
        'after': 'Must be production-grade for Llama 4',
        'needed': [
            'Expert parallelism across GPUs',
            'Expert offloading to CPU/disk',
            'Quantized expert serving (INT4/INT8)',
            'Batched expert computation',
        ],
    }
    # Memory ladder for Scout (109B total params) at various precisions.
    consumer_impact = {
        'before': 'Llama 3 8B fits on 1 RTX 4090 easily',
        'after': 'Llama 4 Scout (109B total) needs quantization + offloading',
        'strategy': {
            'full_precision': '~220 GB -> 3x A100 80GB',
            'int8': '~110 GB -> 2x A100 80GB',
            'int4': '~55 GB -> 1x A100 80GB (tight)',
            'int4_offload': '~20 GB VRAM + 35 GB RAM -> 1x RTX 4090',
        },
    }
    cloud_impact = {
        'before': 'Llama 3.1 405B: 8x A100 80GB minimum',
        'after': 'Llama 4 Behemoth (~2T): 32-64 GPUs for full precision',
        'optimization': 'Expert parallelism + INT4 quantization -> 8-16 GPUs',
    }
    return {
        'vllm_moe_optimization': framework_impact,
        'consumer_gpu_deployment': consumer_impact,
        'cloud_api_serving': cloud_impact,
    }
Llama 4 Serving Requirements
| Model | Total Params | Active Params | FP16 Memory | INT4 Memory | Min GPUs (INT4) |
|---|---|---|---|---|---|
| Scout | 109B | 17B | ~220 GB | ~55 GB | 1x A100 |
| Maverick | 402B | 17B | ~800 GB | ~200 GB | 3x A100 |
| Behemoth | ~2T | 288B | ~4 TB | ~1 TB | 16x A100 |
| (Llama 3.1 405B) | 405B | 405B | ~810 GB | ~200 GB | 3x A100 |
Llama 4 Maverick stores 402B parameters but activates only 17B per token — same compute as a 17B dense model. In INT4 quantization, it requires ~200 GB (3x A100 80GB), the same as Llama 3.1 405B in INT4. But where Llama 3.1 spends 405B parameters of compute on every token, Maverick spends 17B — roughly 24x less per-token compute — while drawing on a comparable store of total knowledge spread across its 128 experts. The MoE architecture gives substantially more knowledge per inference FLOP.
Fine-Tuning Implications
def finetuning_moe_llama4():
    """Survey MoE-specific fine-tuning approaches for Llama 4.

    Returns:
        dict: Approach name -> dict of method, parameter count, memory
        profile, and recommendation.
    """
    approaches = {
        'full_finetune': {
            'feasibility': 'Requires same GPU count as training (impractical for most)',
            'params_updated': 'All 109-402B parameters',
            'recommendation': 'Only for Meta-scale labs',
        },
        'lora_all_experts': {
            'method': 'Add LoRA adapters to all expert FFN layers',
            # Fix: 0.5-2% of Maverick's 402B is 2-8B, not 500M-4B.
            'params_added': '~0.5-2% of total (2-8B for Maverick)',
            'memory': 'Still need all expert weights in memory for forward pass',
            'recommendation': 'Good quality but memory-intensive',
        },
        'lora_shared_only': {
            'method': 'Add LoRA only to attention + shared expert, freeze routed experts',
            'params_added': '~0.1% of total',
            'memory': 'Lower than full LoRA but experts still needed for forward pass',
            'recommendation': 'Memory-efficient, works well for domain adaptation',
        },
        'expert_selective_lora': {
            'method': 'Add LoRA only to the most-activated experts for your domain',
            'params_added': '~0.05-0.2% of total',
            'memory': 'Can offload inactive experts during fine-tuning',
            'recommendation': 'Best tradeoff for domain-specific fine-tuning',
        },
    }
    return approaches
Looking Ahead: Llama 5 and Beyond
def meta_roadmap_predictions():
    """Predict where Meta's LLM strategy goes after Llama 4.

    Returns:
        dict: Prediction name -> dict including a 'confidence' float in [0, 1].
    """
    next_gen = {
        'architecture': 'MoE with 512+ experts (following DeepSeek trend)',
        'multimodal': 'Text + image + video + audio (4 modalities)',
        'reasoning': 'Built-in reasoning (GRPO or successor RL method)',
        'context': '10M+ tokens standard',
        'open_weights': 'Yes — Meta has committed to open weights',
        'confidence': 0.70,
    }
    custom_silicon = {
        'prediction': 'Meta develops custom MoE inference chips (MTIA successor)',
        'reasoning': 'MoE is now their standard architecture. '
                     'Custom silicon for expert dispatch is the logical next step.',
        'confidence': 0.60,
    }
    market_position = {
        'prediction': 'Llama remains the most-used open-weight model family',
        'reasoning': 'Meta ecosystem (PyTorch, torchtune, Llama ecosystem) '
                     'creates strong lock-in.',
        'threat': 'DeepSeek (better efficiency) and Qwen (better multilingual)',
        'confidence': 0.55,
    }
    return {
        'llama_5_2026': next_gen,
        'meta_inference_hardware': custom_silicon,
        'ecosystem_dominance': market_position,
    }
Llama 4 represents Meta’s acknowledgment that MoE is the future of large language models. The shift from dense to MoE, combined with native multimodal support, positions Meta to remain competitive with DeepSeek and closed-source labs. For the open-source community, Llama 4 means MoE is no longer an exotic architecture — it is the new standard. The tooling, infrastructure, and best practices built around Llama 4 MoE will shape how the community builds and deploys models for years to come.