DeepSeek-R1 nearly matches OpenAI o1 on AIME 2024 (79.8% vs 83.3%) at a fraction of the training cost. The breakthrough: GRPO, a simplified RL algorithm that eliminates the critic network and value function that make PPO unstable at scale. R1 learned to generate internal chain-of-thought without any human-annotated reasoning traces — the RL reward signal alone was sufficient to discover multi-step decomposition, self-correction, and structured problem-solving. This is the first evidence that reasoning emerges from optimization pressure, not supervised imitation.
Base Architecture: DeepSeek V3
What R1 Starts With
class DeepSeekV3Architecture:
    """R1's base model: the unmodified DeepSeek V3 architecture.

    Every architectural innovation (MLA, MoE, FP8) carries over from V3;
    R1 changes nothing structural — its entire improvement is training.
    """

    def __init__(self):
        # Multi-head Latent Attention (MLA) compression ranks
        self.kv_lora_rank = 512
        self.q_lora_rank = 1536
        self.qk_rope_dim = 64
        self.qk_nope_dim = 128

        # Transformer backbone dimensions
        self.num_layers = 61
        self.d_model = 7168
        self.num_heads = 128
        self.head_dim = 128

        # Mixture-of-Experts configuration
        self.num_routed_experts = 256
        self.num_shared_experts = 1
        self.top_k = 8
        self.expert_d_ff = 2048

        # Key: R1 does NOT change the architecture.
        # The entire improvement comes from training.
        self.architecture_changes_for_r1 = None  # None — identical to V3
def what_r1_adds():
    """Summarize R1's contributions — all in training, none in architecture.

    The additions are:
    1. RL training with GRPO
    2. Reasoning trace generation
    3. Multi-stage distillation
    4. Cold-start data for initial reasoning ability

    Returns:
        dict mapping change category to a short description.
    """
    summary = {}
    summary['architecture_changes'] = 'None'
    summary['training_changes'] = 'GRPO RL, multi-stage pipeline'
    summary['data_changes'] = 'Cold-start reasoning data + RL-generated traces'
    summary['inference_changes'] = 'Extended generation with chain-of-thought'
    return summary
GRPO: Group Relative Policy Optimization
Why Not PPO?
def ppo_vs_grpo():
    """Contrast PPO and GRPO resource/stability profiles for reasoning RL.

    PPO — the standard RLHF algorithm — needs a separate value/critic
    model, which is problematic for reasoning:
    1. Critic model is expensive (same size as policy)
    2. Critic must estimate value of partial reasoning traces
    3. Training is unstable for long-horizon reasoning
    4. Memory: need policy + critic + reference model in VRAM

    Returns:
        Tuple of (ppo_requirements, grpo_requirements) dicts.
    """
    ppo_models = [
        'Policy model (671B total params)',
        'Value/critic model (similar size)',
        'Reference model (frozen copy of policy)',
    ]
    ppo_requirements = {
        'models_in_memory': ppo_models,
        'total_memory': '3x model size',
        'training_signal': 'Per-token advantage estimates from critic',
        'stability': 'Sensitive to critic accuracy',
    }

    grpo_models = [
        'Policy model (671B total params)',
        'Reference model (frozen copy of policy)',
    ]
    grpo_requirements = {
        'models_in_memory': grpo_models,
        'total_memory': '2x model size (no critic)',
        'training_signal': 'Group-relative rewards (no critic needed)',
        'stability': 'More stable — no critic estimation errors',
    }
    return ppo_requirements, grpo_requirements
GRPO Algorithm
import torch
import torch.nn.functional as F
class GRPO:
    """Group Relative Policy Optimization.

    Key idea: instead of using a critic to estimate baselines, generate
    a GROUP of outputs for each prompt and use the group's mean reward
    as the baseline. This eliminates the critic entirely, cutting RL
    memory from 3x the policy size (PPO) to 2x (policy + frozen ref).
    """

    def __init__(self, policy_model, ref_model, reward_fn, config):
        """
        Args:
            policy_model: trainable policy; must expose generate() and
                compute_log_prob().
            ref_model: frozen reference policy used for the KL penalty.
            reward_fn: callable (prompt, completion) -> float reward.
            config: object with group_size, kl_coeff, clip_range,
                sampling_temperature, and max_completion_length.
        """
        self.policy = policy_model
        self.ref_model = ref_model  # Frozen reference
        self.reward_fn = reward_fn
        self.config = config
        self.group_size = config.group_size  # G = number of outputs per prompt
        self.kl_coeff = config.kl_coeff
        self.clip_range = config.clip_range

    def training_step(self, prompts):
        """Run one GRPO step: generate G completions per prompt, score
        them, compute group-relative advantages, and return the loss.
        """
        # Step 1: Generate G completions per prompt
        all_completions = []
        all_log_probs = []
        for prompt in prompts:
            completions, log_probs = self._generate_group(prompt)
            all_completions.append(completions)
            all_log_probs.append(log_probs)

        # Step 2: Score completions with the (rule-based) reward function
        all_rewards = []
        for prompt, completions in zip(prompts, all_completions):
            rewards = [self.reward_fn(prompt, completion) for completion in completions]
            all_rewards.append(torch.tensor(rewards))

        # Step 3: Compute group-relative advantages (no critic involved)
        advantages = self._compute_advantages(all_rewards)

        # Step 4: Clipped policy-gradient update with KL penalty
        loss = self._compute_loss(prompts, all_completions, all_log_probs, advantages)
        return loss

    def _generate_group(self, prompt):
        """Generate G diverse completions for a single prompt.

        Sampling runs under no_grad with the policy in eval mode; train
        mode is restored afterwards for the gradient update.
        """
        self.policy.eval()
        completions = []
        log_probs = []
        with torch.no_grad():
            for _ in range(self.group_size):
                completion, lp = self.policy.generate(
                    prompt,
                    temperature=self.config.sampling_temperature,
                    max_tokens=self.config.max_completion_length,
                    return_log_probs=True,
                )
                completions.append(completion)
                log_probs.append(lp)
        self.policy.train()
        return completions, log_probs

    def _compute_advantages(self, all_rewards):
        """Group-relative advantage: A_i = (R_i - mean(R_group)) / std(R_group).

        No critic needed — the group mean IS the baseline. The epsilon
        keeps the division finite when all rewards in a group are equal.
        NOTE(review): torch's default std is unbiased (n-1 divisor), so
        a group of size 1 yields NaN — assumes group_size >= 2.
        """
        advantages = []
        for rewards in all_rewards:
            mean_reward = rewards.mean()
            std_reward = rewards.std() + 1e-8
            # Normalize within the group
            normalized = (rewards - mean_reward) / std_reward
            advantages.append(normalized)
        return advantages

    def _compute_loss(self, prompts, all_completions, all_log_probs, advantages):
        """
        GRPO loss = -E[min(ratio * A, clip(ratio, 1-eps, 1+eps) * A)] + kl_penalty
        where ratio = pi_theta(a|s) / pi_old(a|s).
        """
        total_loss = 0.0
        num_samples = 0
        # Iterate directly; the original enumerate() indices were unused.
        for prompt, completions, old_log_probs, advs in zip(
            prompts, all_completions, all_log_probs, advantages
        ):
            for completion, old_lp, adv in zip(completions, old_log_probs, advs):
                # Current policy log probability
                new_lp = self.policy.compute_log_prob(prompt, completion)
                # Importance ratio in probability space
                ratio = torch.exp(new_lp - old_lp)
                # Clipped surrogate objective (PPO-style clipping)
                surr1 = ratio * adv
                surr2 = torch.clamp(ratio, 1 - self.clip_range, 1 + self.clip_range) * adv
                policy_loss = -torch.min(surr1, surr2).mean()
                # KL penalty keeps the policy near the frozen reference
                ref_lp = self.ref_model.compute_log_prob(prompt, completion)
                kl = (new_lp - ref_lp).mean()
                kl_loss = self.kl_coeff * kl
                total_loss += policy_loss + kl_loss
                num_samples += 1
        if num_samples == 0:
            # Empty batch: return a zero loss rather than dividing by zero.
            return torch.tensor(0.0)
        return total_loss / num_samples
GRPO eliminates the critic model, saving 33% of memory during RL training. For a 671B MoE model, this means fitting the training pipeline on fewer GPUs. The group-relative baseline has lower variance than a learned critic for reasoning tasks because correct/incorrect answers provide clear signal.
The Multi-Stage Training Pipeline
Overview
def r1_training_stages():
    """Describe DeepSeek-R1's multi-stage training pipeline.

    Stage 1: Base model pre-training (DeepSeek V3)
    Stage 2: Cold-start SFT (teach basic reasoning format)
    Stage 3: RL with GRPO (improve reasoning through trial and error)
    Stage 4: Rejection sampling + SFT (distill RL improvements)
    Stage 5: Final RL alignment (safety + helpfulness)

    Returns:
        dict keyed 'stage_1'..'stage_5', each describing one stage.
    """
    pretraining = {
        'name': 'Base Pre-training',
        'model': 'DeepSeek V3',
        'method': 'Next token prediction',
        'data': '14.8T tokens (web, code, math, multilingual)',
        'tokens_trained': '14.8T',
        'purpose': 'Build world knowledge and language ability',
    }
    cold_start_sft = {
        'name': 'Cold-Start SFT',
        'model': 'V3 + reasoning format',
        'method': 'Supervised fine-tuning',
        'data': 'Thousands of examples with reasoning traces',
        'tokens_trained': '~1B (small)',
        'purpose': 'Teach the model to produce reasoning traces '
                   '(think step by step, show work)',
    }
    grpo_rl = {
        'name': 'RL with GRPO',
        'model': 'Cold-start model as initial policy',
        'method': 'GRPO with rule-based rewards',
        'data': 'Math, code, science prompts (no human-labeled reasoning)',
        'tokens_trained': '~50-100B (RL rollouts)',
        'purpose': 'Improve reasoning quality through trial and error',
    }
    rejection_sft = {
        'name': 'Rejection Sampling + SFT',
        'model': 'Best RL checkpoint',
        'method': 'Generate many solutions, keep correct ones, SFT on them',
        'data': 'RL-generated correct reasoning traces',
        'tokens_trained': '~10B',
        'purpose': 'Stabilize RL gains, improve consistency',
    }
    final_alignment = {
        'name': 'Final RL Alignment',
        'model': 'Stage 4 model',
        'method': 'GRPO with helpfulness + safety rewards',
        'data': 'General prompts + safety red-teaming',
        'tokens_trained': '~5B',
        'purpose': 'Align with human preferences while preserving reasoning',
    }
    return {
        'stage_1': pretraining,
        'stage_2': cold_start_sft,
        'stage_3': grpo_rl,
        'stage_4': rejection_sft,
        'stage_5': final_alignment,
    }
Stage 2: Cold-Start Data
def cold_start_data():
    """Illustrate cold-start SFT data: it teaches FORMAT, not CONTENT.

    Cold-start examples follow a fixed structure:
        <think> ... reasoning steps ... </think>
        <answer> ... final answer ... </answer>
    The later RL stage is what improves the reasoning content itself.

    Returns:
        Tuple of (example, cold_start_properties).
    """
    trace_lines = [
        '<think>',
        'I need to find 1 + 2 + 3 + ... + 100.',
        'I recall the formula for the sum of the first n positive integers: S = n(n+1)/2.',
        'Substituting n = 100: S = 100 * 101 / 2 = 5050.',
        'Let me verify with a smaller case: n=5 gives 5*6/2 = 15.',
        'Indeed 1+2+3+4+5 = 15. The formula works.',
        '</think>',
        '<answer>',
        'The sum of the first 100 positive integers is 5050.',
        '</answer>',
    ]
    example = {
        'prompt': 'What is the sum of the first 100 positive integers?',
        'reasoning_trace': '\n'.join(trace_lines),
    }

    cold_start_properties = {
        'num_examples': 'Thousands (not millions)',
        'source': 'Curated by humans OR generated by a capable model and filtered',
        'diversity': 'Math, science, logic, code — various reasoning types',
        'key_property': 'Teaches FORMAT, not reasoning ability. '
                        'The cold-start model reasons poorly but in the right format.',
    }
    return example, cold_start_properties
Stage 3: RL Reward Functions
class ReasoningRewardFunction:
    """Rule-based reward for reasoning RL — no learned reward model.

    Unlike standard RLHF, the reward is computed from deterministic
    rules: answer correctness, format compliance, and a mild length
    penalty. This is a key difference from learned-reward pipelines.
    """

    def __init__(self):
        # Reward components: positive values encourage, negative discourage.
        self.rewards = {
            'correct_answer': 1.0,
            'incorrect_answer': -0.5,
            'format_violation': -0.2,
            'length_penalty_per_1k_tokens': -0.01,
        }

    def compute_reward(self, prompt, completion, ground_truth=None):
        """Score one reasoning completion; higher is better."""
        total = 0.0

        # 1. Answer correctness dominates the signal (needs a label).
        if ground_truth is not None:
            extracted = self._extract_answer(completion)
            is_right = self._check_correctness(extracted, ground_truth)
            total += self.rewards['correct_answer' if is_right else 'incorrect_answer']

        # 2. Penalize missing <think>/<answer> structure.
        if not self._check_format(completion):
            total += self.rewards['format_violation']

        # 3. Mild length penalty discourages unnecessarily verbose reasoning.
        #    (Whitespace-split words approximate tokens here.)
        word_count = len(completion.split())
        total += self.rewards['length_penalty_per_1k_tokens'] * (word_count / 1000)
        return total

    def _extract_answer(self, completion):
        """Pull the answer from <answer> tags; fall back to the last line."""
        import re
        found = re.search(r'<answer>(.*?)</answer>', completion, re.DOTALL)
        if found is None:
            return completion.strip().split('\n')[-1]
        return found.group(1).strip()

    def _check_correctness(self, predicted, ground_truth):
        """Compare a prediction to ground truth.

        Tries a tolerant numerical comparison first (handles commas),
        then falls back to normalized string equality.
        """
        try:
            pred_val = float(predicted.replace(',', ''))
            true_val = float(str(ground_truth).replace(',', ''))
            return abs(pred_val - true_val) < 1e-6
        except ValueError:
            pass
        return predicted.strip().lower() == str(ground_truth).strip().lower()

    def _check_format(self, completion):
        """True when both the <think> and <answer> tag pairs are present."""
        required = ('<think>', '</think>', '<answer>', '</answer>')
        return all(tag in completion for tag in required)
class CodeExecutionReward(ReasoningRewardFunction):
    """Extended reward for code problems: actually execute the code.

    Adds an execution bonus on top of the rule-based base reward: the
    fraction of test cases passed earns up to +0.5. A completion with
    no extractable ```python``` block is penalized by -0.3.
    """

    def compute_reward(self, prompt, completion, test_cases=None):
        """Score a code completion; the bonus requires test_cases."""
        base_reward = super().compute_reward(prompt, completion)
        # Guard: an empty list (not just None) means there is nothing to
        # execute against — the original divided by len(test_cases) and
        # raised ZeroDivisionError on [].
        if not test_cases:
            return base_reward
        code = self._extract_code(completion)
        if code is None:
            return base_reward - 0.3  # No code found
        # Execute code against test cases; crashes earn no credit.
        passed = 0
        for test_input, expected_output in test_cases:
            try:
                actual_output = self._safe_execute(code, test_input)
                if actual_output == expected_output:
                    passed += 1
            except Exception:
                pass  # Failing/crashing code simply doesn't count as passed
        pass_rate = passed / len(test_cases)
        return base_reward + pass_rate * 0.5  # Bonus for passing tests

    def _extract_code(self, completion):
        """Return the first ```python fenced block's body, or None."""
        import re
        match = re.search(r'```python\n(.*?)```', completion, re.DOTALL)
        return match.group(1) if match else None

    def _safe_execute(self, code, test_input, timeout=5):
        """Execute code in a sandbox with timeout."""
        pass  # Production uses containerized execution
R1 Training Pipeline Results
| Stage | AIME 2024 | MATH-500 | Codeforces | Training Cost |
|---|---|---|---|---|
| Stage 1 (V3 base) | 15.6% | 75.2% | 40th percentile | $5.576M |
| Stage 2 (Cold-start SFT) | 18.3% | 78.1% | 42nd percentile | +small |
| Stage 3 (GRPO RL) | 71.0% | 94.8% | 93rd percentile | +significant |
| Stage 4 (Rejection sampling + SFT) | 72.6% | 95.2% | 94th percentile | +moderate |
| Stage 5 (Final alignment) | 71.5% | 94.3% | 92nd percentile | +small |
Stage 3 (GRPO RL) provides the largest improvement: AIME 2024 jumps from 18.3% to 71.0%. This is the core contribution. Stage 5 (alignment) causes a slight quality regression on reasoning benchmarks — the standard alignment tax. DeepSeek chose to accept this for safety.
How Reasoning Traces Emerge
R1-Zero: Pure RL Without Cold-Start
def r1_zero_experiment():
    """Summarize DeepSeek-R1-Zero: GRPO applied directly to the base V3.

    No cold-start SFT, no reasoning format taught. The model develops
    reasoning traces spontaneously during RL training: it learns to
    "think" because thinking leads to correct answers, which leads to
    higher reward.

    Returns:
        dict keyed by training step, each with behavior/accuracy/length.
    """
    # (step, observed behavior, math accuracy %, avg response length)
    milestones = [
        (0, 'Model gives short, direct answers (no reasoning)', 15.6, 50),
        (5000, 'Model starts occasionally showing work', 22.1, 120),
        (20000, 'Model consistently shows step-by-step reasoning', 45.3, 350),
        (50000, 'Model develops self-checking behavior (verifies its own steps)', 58.7, 600),
        (100000, 'Model shows emergent planning: outlines approach before executing', 66.2, 800),
    ]
    # Key insight: reasoning emerges from reward optimization alone.
    # No human wrote these reasoning traces — the model discovered that
    # showing its work improves accuracy.
    return {
        f'step_{step}': {
            'behavior': behavior,
            'math_accuracy': accuracy,
            'avg_response_length': length,
        }
        for step, behavior, accuracy, length in milestones
    }
Why Reasoning Traces Help the Model
def why_reasoning_helps():
    """Explain why chain-of-thought reasoning improves accuracy.

    Three mechanisms:
    1. It extends the effective computation per problem
    2. Intermediate steps create checkpoints for self-correction
    3. The model can decompose hard problems into easier subproblems

    Returns:
        dict mapping mechanism name to explanation (and example/figures).
    """
    extended_computation = {
        'explanation': "A direct answer uses ~50 tokens of 'thinking'. "
                       "A reasoning trace uses 500+ tokens. Each token "
                       "is a forward pass through 37B active parameters. "
                       "More tokens = more compute per problem.",
        'compute_comparison': {
            'direct_answer': '50 tokens * 37B active = 1.85T FLOPs',
            'with_reasoning': '500 tokens * 37B active = 18.5T FLOPs',
            'ratio': '10x more compute per problem',
        },
    }
    self_correction = {
        'explanation': "The model can write a step, see it in its context, "
                       "and correct it in the next step. This is impossible "
                       "with direct answers.",
        'example': "Step 1: 15 * 17 = 245. Wait, let me recheck: "
                   "15 * 17 = 15 * 10 + 15 * 7 = 150 + 105 = 255.",
    }
    decomposition = {
        'explanation': "Hard problems become sequences of easy problems. "
                       "Each step is within the model's reliable capability.",
        'example': "Problem: integral of x^2 * e^x dx. "
                   "Step 1: Use integration by parts. "
                   "Step 2: Let u = x^2, dv = e^x dx. "
                   "Step 3: du = 2x dx, v = e^x. "
                   "... each step is simple.",
    }
    return {
        'extended_computation': extended_computation,
        'self_correction': self_correction,
        'decomposition': decomposition,
    }
Comparison with OpenAI o1
def r1_vs_o1_comparison():
    """Tabulate architectural and training differences: R1 vs OpenAI o1.

    Returns:
        dict mapping dimension name to {'r1', 'o1', 'advantage'} entries.
    """
    # dimension -> (R1 description, o1 description, who has the edge)
    rows = {
        'base_architecture': (
            'DeepSeek V3 (671B MoE, 37B active)',
            'Unknown (likely GPT-4 variant, rumored 1.8T MoE)',
            'o1 (likely larger base model)',
        ),
        'training_method': (
            'GRPO (no critic model)',
            'Unknown (likely PPO or similar with learned reward)',
            'R1 (simpler, more memory-efficient)',
        ),
        'reasoning_format': (
            'Visible reasoning traces (user can see thinking)',
            'Hidden reasoning (user sees summary only)',
            'R1 (transparency and debuggability)',
        ),
        'reward_signal': (
            'Rule-based (correctness checking, code execution)',
            'Unknown (likely combination of rule-based and learned)',
            'Unknown',
        ),
        'open_weights': (
            'Yes — fully open (model weights, some training details)',
            'No — API access only',
            'R1 (research community can build on it)',
        ),
        'cost': (
            '$2.19/M input tokens, $8.19/M output tokens (API)',
            '$15/M input tokens, $60/M output tokens (API)',
            'R1 (7-8x cheaper)',
        ),
    }
    return {
        dimension: {'r1': r1_side, 'o1': o1_side, 'advantage': edge}
        for dimension, (r1_side, o1_side, edge) in rows.items()
    }
DeepSeek-R1 vs OpenAI o1: Benchmark Comparison
| Benchmark | R1 | o1 | o1-mini | Winner |
|---|---|---|---|---|
| AIME 2024 | 79.8% | 83.3% | 63.6% | o1 (+3.5%) |
| MATH-500 | 97.3% | 96.4% | 90.0% | R1 (+0.9%) |
| Codeforces | 96.3 pctl | 96.6 pctl | 92.4 pctl | Tie |
| GPQA Diamond | 71.5% | 78.0% | 60.0% | o1 (+6.5%) |
| MMLU | 90.8% | 92.3% | 85.2% | o1 (+1.5%) |
| LiveCodeBench | 65.9% | 63.4% | 52.1% | R1 (+2.5%) |
| Cost per query | $0.08 | $0.60 | $0.24 | R1 (7.5x cheaper) |
R1 matches or exceeds o1 on math and code benchmarks while being 7.5x cheaper per query. o1 retains an advantage on knowledge-intensive tasks (GPQA, MMLU) — likely due to a larger base model with more stored knowledge. The cost difference makes R1 the practical choice for most applications.
Distillation: R1 to Smaller Models
def r1_distillation():
    """Describe the R1 distilled model family and the distillation recipe.

    DeepSeek distills R1's reasoning into smaller dense models, creating
    a family of reasoning models at different scales.

    Returns:
        Tuple of (distilled_models, distillation_method) dicts.
    """
    # All distilled models share the same teacher-generated dataset.
    shared_traces = 'R1 reasoning traces (800K samples)'

    # name -> (base model, MATH-500 score, AIME 2024 score, note)
    specs = {
        'R1-Distill-Qwen-1.5B': (
            'Qwen 2.5 1.5B', 83.9, 28.9,
            'A 1.5B model with reasoning ability — remarkable',
        ),
        'R1-Distill-Qwen-7B': (
            'Qwen 2.5 7B', 92.8, 55.5,
            'Outperforms many 70B models on math',
        ),
        'R1-Distill-Qwen-14B': (
            'Qwen 2.5 14B', 93.9, 69.7,
            'Approaches R1-full on math benchmarks',
        ),
        'R1-Distill-Qwen-32B': (
            'Qwen 2.5 32B', 95.9, 72.6,
            'Nearly matches R1-full at 1/20th the active params',
        ),
        'R1-Distill-Llama-8B': (
            'Llama 3.1 8B', 89.1, 50.4,
            'Llama backbone, competitive with Qwen-7B distillation',
        ),
        'R1-Distill-Llama-70B': (
            'Llama 3.3 70B', 94.5, 70.0,
            'Strong reasoning at 70B dense scale',
        ),
    }
    distilled_models = {
        name: {
            'base': base,
            'distillation_data': shared_traces,
            'math_500': math_500,
            'aime_2024': aime_2024,
            'note': note,
        }
        for name, (base, math_500, aime_2024, note) in specs.items()
    }

    distillation_method = {
        'data_generation': 'Generate 800K reasoning traces from R1-full',
        'filtering': 'Keep only traces that lead to correct answers',
        'training': 'Standard SFT on the filtered traces',
        'no_rl_needed': 'Distillation captures RL improvements via SFT alone',
    }
    return distilled_models, distillation_method
Distillation Pipeline
class R1DistillationPipeline:
    """Generate reasoning data from R1 and train smaller models.

    Pipeline: sample multiple completions per prompt from the teacher
    (R1), keep only verified-correct traces, pick the shortest correct
    one per prompt, then run standard SFT on the student model.
    """

    def __init__(self, teacher_model, student_model, config):
        # teacher_model: large reasoning model (R1) used only to generate traces.
        # student_model: smaller model to be fine-tuned on those traces.
        # config: expects num_samples_per_prompt and lr attributes.
        self.teacher = teacher_model
        self.student = student_model
        self.config = config

    def generate_training_data(self, prompts):
        """
        Generate reasoning traces from the teacher model and filter for
        correctness. Returns a list of {'prompt', 'completion'} dicts;
        prompts with no correct completion are dropped entirely.
        """
        training_pairs = []
        for prompt in prompts:
            # Generate multiple candidate completions per prompt;
            # temperature 0.7 sampling gives diverse traces.
            completions = []
            for _ in range(self.config.num_samples_per_prompt):
                completion = self.teacher.generate(
                    prompt,
                    temperature=0.7,
                    max_tokens=8192,
                )
                completions.append(completion)
            # Filter: keep only correct completions
            correct_completions = []
            for comp in completions:
                if self._verify_correctness(prompt, comp):
                    correct_completions.append(comp)
            # Select the shortest correct completion (most efficient reasoning)
            if correct_completions:
                best = min(correct_completions, key=len)
                training_pairs.append({
                    'prompt': prompt,
                    'completion': best,
                })
        return training_pairs

    def train_student(self, training_data, num_epochs=3):
        """Standard SFT on reasoning traces (AdamW, grad clipping at 1.0)."""
        optimizer = torch.optim.AdamW(
            self.student.parameters(),
            lr=self.config.lr,
            weight_decay=0.01,
        )
        for epoch in range(num_epochs):
            for batch in self._make_batches(training_data):
                loss = self._compute_sft_loss(batch)
                loss.backward()
                # Clip gradients before the optimizer step for stability.
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

    def _verify_correctness(self, prompt, completion):
        """Check if the reasoning trace produces the correct answer.

        NOTE(review): exact equality on the extracted answer string —
        no numeric tolerance or normalization; confirm this is intended
        for the dataset in use.
        """
        answer = self._extract_answer(completion)
        ground_truth = self._get_ground_truth(prompt)
        return answer == ground_truth

    def _compute_sft_loss(self, batch):
        """Standard next-token prediction loss on reasoning traces.

        Logits at position t predict the token at t+1, hence the shift;
        ignore_index=-100 masks padding/prompt positions out of the loss.
        """
        input_ids = batch['input_ids']
        labels = batch['labels']
        logits = self.student(input_ids).logits
        return F.cross_entropy(
            logits[:, :-1].reshape(-1, logits.size(-1)),
            labels[:, 1:].reshape(-1),
            ignore_index=-100,
        )

    def _extract_answer(self, completion):
        """Return text inside <answer>...</answer> tags, or None if absent."""
        import re
        match = re.search(r'<answer>(.*?)</answer>', completion, re.DOTALL)
        return match.group(1).strip() if match else None

    def _get_ground_truth(self, prompt):
        # Stub: production looks the reference answer up from the dataset.
        pass  # Look up from dataset

    def _make_batches(self, data):
        # Stub: production performs tokenization, padding, and batching.
        pass  # Standard batching
R1 Distilled Model Quality (MATH-500)
What Makes R1 Significant
def r1_significance():
    """Why R1 matters beyond its benchmark numbers.

    R1 demonstrates that reasoning can be trained with RL on verifiable
    tasks, without human-labeled reasoning data.

    Returns:
        dict mapping contribution name to {'what', 'why_important'}.
    """
    # (key, what was done, why it matters)
    contributions = [
        ('no_human_reasoning_data',
         'RL rewards are rule-based (correct/incorrect). '
         'No human wrote step-by-step reasoning traces for training.',
         'Human reasoning data is expensive and limited. '
         'RL can generate unlimited reasoning traces.'),
        ('grpo_efficiency',
         'GRPO eliminates the critic model from RL training.',
         'Makes RL tractable for 671B-parameter models. '
         'PPO would require 3x memory.'),
        ('open_weights',
         'Full model weights released publicly.',
         'Research community can reproduce, study, and '
         'improve upon R1. Closed models (o1) prevent this.'),
        ('distillation_works',
         '1.5B model achieves 83.9% on MATH-500 after distillation.',
         'Reasoning ability transfers to tiny models. '
         'Enables deployment on consumer hardware.'),
        ('cost_efficiency',
         '7-8x cheaper than o1 at comparable quality.',
         'Makes reasoning AI accessible to more applications.'),
    ]
    return {
        key: {'what': what, 'why_important': why}
        for key, what, why in contributions
    }
The R1 paper demonstrates a clean, reproducible path to reasoning AI: start with a strong base model, teach it the reasoning format with minimal data, then let RL discover how to reason effectively through trial and error. The absence of human-labeled reasoning traces is the most important insight — it means reasoning ability can scale with compute rather than with expensive human annotation.