llama.cpp is the dominant framework for CPU-based LLM inference. It uses a custom quantization format called GGUF (GPT-Generated Unified Format) that packs quantized weights into fixed-size blocks optimized for SIMD processing on x86 (AVX2/AVX-512) and ARM (NEON) architectures. Unlike GPU quantization (which targets tensor cores), GGUF quantization targets CPU vector units that process 256-bit or 512-bit vectors in a single instruction.
This post documents every GGUF quantization type: the block structure, bit layout, scale factor encoding, dequantization procedure, and SIMD implementation. We then benchmark perplexity and throughput across all types.
Block-Based Quantization Architecture
GGUF organizes weights into fixed-size blocks. Each block contains a header (scale factors, mins) followed by packed quantized values. The block size is chosen so that a block fits naturally into SIMD registers.
The Fundamental Block Structure
Block Layout (generic):
+-------------+-------------+-----------------------+
| Scale(s)    | Min(s)      | Packed quant values   |
| (FP16/FP32) | (FP16/FP32) | (bit-packed INT4/5/8) |
+-------------+-------------+-----------------------+
The two key design principles:
- Fixed block size: Every block has the same byte size for a given type, enabling pointer arithmetic without per-block metadata.
- Scale factors at block granularity: One or two scale factors per block, amortizing metadata overhead over many weights.
import struct

import numpy as np
import torch
# Block sizes for each quantization type
# Byte footprint of each GGUF quantization block type.
GGUF_BLOCK_SIZES = {
    'Q4_0': {'weights_per_block': 32, 'bytes_per_block': 18},
    'Q4_1': {'weights_per_block': 32, 'bytes_per_block': 20},
    'Q5_0': {'weights_per_block': 32, 'bytes_per_block': 22},
    'Q5_1': {'weights_per_block': 32, 'bytes_per_block': 24},
    'Q8_0': {'weights_per_block': 32, 'bytes_per_block': 34},
    'Q8_1': {'weights_per_block': 32, 'bytes_per_block': 36},
    'Q2_K': {'weights_per_block': 256, 'bytes_per_block': 84},
    'Q3_K': {'weights_per_block': 256, 'bytes_per_block': 110},
    'Q4_K': {'weights_per_block': 256, 'bytes_per_block': 144},
    'Q5_K': {'weights_per_block': 256, 'bytes_per_block': 176},
    'Q6_K': {'weights_per_block': 256, 'bytes_per_block': 210},
}


def effective_bits(qtype):
    """Average storage cost per weight in bits, metadata included."""
    entry = GGUF_BLOCK_SIZES[qtype]
    return entry['bytes_per_block'] * 8 / entry['weights_per_block']


for name in GGUF_BLOCK_SIZES:
    print(f" {name:>5s}: {effective_bits(name):.2f} bits/weight")
Q4_0: 4.50 bits/weight
Q4_1: 5.00 bits/weight
Q5_0: 5.50 bits/weight
Q5_1: 6.00 bits/weight
Q8_0: 8.50 bits/weight
Q8_1: 9.00 bits/weight
Q2_K: 2.62 bits/weight
Q3_K: 3.44 bits/weight
Q4_K: 4.50 bits/weight
Q5_K: 5.50 bits/weight
Q6_K: 6.56 bits/weight
Legacy Quantization Types: Q4_0, Q4_1, Q8_0
Q4_0: Symmetric 4-bit, 32 Weights per Block
def quantize_q4_0_block(weights):
    """Quantize 32 FP32 weights to Q4_0 format.

    Block layout (18 bytes):
      - 2 bytes:  FP16 delta (scale factor)
      - 16 bytes: 32 x 4-bit unsigned values packed as 16 bytes

    Encoding: q[i] = round(w[i] / delta + 8), clamped to [0, 15]
    Decoding: w[i] = (q[i] - 8) * delta

    Following the llama.cpp reference, delta is derived from the SIGNED
    value of largest magnitude: delta = vmax / -8.  This places the
    extreme weight exactly on an end of the 4-bit grid.  (Using
    delta = amax / 8 would send w = +amax to q = 16, which then clips to
    15 and loses one full quantization step on the largest weight.)

    Args:
        weights: sequence of exactly 32 floats.

    Returns:
        bytes of length 18: FP16 delta followed by 16 packed bytes
        (low nibble first within each byte).
    """
    weights = np.asarray(weights, dtype=np.float32)
    assert len(weights) == 32
    # Signed value with the largest magnitude (ties resolve to the first).
    vmax = weights[np.argmax(np.abs(weights))]
    delta = vmax / -8.0 if vmax != 0 else 1.0
    d = float(np.float16(delta))
    if d != 0.0:
        # Quantize to [0, 15]; the max-magnitude weight lands exactly on q=0.
        quantized = np.round(weights / d + 8.0)
    else:
        # FP16 underflow: scale rounded to zero, encode everything as zero.
        quantized = np.full(32, 8.0)
    quantized = np.clip(quantized, 0, 15).astype(np.uint8)
    # Pack pairs of 4-bit codes into bytes, low nibble first.
    packed = np.zeros(16, dtype=np.uint8)
    for i in range(16):
        packed[i] = (quantized[2 * i + 1] << 4) | quantized[2 * i]
    # Block = [delta_fp16 (2 bytes)] [packed (16 bytes)]
    return struct.pack('<e', d) + packed.tobytes()  # 18 bytes total
def dequantize_q4_0_block(block):
    """Dequantize one 18-byte Q4_0 block back to 32 FP32 values.

    Layout: 2-byte FP16 scale, then 16 bytes holding two 4-bit codes each
    (low nibble is the even-indexed weight, high nibble the odd one).
    Decoding: w[i] = (q[i] - 8) * delta.
    """
    scale = struct.unpack('<e', block[:2])[0]
    raw = np.frombuffer(block[2:], dtype=np.uint8)
    codes = np.empty(32, dtype=np.float32)
    codes[0::2] = raw & 0x0F         # low nibbles -> even positions
    codes[1::2] = (raw >> 4) & 0x0F  # high nibbles -> odd positions
    return (codes - 8.0) * scale
Q4_1: Asymmetric 4-bit with Min Value
def quantize_q4_1_block(weights):
    """Quantize 32 FP32 weights to Q4_1 (asymmetric 4-bit) format.

    Block layout (20 bytes):
      - 2 bytes:  FP16 delta (scale)
      - 2 bytes:  FP16 min value
      - 16 bytes: 32 x 4-bit unsigned values

    Encoding: q[i] = round((w[i] - min) / delta), clamped to [0, 15]
    Decoding: w[i] = q[i] * delta + min
    """
    assert len(weights) == 32
    lo = np.min(weights)
    hi = np.max(weights)
    # Spread the 16 levels across the full [lo, hi] range.
    scale = (hi - lo) / 15.0 if hi > lo else 1.0
    scale16 = np.float16(scale)
    lo16 = np.float16(lo)
    codes = np.round((weights - float(lo16)) / float(scale16))
    codes = np.clip(codes, 0, 15).astype(np.uint8)
    # Two 4-bit codes per byte, low nibble first.
    packed = bytes((int(codes[2 * k + 1]) << 4) | int(codes[2 * k])
                   for k in range(16))
    header = struct.pack('<e', float(scale16)) + struct.pack('<e', float(lo16))
    return header + packed  # 20 bytes
def dequantize_q4_1_block(block):
    """Dequantize one 20-byte Q4_1 block back to 32 FP32 values.

    Layout: FP16 scale, FP16 min, then 16 packed bytes (low nibble first).
    Decoding: w[i] = q[i] * delta + min.
    """
    scale = struct.unpack('<e', block[:2])[0]
    base = struct.unpack('<e', block[2:4])[0]
    raw = np.frombuffer(block[4:], dtype=np.uint8)
    codes = np.empty(32, dtype=np.float32)
    codes[0::2] = raw & 0x0F         # low nibbles -> even positions
    codes[1::2] = (raw >> 4) & 0x0F  # high nibbles -> odd positions
    return codes * scale + base
Q8_0: Symmetric 8-bit
def quantize_q8_0_block(weights):
    """Quantize 32 FP32 weights to Q8_0 (symmetric 8-bit) format.

    Block layout (34 bytes):
      - 2 bytes:  FP16 delta (scale factor)
      - 32 bytes: 32 x INT8 signed values

    Encoding: q[i] = round(w[i] / delta), clamped to [-128, 127]
    Decoding: w[i] = q[i] * delta
    """
    assert len(weights) == 32
    peak = np.max(np.abs(weights))
    # Map the largest magnitude onto +/-127; all-zero blocks get scale 1.
    scale16 = np.float16(peak / 127.0 if peak > 0 else 1.0)
    codes = np.round(weights / float(scale16))
    codes = np.clip(codes, -128, 127).astype(np.int8)
    return struct.pack('<e', float(scale16)) + codes.tobytes()  # 34 bytes
def dequantize_q8_0_block(block):
    """Dequantize one 34-byte Q8_0 block back to 32 FP32 values."""
    (scale,) = struct.unpack('<e', block[:2])
    codes = np.frombuffer(block[2:], dtype=np.int8)
    return codes.astype(np.float32) * scale
The K-Quant Family: Q4_K, Q5_K, Q6_K
The K-quant types (introduced in llama.cpp PR #1684 by ikawrakow) use a two-level hierarchical scaling scheme: a super-block of 256 weights divided into 8 sub-blocks of 32 weights each. The super-block has a master scale and min, and each sub-block has a 6-bit relative scale and min.
Q4_K Block Structure
def quantize_q4_k_block(weights):
    """Quantize 256 FP32 weights to Q4_K format.

    Super-block layout (144 bytes):
      - 2 bytes:   FP16 d (super-block scale)
      - 2 bytes:   FP16 dmin (super-block min scale)
      - 12 bytes:  8 x 6-bit sub-block scales (packed)
      - 12 bytes:  8 x 6-bit sub-block mins (packed)
      - 128 bytes: 256 x 4-bit quantized values (packed)

    The 6-bit scales/mins are stored packed:
      - First 4 scales: lower 4 bits in bytes 0-3, upper 2 bits in bytes 8-9
      - Last 4 scales: lower 4 bits in bytes 4-7, upper 2 bits in bytes 10-11

    Encoding: q = round((w + min_sb) / scale_sb * 15), clamped to [0, 15]
    Decoding: w = q / 15 * scale_sb - min_sb

    As in the llama.cpp reference (make_qkx1_quants), each sub-block
    minimum is clamped to <= 0 before its magnitude is stored, because the
    stored min is an UNSIGNED value that is subtracted on decode.  Without
    the clamp, an all-positive sub-block gets a positive "min" added
    during encoding, shifting every code off-grid (and a constant positive
    sub-block collapses to scale 0 and decodes with the wrong sign).
    """
    assert len(weights) == 256
    # Split into 8 sub-blocks of 32 weights.
    sub_blocks = weights.reshape(8, 32)
    sub_maxes = np.max(sub_blocks, axis=1)
    # Clamp mins to <= 0: the format stores an unsigned magnitude that is
    # subtracted at decode time, so a positive minimum must become 0.
    sub_mins = np.minimum(np.min(sub_blocks, axis=1), 0.0)
    sub_ranges = sub_maxes - sub_mins
    # Super-block scales: quantize sub-block ranges/mins to 6 bits (0-63).
    max_range = np.max(sub_ranges)
    d = max_range / 63.0 if max_range > 0 else 1.0
    max_min = np.max(-sub_mins)
    dmin = max_min / 63.0 if max_min > 0 else 1.0
    # 6-bit sub-block scales and (non-negative) min magnitudes.
    sub_scales_6bit = np.round(sub_ranges / d).clip(0, 63).astype(np.uint8)
    sub_mins_6bit = np.round(-sub_mins / dmin).clip(0, 63).astype(np.uint8)
    # Effective (reconstructed) per-sub-block scale and min magnitude --
    # quantize against what the decoder will actually see.
    eff_scales = sub_scales_6bit.astype(np.float32) * d
    eff_mins = sub_mins_6bit.astype(np.float32) * dmin
    # Quantize each sub-block to 4 bits [0, 15].
    quantized = np.zeros(256, dtype=np.uint8)
    for sb in range(8):
        for i in range(32):
            idx = sb * 32 + i
            if eff_scales[sb] > 0:
                q = round((weights[idx] + eff_mins[sb]) / eff_scales[sb] * 15.0)
            else:
                q = 0
            quantized[idx] = max(0, min(15, q))
    return {
        'd': np.float16(d),
        'dmin': np.float16(dmin),
        'sub_scales': sub_scales_6bit,
        'sub_mins': sub_mins_6bit,
        'quantized': quantized,
    }
The K-quant hierarchy uses 6-bit sub-block scales quantized by a FP16 super-block scale. This gives each sub-block its own effective scale (fine granularity) while keeping the metadata overhead low: 12 bytes for 8 scales (6 bits each) plus 2 bytes for the FP16 super-scale = 14 bytes for 256 weights = 0.44 bits/weight overhead. Compare this to per-group FP16 scales at group_size=32: 2 bytes per 32 weights = 0.5 bits/weight.
Q5_K: 5-bit K-Quant
def quantize_q5_k_block(weights):
    """Quantize 256 FP32 weights to Q5_K format.

    Super-block layout (176 bytes):
      - 2 bytes:   FP16 d
      - 2 bytes:   FP16 dmin
      - 12 bytes:  8 x 6-bit sub-block scales
      - 12 bytes:  8 x 6-bit sub-block mins
      - 128 bytes: 256 x 4-bit low nibbles (packed)
      - 32 bytes:  256 x 1-bit high bits (packed)

    Each weight gets 5 bits: 4 from the nibble + 1 from the high-bit array.
    Value range: [0, 31].

    As in quantize_q4_k_block (and the llama.cpp reference), each
    sub-block minimum is clamped to <= 0 before its magnitude is stored as
    the unsigned min; otherwise sub-blocks whose weights are all positive
    are encoded off-grid and clip.
    """
    assert len(weights) == 256
    sub_blocks = weights.reshape(8, 32)
    sub_maxes = np.max(sub_blocks, axis=1)
    # Clamp mins to <= 0: the stored min is an unsigned magnitude that is
    # subtracted at decode time.
    sub_mins = np.minimum(np.min(sub_blocks, axis=1), 0.0)
    sub_ranges = sub_maxes - sub_mins
    max_range = np.max(sub_ranges)
    d = max_range / 63.0 if max_range > 0 else 1.0
    max_min = np.max(-sub_mins)
    dmin = max_min / 63.0 if max_min > 0 else 1.0
    sub_scales_6bit = np.round(sub_ranges / d).clip(0, 63).astype(np.uint8)
    sub_mins_6bit = np.round(-sub_mins / dmin).clip(0, 63).astype(np.uint8)
    # Reconstructed per-sub-block scale and min magnitude.
    eff_scales = sub_scales_6bit.astype(np.float32) * d
    eff_mins = sub_mins_6bit.astype(np.float32) * dmin
    # Quantize to 5 bits [0, 31].
    quantized_5bit = np.zeros(256, dtype=np.uint8)
    for sb in range(8):
        for i in range(32):
            idx = sb * 32 + i
            if eff_scales[sb] > 0:
                q = round((weights[idx] + eff_mins[sb]) / eff_scales[sb] * 31.0)
            else:
                q = 0
            quantized_5bit[idx] = max(0, min(31, q))
    # Split each 5-bit code into its low 4 bits and high bit.
    low_nibbles = quantized_5bit & 0x0F
    high_bits = (quantized_5bit >> 4) & 0x01
    return {
        'd': np.float16(d),
        'dmin': np.float16(dmin),
        'sub_scales': sub_scales_6bit,
        'sub_mins': sub_mins_6bit,
        'low_nibbles': low_nibbles,
        'high_bits': high_bits,
    }
Q4_K_S vs Q4_K_M: The Mixed Strategy
llama.cpp offers two variants of each K-quant type:
- Q4_K_S (Small): All layers use Q4_K
- Q4_K_M (Medium): Attention layers use Q6_K, MLP layers use Q4_K
The medium variant uses higher precision for attention layers because they are more sensitive to quantization error (attention patterns are determined by subtle differences in Q/K dot products).
def assign_quant_type_q4_k_m(layer_name, layer_type):
    """Assign the GGUF quant type for a layer under the Q4_K_M strategy.

    Attention value/output projections, the embedding table, and the LM
    head stay at Q6_K; Q/K projections and all MLP layers drop to Q4_K.
    Returns the GGUF quant type to use for this layer.
    """
    in_attention = 'attn' in layer_name
    # Sensitive attention projections keep the higher-precision type.
    if in_attention and any(p in layer_name for p in ('o_proj', 'v_proj')):
        return 'Q6_K'
    # Remaining attention projections (Q, K) and all MLP layers: Q4_K.
    if in_attention or 'mlp' in layer_name:
        return 'Q4_K'
    # Embedding table and final head are sensitive: Q6_K.
    if layer_type in ('embedding', 'lm_head'):
        return 'Q6_K'
    return 'Q4_K'
def assign_quant_type_q5_k_m(layer_name, layer_type):
    """Assign the GGUF quant type for a layer under the Q5_K_M strategy.

    Only the sensitive attention projections (v/o), the embedding table,
    and the LM head are promoted to Q6_K; everything else uses Q5_K.
    """
    sensitive_attn = 'attn' in layer_name and (
        'o_proj' in layer_name or 'v_proj' in layer_name
    )
    if sensitive_attn or layer_type in ('embedding', 'lm_head'):
        return 'Q6_K'
    return 'Q5_K'
GGUF Quant Type Assignment in Q4_K_M
| Layer Type | Quant Type | Bits/Weight | Rationale |
|---|---|---|---|
| attn.q_proj | Q4_K | 4.50 | Q/K less sensitive |
| attn.k_proj | Q4_K | 4.50 | Q/K less sensitive |
| attn.v_proj | Q6_K | 6.56 | Value projection sensitive |
| attn.o_proj | Q6_K | 6.56 | Output projection sensitive |
| mlp.gate_proj | Q4_K | 4.50 | MLP tolerates 4-bit |
| mlp.up_proj | Q4_K | 4.50 | MLP tolerates 4-bit |
| mlp.down_proj | Q4_K | 4.50 | MLP tolerates 4-bit |
| token_embd | Q6_K | 6.56 | Embedding sensitive |
| output (lm_head) | Q6_K | 6.56 | Final projection sensitive |
Importance Matrix (imatrix) Quantization
Newer versions of llama.cpp support importance-matrix guided quantization. The imatrix captures the squared activation magnitudes per weight channel, allowing the quantizer to allocate more precision to important weights:
def compute_importance_matrix(model, calibration_data):
    """Compute per-input-channel importance scores from calibration data.

    For each module exposing a ``weight`` attribute, accumulates the sum
    of squared input activations per input channel over all calibration
    batches, then normalizes by the total element count seen:

        importance[name][j] = mean(X[:, j]^2)

    Channels multiplied by frequently-large activations score higher.
    Returns a dict mapping module name -> 1-D importance tensor (C_in,).
    """
    scores = {}

    def make_hook(name):
        # Closure captures the module name so all batches accumulate
        # into the same entry of ``scores``.
        def hook(module, input_data, output):
            acts = input_data[0].detach().float()
            flat = acts.reshape(-1, acts.shape[-1])
            channel_sq = (flat ** 2).sum(dim=0)  # (C_in,)
            if name in scores:
                scores[name] += channel_sq
            else:
                scores[name] = channel_sq
        return hook

    handles = [
        mod.register_forward_hook(make_hook(name))
        for name, mod in model.named_modules()
        if hasattr(mod, 'weight')
    ]
    model.eval()
    seen = 0
    for batch in calibration_data:
        with torch.no_grad():
            model(batch)
        seen += batch.numel()
    for handle in handles:
        handle.remove()
    # Normalize so scores are comparable across calibration-set sizes.
    for name in scores:
        scores[name] /= seen
    return scores
def quantize_with_imatrix(weights, importance_scores, bits, block_size=32):
    """Quantize weights using importance-weighted error minimization.

    Instead of minimizing uniform MSE, minimize importance-weighted MSE:
        loss = sum(importance[j] * (w[j] - w_hat[j])^2)

    In practice, this means scaling the weights by sqrt(importance)
    before computing the block scale factor, so that important weights
    get finer quantization grid spacing.

    NOTE(review): illustrative skeleton only -- the rounding step is
    elided ("..."), so this function currently returns an all-zero int8
    array shaped like ``weights``.  ``scaled_weights`` and
    ``weighted_abs_max`` show how the importance-weighted scale would be
    derived but are never consumed.

    Args:
        weights: (C_out, C_in) weight matrix.
        importance_scores: (C_in,) per-input-channel importance values.
        bits: target bit width (unused in this sketch).
        block_size: weights per quantization block along C_in; assumes
            C_in is a multiple of block_size -- a trailing partial block
            would be silently dropped.
    """
    C_out, C_in = weights.shape
    imp = importance_scores  # (C_in,) importance per input channel
    # Scale weights by sqrt(importance) for scale computation
    scaled_weights = weights * np.sqrt(imp)[np.newaxis, :]
    # Compute per-block scales using the importance-scaled weights
    # This gives important channels more influence on the scale factor
    num_blocks = C_in // block_size
    quantized = np.zeros_like(weights, dtype=np.int8)
    for row in range(C_out):
        for b in range(num_blocks):
            start = b * block_size
            end = start + block_size
            block = weights[row, start:end]
            imp_block = imp[start:end]
            # Importance-weighted max
            weighted_abs_max = np.max(np.abs(block) * np.sqrt(imp_block))
            # ... (quantize with weighted scale)
    return quantized
SIMD-Optimized Dequantization
The real performance of GGUF quantization comes from SIMD-optimized dot product kernels. Here is the AVX2 implementation for Q4_0:
// Q4_0 dot product: compute dot(dequantize(q4_block), f32_vector)
// using x86 AVX2 (256-bit SIMD)
#include <immintrin.h>
// NOTE(review): illustrative sketch -- the INT8 -> FP32 widening inside the
// inner loop is elided ("..." and the placeholder comment for vx), so this
// listing does not compile as-is.  Assumes n is a multiple of 32 (one Q4_0
// block per 32 elements) -- TODO confirm at the call site.
static float vec_dot_q4_0_avx2(
    const block_q4_0* restrict x,  // Q4_0 blocks
    const float* restrict y,       // FP32 vector
    int n                          // Number of elements
) {
    int nb = n / 32;  // Number of blocks
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < nb; i++) {
        // Load scale factor
        float d = GGML_FP16_TO_FP32(x[i].d);
        __m256 vd = _mm256_set1_ps(d);
        // Load 16 bytes of packed INT4 values
        __m128i packed = _mm_loadu_si128(
            (const __m128i*)x[i].qs
        );
        // Unpack low nibbles (first 16 values)
        __m128i low_mask = _mm_set1_epi8(0x0F);
        __m128i low = _mm_and_si128(packed, low_mask);
        // Unpack high nibbles (next 16 values).  There is no 8-bit SSE
        // shift, so shift 16-bit lanes and re-mask to keep only 4 bits.
        __m128i high = _mm_and_si128(
            _mm_srli_epi16(packed, 4), low_mask
        );
        // Subtract 8 to get signed values [-8, 7]
        __m128i eight = _mm_set1_epi8(8);
        low = _mm_sub_epi8(low, eight);
        high = _mm_sub_epi8(high, eight);
        // Convert to FP32 and multiply with y values
        // Process 8 values at a time (AVX2 = 8 floats)
        for (int j = 0; j < 4; j++) {
            // Extract 8 INT8 values, convert to FP32
            // ... (expand INT8 -> INT32 -> FP32)
            // Load 8 y values
            __m256 vy = _mm256_loadu_ps(y + i*32 + j*8);
            // dequantized = int_val * d
            // dot += dequantized * y = int_val * d * y
            __m256 vx = /* converted int values */;
            // Fused multiply-add keeps the running dot product in acc.
            acc = _mm256_fmadd_ps(
                _mm256_mul_ps(vx, vd), vy, acc
            );
        }
    }
    // Horizontal sum of accumulator
    float result = 0;
    float tmp[8];
    _mm256_storeu_ps(tmp, acc);
    for (int i = 0; i < 8; i++) result += tmp[i];
    return result;
}
ARM NEON Implementation
// Q4_0 dot product using ARM NEON (128-bit SIMD)
// NOTE(review): illustrative sketch -- the widen/convert/FMA body is elided
// ("..."), so acc0/acc1 are never actually updated here and the listing does
// not represent a working kernel.  Assumes n is a multiple of 32.
static float vec_dot_q4_0_neon(
    const block_q4_0* restrict x,  // Q4_0 blocks
    const float* restrict y,       // FP32 vector
    int n                          // Number of elements
) {
    int nb = n / 32;
    // Two accumulators to hide FMA latency; summed at the end.
    float32x4_t acc0 = vdupq_n_f32(0);
    float32x4_t acc1 = vdupq_n_f32(0);
    for (int i = 0; i < nb; i++) {
        float d = GGML_FP16_TO_FP32(x[i].d);
        float32x4_t vd = vdupq_n_f32(d);
        // Load packed INT4 values
        uint8x16_t packed = vld1q_u8(x[i].qs);
        // Extract nibbles
        uint8x16_t low = vandq_u8(packed, vdupq_n_u8(0x0F));
        uint8x16_t high = vshrq_n_u8(packed, 4);
        // Convert to signed by subtracting 8
        int8x16_t low_s = vsubq_s8(
            vreinterpretq_s8_u8(low), vdupq_n_s8(8)
        );
        int8x16_t high_s = vsubq_s8(
            vreinterpretq_s8_u8(high), vdupq_n_s8(8)
        );
        // Widen to 16-bit, then 32-bit, convert to float, multiply
        // Process 4 values at a time (NEON = 4 floats)
        // ... (similar expansion and FMA as AVX2)
    }
    // Reduce accumulators
    return vaddvq_f32(acc0) + vaddvq_f32(acc1);
}
On a modern x86 CPU with AVX2, the Q4_0 dot product achieves approximately 40% of the FP32 dot product throughput. The overhead comes from the nibble extraction and INT-to-FP conversion. K-quants are slightly slower per element due to the two-level scale lookup, but the higher quality per bit makes them the better choice.
Perplexity and Throughput Benchmarks
Llama-2 7B Perplexity by GGUF Quant Type (WikiText-2)
| Type | Bits/Weight | Model Size (GB) | Perplexity | Delta vs FP16 |
|---|---|---|---|---|
| FP16 | 16.00 | 13.0 | 5.47 | --- |
| Q8_0 | 8.50 | 6.7 | 5.47 | +0.00 |
| Q6_K | 6.56 | 5.2 | 5.48 | +0.01 |
| Q5_K_M | 5.69 | 4.5 | 5.52 | +0.05 |
| Q5_K_S | 5.50 | 4.3 | 5.54 | +0.07 |
| Q4_K_M | 4.83 | 3.8 | 5.63 | +0.16 |
| Q4_K_S | 4.50 | 3.6 | 5.68 | +0.21 |
| Q4_0 | 4.50 | 3.6 | 5.96 | +0.49 |
| Q3_K_M | 3.89 | 3.1 | 6.14 | +0.67 |
| Q3_K_S | 3.44 | 2.7 | 6.52 | +1.05 |
| Q2_K | 2.62 | 2.1 | 8.81 | +3.34 |
GGUF Perplexity vs Model Size (Llama-2 7B)
(WikiText-2 Perplexity)

CPU Throughput
Llama-2 7B Decode Throughput by GGUF Type (M2 Max, 32 GB RAM)
| Type | Tokens/sec | vs Q8_0 |
|---|---|---|
| Q8_0 | 18.2 | 1.0x |
| Q6_K | 21.4 | 1.2x |
| Q5_K_M | 24.1 | 1.3x |
| Q4_K_M | 27.8 | 1.5x |
| Q4_0 | 29.3 | 1.6x |
| Q3_K_M | 30.1 | 1.7x |
| Q2_K | 33.5 | 1.8x |
Choosing the Right GGUF Type
def recommend_gguf_type(
    model_size_B,
    available_ram_GB,
    quality_priority,  # 'high', 'balanced', 'speed'
):
    """Recommend a GGUF quantization type for a model and RAM budget.

    Args:
        model_size_B: parameter count in billions.
        available_ram_GB: RAM available for the model; only 85% is
            treated as usable, leaving headroom for KV cache/activations.
        quality_priority: 'high', 'balanced', or anything else for speed.

    Returns:
        "<TYPE> (<size> GB)" for the best candidate that fits, or
        "Model too large for available RAM" when nothing fits.
    """
    # Estimated file size per quant level:
    # params_B * bits_per_weight / 8, plus ~10% metadata overhead.
    bits_per_weight = {
        'Q8_0': 8.5,
        'Q6_K': 6.56,
        'Q5_K_M': 5.69,
        'Q4_K_M': 4.83,
        'Q4_K_S': 4.50,
        'Q3_K_M': 3.89,
        'Q2_K': 2.62,
    }
    sizes = {t: model_size_B * bpw / 8 * 1.1 for t, bpw in bits_per_weight.items()}
    # Filter by RAM (85% usable).
    candidates = {k: v for k, v in sizes.items() if v < available_ram_GB * 0.85}
    if not candidates:
        return "Model too large for available RAM"
    if quality_priority == 'high':
        # Pick highest quality that fits.
        order = ['Q8_0', 'Q6_K', 'Q5_K_M', 'Q4_K_M']
    elif quality_priority == 'balanced':
        order = ['Q5_K_M', 'Q4_K_M', 'Q4_K_S', 'Q3_K_M']
    else:
        order = ['Q4_K_M', 'Q4_K_S', 'Q3_K_M', 'Q2_K']
    for qtype in order:
        if qtype in candidates:
            return f"{qtype} ({sizes[qtype]:.1f} GB)"
    # Nothing on the preference list fits: fall back to the smallest
    # candidate, explicitly chosen by size (the original relied on dict
    # insertion order) and formatted like the normal return path (the
    # original returned a bare type name here).
    smallest = min(candidates, key=candidates.get)
    return f"{smallest} ({sizes[smallest]:.1f} GB)"
# Examples (expected quant-type picks; sizes computed with the ~10% overhead)
print(recommend_gguf_type(7, 8, 'balanced'))    # -> Q5_K_M (~5.5 GB fits in 8 GB)
print(recommend_gguf_type(7, 4, 'balanced'))    # -> Q2_K (fallback: only type under the 3.4 GB budget)
print(recommend_gguf_type(70, 32, 'balanced'))  # -> Q2_K (fallback: Q4_K_M needs ~46 GB, too big)
print(recommend_gguf_type(70, 64, 'balanced'))  # -> Q4_K_M (~46.5 GB)