This post is the capstone of the Quantization Masterclass series. It walks through the complete production pipeline: starting from an FP16 model checkpoint and ending with a quantized model serving requests in production with quality monitoring. Each step includes the exact commands, scripts, and validation checks. The goal is a reproducible pipeline that you can adapt for any transformer-based LLM.
Pipeline Overview
┌──────────────┐     ┌───────────────┐     ┌──────────────┐
│     FP16     │────▶│  Calibration  │────▶│ Quantization │
│  Checkpoint  │     │   Data Prep   │     │  (GPTQ/AWQ)  │
└──────────────┘     └───────────────┘     └──────┬───────┘
                                                  │
┌───────────────┐     ┌───────────────┐           │
│  Production   │◀────│    Quality    │◀──────────┘
│  Deployment   │     │  Validation   │
└───────┬───────┘     └───────────────┘
        │
┌───────▼───────┐
│  Monitoring   │
│ and Alerting  │
└───────────────┘
The pipeline has six stages. We will implement each one.
Stage 1: Checkpoint Preparation
Start by verifying the FP16 checkpoint is correct and establishing baseline metrics.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import hashlib
import os
class CheckpointPrep:
    """Stage 1: load the FP16 checkpoint, verify it, and record baseline metrics.

    The baseline (perplexity plus metadata) is written to
    ``<output_dir>/baseline_metrics.json`` and is consumed later by the
    validation stage to detect quantization regressions.
    """

    def __init__(self, model_name, output_dir):
        """model_name: HF repo id or local path; output_dir: artifact directory."""
        self.model_name = model_name
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def load_and_verify(self):
        """Load model and tokenizer, print size/architecture stats.

        Returns a dict with 'total_params', 'fp16_size_gb', 'num_layers'.
        """
        print(f"Loading {self.model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        # FP16 stores 2 bytes per parameter.
        total_params = sum(p.numel() for p in self.model.parameters())
        fp16_size_gb = total_params * 2 / (1024**3)
        print(f"Parameters: {total_params / 1e9:.1f}B")
        print(f"FP16 size: {fp16_size_gb:.1f} GB")
        # Echo the architecture so a wrong or corrupt checkpoint is caught early.
        config = self.model.config
        print(f"Architecture: {config.model_type}")
        print(f"Hidden size: {config.hidden_size}")
        print(f"Layers: {config.num_hidden_layers}")
        print(f"Attention heads: {config.num_attention_heads}")
        return {
            'total_params': total_params,
            'fp16_size_gb': fp16_size_gb,
            'num_layers': config.num_hidden_layers
        }

    def compute_baseline_metrics(self, eval_dataset):
        """Compute baseline perplexity and persist it for later comparison."""
        ppl = self._compute_perplexity(eval_dataset)
        print(f"Baseline perplexity: {ppl:.4f}")
        baseline = {
            'model_name': self.model_name,
            'perplexity': ppl,
            # Bug fix: the original stored the GPU name under 'timestamp'.
            # Record an actual UTC timestamp and keep the GPU name separately.
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'gpu': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu',
        }
        with open(os.path.join(self.output_dir, 'baseline_metrics.json'), 'w') as f:
            json.dump(baseline, f, indent=2)
        return baseline

    def _compute_perplexity(self, dataset, max_length=2048):
        """Token-weighted perplexity over an iterable of text samples.

        Raises:
            ValueError: if `dataset` yields no tokens (instead of the
                silent ZeroDivisionError the original produced).
        """
        self.model.eval()
        total_loss = 0.0
        total_tokens = 0
        with torch.no_grad():
            for text in dataset:
                inputs = self.tokenizer(
                    text, return_tensors="pt",
                    max_length=max_length, truncation=True
                ).to(self.model.device)
                n_tokens = inputs['input_ids'].shape[1]
                outputs = self.model(**inputs, labels=inputs['input_ids'])
                # Weight each sample's mean loss by its token count.
                total_loss += outputs.loss.item() * n_tokens
                total_tokens += n_tokens
        if total_tokens == 0:
            raise ValueError("Evaluation dataset produced no tokens")
        avg_loss = total_loss / total_tokens
        return torch.exp(torch.tensor(avg_loss)).item()
# Usage
# Stage 1 entry point: load the 70B checkpoint and print its size/architecture.
prep = CheckpointPrep("meta-llama/Llama-2-70b-hf", "./quant_pipeline_output")
info = prep.load_and_verify()
Stage 2: Calibration Data Preparation
from datasets import load_dataset
import random
class CalibrationDataPrep:
    """Stage 2: build fixed-shape [num_samples, seq_len] token batches for calibration."""

    def __init__(self, tokenizer, num_samples=128, seq_len=2048, seed=42):
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.seq_len = seq_len
        self.seed = seed
        # Bug fix: the original called random.seed(seed), which clobbers the
        # process-wide RNG state for unrelated code. Use a private instance.
        self._rng = random.Random(seed)

    def _random_window(self, tokens):
        """Return one random seq_len-long window from `tokens`, or None if too short."""
        if len(tokens) < self.seq_len:
            return None
        start = self._rng.randint(0, len(tokens) - self.seq_len)
        return tokens[start:start + self.seq_len]

    def prepare_c4(self):
        """Prepare calibration data from the streamed C4 dataset.

        Streams until num_samples windows have been collected; returns a
        LongTensor of shape [num_samples, seq_len].
        """
        dataset = load_dataset(
            "allenai/c4", "en",
            split="train",
            streaming=True
        )
        samples = []
        for item in dataset:
            tokens = self.tokenizer.encode(item['text'], add_special_tokens=False)
            window = self._random_window(tokens)
            if window is not None:
                samples.append(window)
            if len(samples) >= self.num_samples:
                break
        calibration_data = torch.tensor(samples)
        print(f"Calibration data: {calibration_data.shape}")
        # Shape: [num_samples, seq_len], e.g. [128, 2048]
        return calibration_data

    def prepare_domain_specific(self, texts):
        """Prepare calibration data from domain text, topping up with C4 if short."""
        samples = []
        for text in texts:
            tokens = self.tokenizer.encode(text, add_special_tokens=False)
            window = self._random_window(tokens)
            if window is not None:
                samples.append(window)
        if len(samples) < self.num_samples:
            print(f"Warning: only {len(samples)} domain samples. "
                  f"Padding with C4 data.")
            c4_samples = self.prepare_c4()
            needed = self.num_samples - len(samples)
            samples.extend(c4_samples[:needed].tolist())
        return torch.tensor(samples[:self.num_samples])

    def validate_calibration_data(self, data):
        """Sanity-check shape, token-id range, and diversity of calibration data."""
        assert data.shape == (self.num_samples, self.seq_len), \
            f"Expected ({self.num_samples}, {self.seq_len}), got {data.shape}"
        assert data.min() >= 0, "Negative token IDs found"
        assert data.max() < self.tokenizer.vocab_size, "Token ID exceeds vocab"
        # Low diversity means the quantizer only sees a narrow activation range.
        unique_tokens = data.unique().numel()
        diversity = unique_tokens / self.tokenizer.vocab_size
        print(f"Token diversity: {diversity:.1%} of vocab")
        if diversity < 0.1:
            print("WARNING: Low token diversity. Consider more diverse text.")
        return True
Stage 3: Quantization
GPTQ Quantization
# Install AutoGPTQ
# NOTE: auto-gptq builds CUDA extensions at install time; pin exact versions in
# production so quantized checkpoints stay reproducible.
pip install auto-gptq optimum
# GPTQ quantization script
# NOTE(review): the inline script calls load_calibration_data(), which is not
# defined here -- it must be importable in this environment (see Stage 2). Confirm.
# desc_act=False is for Marlin-kernel compatibility; sym=False selects
# asymmetric (zero-point) quantization; true_sequential quantizes layer by layer.
python -c "
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer
import torch
model_name = 'meta-llama/Llama-2-70b-hf'
output_dir = './Llama-2-70B-GPTQ-INT4'
tokenizer = AutoTokenizer.from_pretrained(model_name)
quantize_config = BaseQuantizeConfig(
bits=4,
group_size=128,
desc_act=False, # False for Marlin kernel compatibility
damp_percent=0.01,
sym=False, # Asymmetric quantization
true_sequential=True, # Process layers sequentially
model_name_or_path=model_name
)
model = AutoGPTQForCausalLM.from_pretrained(
model_name,
quantize_config=quantize_config,
torch_dtype=torch.float16
)
# Load calibration data
calibration_data = load_calibration_data(tokenizer, 128, 2048)
examples = [{'input_ids': row.unsqueeze(0)} for row in calibration_data]
# Quantize
model.quantize(examples, batch_size=1)
# Save
model.save_quantized(output_dir)
tokenizer.save_pretrained(output_dir)
print(f'Saved GPTQ model to {output_dir}')
"
AWQ Quantization
# Install AutoAWQ (ships prebuilt CUDA kernels for common torch versions).
pip install autoawq
# AWQ quantization script: activation-aware INT4, group size 128.
# 'version' selects the kernel: GEMM favors batched serving, GEMV single-stream.
# NOTE: AWQ uses its own built-in calibration set here; pass calib_data to
# quantize() if domain-specific calibration is needed.
python -c "
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_name = 'meta-llama/Llama-2-70b-hf'
output_dir = './Llama-2-70B-AWQ-INT4'
model = AutoAWQForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
quant_config = {
'zero_point': True,
'q_group_size': 128,
'w_bit': 4,
'version': 'GEMM' # GEMM for batched, GEMV for single
}
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(output_dir)
tokenizer.save_pretrained(output_dir)
print(f'Saved AWQ model to {output_dir}')
"
FP8 Quantization (for H100)
# vLLM online FP8 quantization (no separate step needed)
# Just specify --quantization fp8 when launching vLLM
# But for pre-quantization with better quality:
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

model_name = "meta-llama/Llama-2-70b-hf"
output_dir = "./Llama-2-70B-FP8"

# Quantize every Linear layer to FP8 but leave lm_head at FP16: the output
# projection is typically the most precision-sensitive layer.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8",
    ignore=["lm_head"],  # Keep output head at FP16
)

# One-shot calibration: no retraining, just enough samples to fit the scales.
oneshot(
    model=model_name,
    recipe=recipe,
    output_dir=output_dir,
    dataset="ultrachat_200k",
    num_calibration_samples=512,
    max_seq_length=2048,
)
Quantization Time and Resources (Llama 70B)
| Method | GPU Required | GPU Memory | CPU RAM | Time | Output Size |
|---|---|---|---|---|---|
| GPTQ INT4 g128 | 1x A100-80GB | ~75 GB | 128 GB | 3-5 hours | 37 GB |
| AWQ INT4 g128 | 1x A100-80GB | ~75 GB | 128 GB | 2-4 hours | 37 GB |
| FP8 (llmcompressor) | 1x H100-80GB | ~75 GB | 64 GB | 1-2 hours | 70 GB |
| FP8 (vLLM online) | Serving GPU | Same as serve | N/A | 0 (at load) | N/A |
Stage 4: Quality Validation
This is the most critical stage. Never deploy a quantized model without validation.
import json
import os
class QuantizationValidator:
    """Stage 4: compare a quantized model against recorded FP16 baselines.

    Thresholds are relative regressions: perplexity may rise by at most
    `tolerance_ppl_pct` percent and task accuracy may drop by at most
    `tolerance_acc_pct` percent.
    """

    def __init__(self, baseline_metrics_path, tolerance_ppl_pct=10.0,
                 tolerance_acc_pct=5.0):
        with open(baseline_metrics_path) as f:
            self.baseline = json.load(f)
        self.tolerance_ppl = tolerance_ppl_pct
        self.tolerance_acc = tolerance_acc_pct

    def validate_perplexity(self, quantized_model, eval_data, tokenizer):
        """Check perplexity regression against the stored baseline.

        NOTE(review): relies on a module-level compute_perplexity() helper
        being available in this script's environment -- confirm before use.
        """
        quant_ppl = compute_perplexity(quantized_model, eval_data, tokenizer)
        baseline_ppl = self.baseline['perplexity']
        delta_pct = (quant_ppl - baseline_ppl) / baseline_ppl * 100
        result = {
            'metric': 'perplexity',
            'baseline': baseline_ppl,
            'quantized': quant_ppl,
            'delta_pct': delta_pct,
            'threshold_pct': self.tolerance_ppl,
            'passed': delta_pct <= self.tolerance_ppl
        }
        status = "PASS" if result['passed'] else "FAIL"
        print(f"Perplexity: {baseline_ppl:.4f} -> {quant_ppl:.4f} "
              f"({delta_pct:+.1f}%) [{status}]")
        return result

    def validate_task_accuracy(self, quantized_model, tasks):
        """Run benchmarks; `tasks` is (name, fn(model)->accuracy, baseline_acc) triples.

        Returns a list of per-task result dicts, each carrying a 'passed' flag.
        """
        results = []
        for task_name, task_fn, baseline_acc in tasks:
            quant_acc = task_fn(quantized_model)
            # Positive delta_pct = accuracy dropped relative to baseline.
            delta_pct = (baseline_acc - quant_acc) / baseline_acc * 100
            passed = delta_pct <= self.tolerance_acc
            result = {
                'task': task_name,
                'baseline_acc': baseline_acc,
                'quantized_acc': quant_acc,
                'delta_pct': delta_pct,
                'passed': passed
            }
            results.append(result)
            status = "PASS" if passed else "FAIL"
            print(f"{task_name}: {baseline_acc:.1%} -> {quant_acc:.1%} "
                  f"({delta_pct:+.1f}%) [{status}]")
        return results

    def validate_output_consistency(self, fp16_model, quant_model,
                                    tokenizer, test_prompts):
        """Compare greedy generations of the FP16 and quantized models."""
        # Bug fix: an empty prompt list previously caused ZeroDivisionError
        # when computing the match rate.
        if not test_prompts:
            print("Output match rate (greedy): n/a (no prompts)")
            return []
        results = []
        for prompt in test_prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                fp16_out = fp16_model.generate(
                    **inputs.to(fp16_model.device),
                    max_new_tokens=100,
                    do_sample=False  # Greedy for reproducibility
                )
                quant_out = quant_model.generate(
                    **inputs.to(quant_model.device),
                    max_new_tokens=100,
                    do_sample=False
                )
            fp16_text = tokenizer.decode(fp16_out[0], skip_special_tokens=True)
            quant_text = tokenizer.decode(quant_out[0], skip_special_tokens=True)
            # Greedy decoding is deterministic, so exact match is meaningful.
            results.append({
                'prompt': prompt[:50],
                'match': fp16_text == quant_text,
                'fp16_len': len(fp16_text),
                'quant_len': len(quant_text)
            })
        match_rate = sum(r['match'] for r in results) / len(results)
        print(f"Output match rate (greedy): {match_rate:.1%}")
        return results

    def generate_report(self, all_results, output_path):
        """Write a JSON report; overall_pass aggregates every nested result dict.

        Bug fix: validate_task_accuracy returns a *list* of dicts, which the
        original aggregation silently skipped -- failed tasks could never fail
        the report. Nested lists are now flattened before aggregation.
        """
        flat = []
        for r in all_results:
            if isinstance(r, dict):
                flat.append(r)
            elif isinstance(r, list):
                flat.extend(x for x in r if isinstance(x, dict))
        report = {
            'model': self.baseline['model_name'],
            'baseline_perplexity': self.baseline['perplexity'],
            'results': all_results,
            'overall_pass': all(r.get('passed', True) for r in flat)
        }
        with open(output_path, 'w') as f:
            json.dump(report, f, indent=2)
        status = "PASSED" if report['overall_pass'] else "FAILED"
        print(f"\nValidation {status}")
        return report
Never skip quality validation. A model that passes perplexity checks can still fail on specific tasks (code generation, math, structured output). Include task-specific benchmarks that match your production use case. If any check fails beyond tolerance, do NOT deploy β investigate and fix first.
Validation Time by Test Suite (70B Model)
(bar chart: minutes per test suite)
Stage 5: Production Deployment
vLLM Deployment Script
#!/usr/bin/env python3
"""Production deployment script for quantized model on vLLM."""
import subprocess
import sys
import json
import os
class ProductionDeployer:
    """Stage 5: validate the environment and launch a vLLM OpenAI-compatible server."""

    def __init__(self, config_path):
        """Load deployment settings from a JSON config file."""
        with open(config_path) as f:
            self.config = json.load(f)

    def pre_flight_checks(self):
        """Verify GPU, model files, and the validation report before serving."""
        import torch
        # Check GPU
        assert torch.cuda.is_available(), "No GPU detected"
        gpu_name = torch.cuda.get_device_name(0)
        # Bug fix: the attribute is `total_memory`; the original `total_mem`
        # raised AttributeError on every run.
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        print(f"GPU: {gpu_name}, {gpu_mem:.0f} GB")
        # Check model exists
        model_path = self.config['model_path']
        assert os.path.exists(model_path), f"Model not found: {model_path}"
        # Refuse to deploy a model whose validation report failed.
        report_path = self.config.get('validation_report')
        if report_path:
            with open(report_path) as f:
                report = json.load(f)
            assert report['overall_pass'], "Validation report shows FAILED"
            print("Validation: PASSED")
        # Check disk space for swap
        swap_gb = self.config.get('swap_space_gb', 16)
        # ... verify disk space
        print("Pre-flight checks: ALL PASSED")
        return True

    def deploy(self):
        """Launch the vLLM server with production settings (blocks until exit)."""
        self.pre_flight_checks()
        cmd = [
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", self.config['model_path'],
            "--quantization", self.config['quantization'],
            "--tensor-parallel-size", str(self.config.get('tp_size', 1)),
            "--max-model-len", str(self.config.get('max_model_len', 4096)),
            "--gpu-memory-utilization", str(self.config.get('gpu_mem_util', 0.90)),
            "--max-num-seqs", str(self.config.get('max_num_seqs', 128)),
            "--max-num-batched-tokens", str(self.config.get('max_batched_tokens', 8192)),
            "--host", self.config.get('host', '0.0.0.0'),
            "--port", str(self.config.get('port', 8000)),
            "--disable-log-requests",
        ]
        if self.config.get('enable_chunked_prefill', True):
            cmd.append("--enable-chunked-prefill")
        print(f"Launching: {' '.join(cmd)}")
        subprocess.run(cmd)
# Example config
deployment_config = {
    "model_path": "./Llama-2-70B-AWQ-INT4",
    "quantization": "awq",          # must match the checkpoint's quantization format
    "tp_size": 1,                   # tensor-parallel degree (GPUs per replica)
    "max_model_len": 4096,          # context length; bounds KV-cache size
    "gpu_mem_util": 0.92,           # fraction of VRAM vLLM may claim
    "max_num_seqs": 128,            # max concurrent sequences in a batch
    "max_batched_tokens": 8192,     # max tokens scheduled per step
    "enable_chunked_prefill": True,
    "host": "0.0.0.0",
    "port": 8000,
    # Pre-flight checks refuse to deploy unless this report shows overall_pass.
    "validation_report": "./quant_pipeline_output/validation_report.json",
    "swap_space_gb": 16
}
Health Check Endpoint
import requests
import time
def health_check(base_url="http://localhost:8000", timeout=300):
    """Poll the vLLM server until healthy, then smoke-test one completion.

    Returns True once /health responds 200 AND a greedy completion of a
    known-answer prompt contains the expected substring; False if `timeout`
    seconds elapse first.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            resp = requests.get(f"{base_url}/health", timeout=5)
            if resp.status_code == 200:
                print("Server is healthy")
                # Verify inference actually works, not just the health route.
                test_resp = requests.post(
                    f"{base_url}/v1/completions",
                    json={
                        "model": "Llama-2-70B-AWQ-INT4",
                        "prompt": "The capital of France is",
                        "max_tokens": 10,
                        "temperature": 0
                    },
                    timeout=30
                )
                if test_resp.status_code == 200:
                    result = test_resp.json()
                    text = result['choices'][0]['text']
                    print(f"Test inference: '{text.strip()}'")
                    if "paris" in text.lower():
                        print("Inference quality: OK")
                        return True
                    else:
                        # Keep retrying until timeout: a warming-up server can
                        # produce odd output on its very first requests.
                        print(f"WARNING: Unexpected output: {text}")
        except requests.RequestException:
            # Bug fix: the original caught only ConnectionError, so a slow
            # startup that produced a connect/read timeout crashed the probe
            # loop instead of retrying.
            pass
        time.sleep(5)
    print("ERROR: Server failed to start within timeout")
    return False
Stage 6: Production Monitoring
from prometheus_client import Histogram, Counter, Gauge, start_http_server
import time
# Define metrics
# Rolling 0-1 quality score from the periodic probe in ProductionMonitor.
QUANT_QUALITY_SCORE = Gauge(
    'llm_quantization_quality_score',
    'Rolling quality score (0-1) based on output validation',
    ['model', 'quantization']
)
# Latency histogram; bucket edges span 10ms to 5s.
INFERENCE_LATENCY = Histogram(
    'llm_inference_latency_seconds',
    'Inference latency in seconds',
    ['model', 'operation'],
    buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]
)
# Instantaneous tokens/second gauge.
TOKEN_THROUGHPUT = Gauge(
    'llm_token_throughput_per_second',
    'Current token generation throughput',
    ['model']
)
# Monotonic count of fired quality-regression alerts, labeled by severity.
QUALITY_REGRESSION_ALERTS = Counter(
    'llm_quality_regression_alerts_total',
    'Number of quality regression alerts fired',
    ['model', 'severity']
)
class ProductionMonitor:
    """Stage 6: periodic quality probes plus Prometheus metrics for a served model."""

    # Keep only this many probe scores; alerting looks at the last 10.
    _HISTORY_LIMIT = 100

    def __init__(self, model_name, quant_method, validation_prompts,
                 expected_outputs):
        self.model_name = model_name
        self.quant_method = quant_method
        self.validation_prompts = validation_prompts
        self.expected_outputs = expected_outputs
        self.quality_history = []

    def run_quality_probe(self, inference_fn):
        """Score greedy outputs against known-good substrings; returns [0, 1].

        inference_fn(prompt, max_tokens=..., temperature=...) must return str.
        """
        total = len(self.validation_prompts)
        # Bug fix: an empty prompt set previously raised ZeroDivisionError.
        if total == 0:
            return 0.0
        correct = 0
        for prompt, expected in zip(self.validation_prompts, self.expected_outputs):
            output = inference_fn(prompt, max_tokens=50, temperature=0)
            if expected.lower() in output.lower():
                correct += 1
        score = correct / total
        self.quality_history.append(score)
        # Bug fix: history previously grew without bound; keep a fixed window.
        del self.quality_history[:-self._HISTORY_LIMIT]
        QUANT_QUALITY_SCORE.labels(
            model=self.model_name,
            quantization=self.quant_method
        ).set(score)
        # Alert when the rolling average of the last 10 probes drops below 80%.
        if len(self.quality_history) >= 10:
            recent_avg = sum(self.quality_history[-10:]) / 10
            if recent_avg < 0.8:
                QUALITY_REGRESSION_ALERTS.labels(
                    model=self.model_name,
                    severity='critical'
                ).inc()
                print(f"ALERT: Quality score {recent_avg:.1%} below threshold")
        return score

    def check_latency_regression(self, p99_threshold_ms=200):
        """Placeholder: query Prometheus for p99 latency and alert above threshold."""
        # Query Prometheus for p99 latency
        # If above threshold, alert
        pass
Complete Pipeline Timeline (Llama 70B, 1x A100-80GB)
| Stage | Duration | Resources | Output | Can Fail? |
|---|---|---|---|---|
| 1. Checkpoint prep | 30 min | 1x A100 + 128GB RAM | baseline_metrics.json | Yes (corrupt checkpoint) |
| 2. Calibration data | 10 min | CPU only | calibration_data.pt (128x2048) | Yes (bad data source) |
| 3a. GPTQ quantization | 3-5 hours | 1x A100 + 128GB RAM | GPTQ checkpoint (37GB) | Yes (OOM, bad config) |
| 3b. AWQ quantization | 2-4 hours | 1x A100 + 128GB RAM | AWQ checkpoint (37GB) | Yes (OOM, bad config) |
| 4. Validation | 3-4 hours | 1x A100 | validation_report.json | Yes (quality regression) |
| 5. Deployment | 5-10 min | Serving GPU | Running server | Yes (OOM, config error) |
| 6. Monitoring | Ongoing | Prometheus + Grafana | Dashboards + alerts | No (always runs) |
Automation with CI/CD
# .github/workflows/quantize-and-deploy.yml
# Manually-triggered pipeline: quantize on an A100 runner, gate on quality
# validation, then deploy from the model registry to the serving runner.
name: Quantize and Deploy LLM
on:
  workflow_dispatch:
    inputs:
      model_name:
        description: 'HuggingFace model name'
        required: true
      quant_method:
        description: 'Quantization method (gptq, awq, fp8)'
        required: true
        default: 'awq'
      target_bits:
        description: 'Target bits (4 or 8)'
        required: true
        default: '4'
jobs:
  quantize:
    # Needs a self-hosted runner with an A100-class GPU.
    runs-on: [self-hosted, gpu-a100]
    steps:
      - uses: actions/checkout@v4
      - name: Setup environment
        run: |
          pip install vllm auto-gptq autoawq transformers torch
      - name: Prepare calibration data
        run: python scripts/prepare_calibration.py --model ${{ inputs.model_name }}
      - name: Run quantization
        run: |
          python scripts/quantize.py \
            --model ${{ inputs.model_name }} \
            --method ${{ inputs.quant_method }} \
            --bits ${{ inputs.target_bits }} \
            --output ./quantized_model
      # Quality gate: a non-zero exit here stops the workflow before upload.
      - name: Validate quality
        run: |
          python scripts/validate.py \
            --model ./quantized_model \
            --baseline ./baseline_metrics.json \
            --tolerance-ppl 10 \
            --tolerance-acc 5
      - name: Upload model artifact
        if: success()
        run: |
          # Upload to model registry (HuggingFace Hub, S3, etc.)
          python scripts/upload_model.py \
            --model ./quantized_model \
            --registry s3://models/quantized/
  deploy:
    # Only runs if quantization AND validation succeeded.
    needs: quantize
    runs-on: [self-hosted, gpu-serving]
    steps:
      - name: Deploy to serving infrastructure
        run: |
          python scripts/deploy.py \
            --model s3://models/quantized/${{ inputs.model_name }} \
            --config ./deploy_config.json
Common Failure Modes and Fixes
Pipeline Failure Modes and Remediation
| Failure | Stage | Symptom | Root Cause | Fix |
|---|---|---|---|---|
| OOM during quantization | 3 | CUDA OOM | FP16 model + calibration data + quantizer state | Use CPU offloading or larger GPU |
| High perplexity | 4 | PPL delta exceeds 10% | Bad calibration data or outlier layers | Try AWQ, reduce group size, or mixed precision |
| MMLU regression | 4 | Accuracy drops more than 5% | Knowledge layers quantized too aggressively | Keep first/last layers at FP16 |
| Garbled output | 5 | Incoherent generation | Wrong quantization config loaded | Verify quantize_config.json matches kernel |
| Serving OOM | 5 | CUDA OOM on load | Model + KV cache exceeds VRAM | Reduce max_model_len or gpu_memory_utilization |
| Latency regression | 6 | P99 above SLA | Dequantization overhead at high batch | Switch to FP8 for large batch workloads |
The most dangerous failure mode is when perplexity looks fine but specific task categories degrade. For example, INT4 quantization may preserve general language quality while destroying math reasoning or code generation accuracy. Always validate on task-specific benchmarks that match your production use case, not just perplexity.
Complete Pipeline Script
#!/bin/bash
# End-to-end quantization pipeline: baseline -> calibration -> quantize ->
# validate -> serve. Aborts before deployment if validation fails.
# -e: stop on first error; -u: unset variables are errors; pipefail: a pipeline
# fails if any stage of it fails.
set -euo pipefail
# Configuration
MODEL="meta-llama/Llama-2-70b-hf"
METHOD="awq"
BITS=4
OUTPUT_DIR="./pipeline_output"
SERVING_PORT=8000
echo "=== Stage 1: Baseline ==="
python scripts/baseline.py --model "$MODEL" --output "$OUTPUT_DIR"
echo "=== Stage 2: Calibration Data ==="
python scripts/calibration.py --model "$MODEL" --samples 128 --output "$OUTPUT_DIR"
echo "=== Stage 3: Quantization ==="
python scripts/quantize.py \
  --model "$MODEL" \
  --method "$METHOD" \
  --bits "$BITS" \
  --calibration "$OUTPUT_DIR/calibration_data.pt" \
  --output "$OUTPUT_DIR/quantized_model"
echo "=== Stage 4: Validation ==="
python scripts/validate.py \
  --model "$OUTPUT_DIR/quantized_model" \
  --baseline "$OUTPUT_DIR/baseline_metrics.json" \
  --tolerance-ppl 10 \
  --tolerance-acc 5 \
  --report "$OUTPUT_DIR/validation_report.json"
# Check validation result
# Hard gate: never deploy a model whose report says overall_pass != True.
PASSED=$(python -c "import json; r=json.load(open('$OUTPUT_DIR/validation_report.json')); print(r['overall_pass'])")
if [ "$PASSED" != "True" ]; then
    echo "ERROR: Validation FAILED. Aborting deployment."
    exit 1
fi
echo "=== Stage 5: Deployment ==="
# Server runs in the background; the health check below gates success.
python -m vllm.entrypoints.openai.api_server \
  --model "$OUTPUT_DIR/quantized_model" \
  --quantization "$METHOD" \
  --port "$SERVING_PORT" \
  --gpu-memory-utilization 0.92 \
  --max-model-len 4096 &
# Wait for server
# NOTE(review): 30s is rarely enough for a 70B load; health_check.py is assumed
# to poll with its own longer timeout -- confirm.
sleep 30
python scripts/health_check.py --url "http://localhost:$SERVING_PORT"
echo "=== Pipeline Complete ==="
echo "Model serving at http://localhost:$SERVING_PORT"
Pipeline Cost Breakdown (Llama 70B AWQ INT4, Cloud A100)
(bar chart: cost in USD per pipeline stage)
Summary
The end-to-end quantization pipeline has six stages: checkpoint preparation, calibration data generation, quantization execution, quality validation, production deployment, and ongoing monitoring. The most critical stage is validation β a model that fails quality checks should never reach production. For most deployments, AWQ INT4 on vLLM provides the best balance of quality, performance, and operational simplicity. FP8 is the easiest path on H100 hardware. Automating the pipeline with CI/CD ensures reproducibility and prevents human error. The total cost to quantize and validate a 70B model is under $30 on cloud GPUs, while the ongoing savings from 4x memory reduction and 3x throughput improvement pay for themselves within hours of production serving.