Language models hallucinate because they are trained to predict plausible next tokens, not truthful ones. The cross-entropy loss penalizes the model for assigning low probability to the actual next token in the training data. A factually incorrect but grammatically fluent sentence incurs zero extra loss if it never appeared in training. The model has no mechanism to distinguish “I have never seen evidence for this claim” from “This claim is true” — both are patterns it must complete fluently.
This post covers why hallucination is a fundamental property of autoregressive language models, the detection methods that catch hallucinations post-generation, and the prevention strategies that reduce hallucination rates at the architectural and training level.
Why LLMs Hallucinate
The Training Objective Problem
The standard language modeling objective is to maximize the log-likelihood of the training corpus, $\mathcal{L}(\theta) = \sum_{t} \log p_\theta(x_t \mid x_{<t})$:
This objective has three properties that enable hallucination:
1. Plausibility, not truth. The model learns $p_\theta(x_t \mid x_{<t})$ — the probability of token $x_t$ given the preceding context. This captures what text typically looks like, not what is factually correct. “The capital of France is Berlin” has low probability because it does not match training data, but “The capital of Borduria is Szohod” (a made-up claim about a fictional country) would receive whatever probability the model’s distributional statistics assign.
2. No uncertainty signal. The model has no way to express “I don’t know.” Every prompt requires a next-token distribution. If the model’s internal representation is uncertain, that uncertainty is compressed into the next-token probabilities rather than surfaced as an explicit “I’m unsure” signal.
3. Interpolation over memorization. LLMs generalize from training data by interpolating between learned patterns. When a query falls between training examples, the model generates a plausible interpolation — which may be factually wrong.
class HallucinationAnalyzer:
    """Analyze model behavior to understand hallucination patterns."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def measure_confidence_calibration(self, questions_with_answers):
        """
        Measure whether the model's confidence predicts accuracy.

        Well-calibrated: when the model says 80% confident, it's right
        80% of the time.

        Args:
            questions_with_answers: iterable of (question, correct_answer)
                string pairs.

        Returns:
            dict with a per-bin ``calibration`` list and the expected
            calibration error ``ECE``. Empty input yields
            ``{"calibration": [], "ECE": 0.0}`` instead of a
            ZeroDivisionError.
        """
        bins = {i: {"correct": 0, "total": 0} for i in range(10)}
        for question, correct_answer in questions_with_answers:
            # Generate answer and compute confidence.
            response, confidence = self._generate_with_confidence(question)
            # Substring match is a crude correctness proxy; it can
            # over-count (e.g. answer "no" appearing inside "nothing").
            is_correct = correct_answer.lower() in response.lower()
            # Bin by confidence; clamp 1.0 into the top bin.
            bin_idx = min(int(confidence * 10), 9)
            bins[bin_idx]["total"] += 1
            if is_correct:
                bins[bin_idx]["correct"] += 1

        # Compute per-bin calibration error.
        calibration = []
        for i, data in bins.items():
            if data["total"] > 0:
                expected_conf = (i + 0.5) / 10
                actual_acc = data["correct"] / data["total"]
                calibration.append({
                    "confidence_bin": f"{i*10}-{(i+1)*10}%",
                    "expected_confidence": expected_conf,
                    "actual_accuracy": actual_acc,
                    "count": data["total"],
                    "calibration_error": abs(expected_conf - actual_acc),
                })

        total = sum(c["count"] for c in calibration)
        if total == 0:
            # No questions evaluated: report a neutral ECE instead of
            # dividing by zero.
            return {"calibration": [], "ECE": 0.0}
        ece = sum(c["calibration_error"] * c["count"] for c in calibration) / total
        return {"calibration": calibration, "ECE": ece}

    def _generate_with_confidence(self, question):
        """Generate an answer and estimate confidence from token probabilities."""
        prompt = f"Answer this question accurately: {question}\nAnswer:"
        # Use the model's own device rather than hard-coding "cuda" so the
        # analyzer also runs on CPU-only machines.
        device = next(self.model.parameters()).device
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids, max_new_tokens=100,
                output_scores=True, return_dict_in_generate=True,
            )
        # Average per-step max token probability as a confidence proxy.
        token_probs = []
        for step_logits in outputs.scores:
            probs = torch.softmax(step_logits[0], dim=-1)
            token_probs.append(probs.max().item())
        confidence = sum(token_probs) / max(len(token_probs), 1)
        response = self.tokenizer.decode(
            outputs.sequences[0][input_ids.shape[1]:], skip_special_tokens=True
        )
        return response, confidence
Hallucination is not a software bug that can be patched. It is a fundamental consequence of the training objective. The model is doing exactly what it was trained to do: generate plausible continuations of text. “Plausible” and “true” overlap most of the time, but not always. Every mitigation strategy is a trade-off between factual accuracy and generation fluency/capability.
Taxonomy of Hallucinations
class HallucinationType:
    """String constants naming the five hallucination categories."""

    # Factual errors about real entities, e.g. "Einstein was born in 1900" (wrong year).
    FACTUAL = "factual"
    # Fabricated entities, e.g. "According to Dr. James Thornton's 2023 study..." (person doesn't exist).
    FABRICATION = "fabrication"
    # Logical inconsistencies: the model contradicting itself within the same response.
    INCONSISTENCY = "inconsistency"
    # Source attribution errors: attributing a quote to the wrong person.
    ATTRIBUTION = "attribution"
    # Outdated information stated as current, e.g. "The current president of X is Y".
    TEMPORAL = "temporal"
Detection: Self-Consistency
The Method
Generate multiple responses to the same question. If the model gives different answers across samples, the uncertain answers are likely hallucinated:
import torch
from collections import Counter
class SelfConsistencyDetector:
    """Detect hallucinations via self-consistency checking.

    Samples the model several times at non-zero temperature; answers that
    vary across samples are flagged as likely hallucinations.
    """

    def __init__(self, model, tokenizer, num_samples=10):
        self.model = model
        self.tokenizer = tokenizer
        self.num_samples = num_samples

    def check(self, question, temperature=0.7):
        """Check self-consistency for a question.

        Returns a dict with the most common answer, its sample frequency
        (``consistency``), and a ``likely_hallucination`` flag when fewer
        than half the samples agree.
        """
        answers = []
        full_responses = []
        # Use the model's own device instead of hard-coding "cuda", and
        # look it up once rather than per sample.
        device = next(self.model.parameters()).device
        for _ in range(self.num_samples):
            prompt = f"Question: {question}\nAnswer:"
            input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                output = self.model.generate(
                    input_ids, max_new_tokens=200,
                    temperature=temperature, do_sample=True,
                )
            response = self.tokenizer.decode(
                output[0][input_ids.shape[1]:], skip_special_tokens=True
            )
            full_responses.append(response)
            # Extract the core answer for comparison across samples.
            answers.append(self._extract_core_answer(response))

        answer_counts = Counter(answers)
        most_common_answer, most_common_count = answer_counts.most_common(1)[0]
        consistency = most_common_count / self.num_samples
        return {
            "question": question,
            "most_common_answer": most_common_answer,
            "consistency": consistency,
            "likely_hallucination": consistency < 0.5,
            "answer_distribution": dict(answer_counts),
            "num_unique_answers": len(answer_counts),
        }

    def _extract_core_answer(self, response):
        """Extract the core factual claim from a response.

        Heuristic: the first sentence longer than 10 characters,
        lower-cased; falls back to the first 100 characters.
        """
        import re
        sentences = re.split(r'[.!?\n]', response)
        for s in sentences:
            s = s.strip()
            if len(s) > 10:
                return s.lower()
        return response[:100].lower()

    def batch_check(self, questions):
        """Check multiple questions and flag likely hallucinations.

        Empty input returns zeroed statistics instead of raising
        ZeroDivisionError.
        """
        results = [self.check(q) for q in questions]
        if not results:
            return {"results": [], "hallucination_rate": 0.0, "avg_consistency": 0.0}
        hallucination_rate = sum(
            1 for r in results if r["likely_hallucination"]
        ) / len(results)
        return {
            "results": results,
            "hallucination_rate": hallucination_rate,
            "avg_consistency": sum(r["consistency"] for r in results) / len(results),
        }
Self-Consistency Detection Accuracy
(% detection accuracy)

Detection: Retrieval Verification
The Method
Retrieve relevant documents and check whether the model’s claims are supported by the retrieved evidence:
class RetrievalVerifier:
    """Verify model claims against retrieved documents."""

    def __init__(self, retriever, nli_model=None):
        self.retriever = retriever
        # Optional natural-language-inference model; token overlap is the
        # fallback when it is absent.
        self.nli_model = nli_model

    def verify_response(self, question, response, top_k=5):
        """Verify each claim in a response against retrieved documents."""
        # Extract claims, retrieve evidence, then score each claim.
        claims = self._extract_claims(response)
        documents = self.retriever.search(question, top_k=top_k)

        verification_results = []
        for claim in claims:
            support = self._find_support(claim, documents)
            verification_results.append({
                "claim": claim,
                "supported": support["supported"],
                "support_score": support["score"],
                "source": support.get("source", "none"),
            })

        supported_count = sum(1 for entry in verification_results if entry["supported"])
        total_claims = len(verification_results)
        unsupported = [
            entry["claim"] for entry in verification_results if not entry["supported"]
        ]
        return {
            "question": question,
            "response": response,
            "claims": verification_results,
            "support_rate": supported_count / max(total_claims, 1),
            "unsupported_claims": unsupported,
        }

    def _extract_claims(self, response):
        """Extract individual factual claims from a response."""
        import re
        hedges = ("i think", "maybe", "possibly", "might")
        claims = []
        for sentence in re.split(r'(?<=[.!?])\s+', response):
            sentence = sentence.strip()
            # Skip questions, hedged statements, and very short sentences.
            if sentence.endswith('?') or len(sentence) < 20:
                continue
            lowered = sentence.lower()
            if any(h in lowered for h in hedges):
                continue
            claims.append(sentence)
        return claims

    def _find_support(self, claim, documents):
        """Check if any document supports a claim."""
        top_score, top_source = 0.0, None
        for doc in documents:
            entailment = self._compute_entailment(doc["text"], claim)
            if entailment > top_score:
                top_score, top_source = entailment, doc.get("url", "unknown")
        return {
            "supported": top_score > 0.7,
            "score": top_score,
            "source": top_source,
        }

    def _compute_entailment(self, premise, hypothesis):
        """Compute entailment score between a document and a claim."""
        if self.nli_model:
            return self.nli_model.predict(premise, hypothesis)
        # Fallback: fraction of hypothesis tokens present in the premise.
        hyp_tokens = set(hypothesis.lower().split())
        if not hyp_tokens:
            return 0.0
        prem_tokens = set(premise.lower().split())
        return len(prem_tokens & hyp_tokens) / len(hyp_tokens)
Prevention: Retrieval-Augmented Generation (RAG)
RAG Architecture
RAG reduces hallucination by conditioning the model’s response on retrieved evidence:
class RAGPipeline:
    """Retrieval-Augmented Generation pipeline.

    Grounds generation in retrieved documents to reduce hallucination.
    """

    def __init__(self, model, tokenizer, retriever, max_context_tokens=4096):
        self.model = model
        self.tokenizer = tokenizer
        self.retriever = retriever
        self.max_context_tokens = max_context_tokens

    def generate(self, question, top_k=5):
        """Generate a response grounded in retrieved documents.

        Returns a dict with the generated ``response``, the ``sources``
        (URLs) of the retrieved documents, and ``num_context_docs``.
        """
        # Retrieve relevant documents and build the grounding context.
        docs = self.retriever.search(question, top_k=top_k)
        context = self._build_context(docs)
        prompt = (
            f"Use the following context to answer the question. "
            f"Only use information from the provided context. "
            f"If the context does not contain the answer, say "
            f"\"I don't have enough information to answer this.\"\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n\n"
            f"Answer:"
        )
        # Use the model's own device instead of hard-coding "cuda".
        device = next(self.model.parameters()).device
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            output = self.model.generate(
                input_ids, max_new_tokens=500,
                # Low temperature for factual accuracy. do_sample=True is
                # required for temperature to take effect; without it HF
                # generate() silently ignores the argument (greedy decoding).
                temperature=0.3, do_sample=True,
            )
        response = self.tokenizer.decode(
            output[0][input_ids.shape[1]:], skip_special_tokens=True
        )
        return {
            "response": response,
            "sources": [d.get("url", "unknown") for d in docs],
            "num_context_docs": len(docs),
        }

    def _build_context(self, docs):
        """Build a context string from retrieved documents.

        Concatenates "[Source N]: text" entries until the token budget
        (self.max_context_tokens) is reached; the last document is
        truncated to fit, or skipped entirely if no budget remains.
        """
        context_parts = []
        total_tokens = 0
        for i, doc in enumerate(docs):
            doc_text = doc["text"]
            # Encode once and reuse for both counting and truncation.
            doc_ids = self.tokenizer.encode(doc_text)
            if total_tokens + len(doc_ids) > self.max_context_tokens:
                remaining = self.max_context_tokens - total_tokens
                # Skip when no budget remains — avoids emitting an empty
                # "[Source N]: " entry.
                if remaining > 0:
                    truncated = self.tokenizer.decode(doc_ids[:remaining])
                    context_parts.append(f"[Source {i+1}]: {truncated}")
                break
            context_parts.append(f"[Source {i+1}]: {doc_text}")
            total_tokens += len(doc_ids)
        return "\n\n".join(context_parts)
RAG Impact on Hallucination Rates
Hallucination Rates: With and Without RAG
| Task | No RAG | RAG (top-3) | RAG (top-5) | RAG + Verification |
|---|---|---|---|---|
| Factual QA | 23% | 8% | 6% | 3% |
| Biography | 31% | 12% | 9% | 5% |
| Science | 18% | 7% | 5% | 2% |
| Current events | 45% | 15% | 11% | 7% |
| Code explanation | 12% | 5% | 4% | 2% |
Prevention: Constrained Decoding
Token-Level Constraints
Constrain the model’s output to only generate tokens that are consistent with retrieved evidence:
class ConstrainedDecoder:
    """Constrained decoding that biases generation toward tokens that
    appear in the supplied evidence texts."""

    def __init__(self, model, tokenizer, evidence_texts):
        """
        Args:
            model: causal LM whose forward pass returns logits and which
                exposes ``config.vocab_size``.
            tokenizer: tokenizer with encode/decode and ``eos_token_id``.
            evidence_texts: iterable of evidence strings; every token id
                they contain receives a positive logit bias.
        """
        self.model = model
        self.tokenizer = tokenizer
        # Union of all token ids appearing in the evidence.
        self.evidence_tokens = set()
        for text in evidence_texts:
            self.evidence_tokens.update(self.tokenizer.encode(text))

    def generate(self, prompt, max_new_tokens=200, bias_strength=2.0, temperature=0.5):
        """Generate with token-level evidence bias.

        Args:
            prompt: input text.
            max_new_tokens: generation budget.
            bias_strength: logit bonus added to evidence tokens.
            temperature: sampling temperature (previously a hard-coded
                0.5; now a parameter with the same default).
        """
        # Use the model's own device instead of hard-coding "cuda".
        device = next(self.model.parameters()).device
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(device)
        # Create the evidence-bias tensor once, on the model's device.
        vocab_size = self.model.config.vocab_size
        bias = torch.zeros(vocab_size, device=device)
        for token_id in self.evidence_tokens:
            bias[token_id] = bias_strength
        generated = input_ids.clone()
        eos_id = self.tokenizer.eos_token_id
        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(generated)
            logits = outputs.logits[:, -1, :]
            # Apply evidence bias and sample from the biased distribution.
            biased_logits = logits + bias.unsqueeze(0)
            probs = torch.softmax(biased_logits / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=-1)
            # eos_id may be None for tokenizers without an EOS token.
            if eos_id is not None and next_token.item() == eos_id:
                break
        return self.tokenizer.decode(
            generated[0][input_ids.shape[1]:], skip_special_tokens=True
        )
Prevention: Citation Training
Training Models to Cite Sources
class CitationTrainer:
    """Build supervised examples that teach a model to cite its sources."""

    def create_citation_training_data(self, qa_pairs_with_sources):
        """Create training data that teaches citation behavior."""
        training_examples = []
        for item in qa_pairs_with_sources:
            # Annotate the reference answer with inline citation markers.
            cited_answer = self._add_citations(item["answer"], item["sources"])
            instruction = (
                f"Answer the following question using the provided sources. "
                f"Cite your sources using [Source N] format.\n\n"
                f"Sources:\n{self._format_sources(item['sources'])}\n\n"
                f"Question: {item['question']}"
            )
            training_examples.append({
                "instruction": instruction,
                "response": cited_answer,
            })
        return training_examples

    def _add_citations(self, answer, sources):
        """Append a [Source N] marker to each sentence a source supports."""
        import re
        annotated = []
        for sentence in re.split(r'(?<=[.!?])\s+', answer):
            source_idx = self._find_best_source(sentence, sources)
            if source_idx is None:
                annotated.append(sentence)
            else:
                annotated.append(f"{sentence} [Source {source_idx + 1}]")
        return " ".join(annotated)

    def _find_best_source(self, sentence, sources):
        """Index of the source with the highest word overlap (>3 words), or None."""
        sentence_words = set(sentence.lower().split())
        best_idx = None
        best_overlap = 3  # require more than 3 shared words to cite at all
        for idx, source in enumerate(sources):
            shared = len(sentence_words & set(source["text"].lower().split()))
            if shared > best_overlap:
                best_overlap, best_idx = shared, idx
        return best_idx

    def _format_sources(self, sources):
        # One "[Source N]: text" line per source, text capped at 500 chars.
        return "\n".join(
            f"[Source {idx+1}]: {source['text'][:500]}"
            for idx, source in enumerate(sources)
        )
Complete Hallucination Mitigation Pipeline
class HallucinationMitigationPipeline:
    """
    Complete pipeline combining RAG, self-consistency, and verification.
    """

    def __init__(self, model, tokenizer, retriever):
        self.model = model
        self.tokenizer = tokenizer
        self.retriever = retriever
        self.rag = RAGPipeline(model, tokenizer, retriever)
        self.consistency_checker = SelfConsistencyDetector(model, tokenizer)
        self.verifier = RetrievalVerifier(retriever)

    def generate_verified(self, question):
        """Generate a verified, hallucination-checked response."""
        # 1. Grounded generation, 2. consistency sampling, 3. claim
        # verification, 4. flagging of unsupported claims.
        rag_result = self.rag.generate(question)
        consistency = self.consistency_checker.check(question, temperature=0.5)
        verification = self.verifier.verify_response(question, rag_result["response"])
        flagged_response = self._flag_unsupported(
            rag_result["response"], verification["unsupported_claims"]
        )
        # Overall confidence is the weaker of the two signals.
        confidence = min(consistency["consistency"], verification["support_rate"])
        if confidence > 0.8:
            reliability = "high"
        elif confidence > 0.5:
            reliability = "medium"
        else:
            reliability = "low"
        return {
            "response": flagged_response,
            "confidence": confidence,
            "sources": rag_result["sources"],
            "unsupported_claims": verification["unsupported_claims"],
            "consistency_score": consistency["consistency"],
            "support_rate": verification["support_rate"],
            "reliability": reliability,
        }

    def _flag_unsupported(self, response, unsupported_claims):
        """Append an [UNVERIFIED] tag after each unsupported claim."""
        tagged = response
        for claim in unsupported_claims:
            tagged = tagged.replace(claim, f"{claim} [UNVERIFIED]")
        return tagged