Hallucination Detection
Step-by-step walkthrough for detecting and flagging hallucinated content in LLM outputs, covering factual grounding checks, self-consistency verification, source attribution validation, and confidence scoring.
Hallucinations are confidently stated falsehoods generated by LLMs. In a security context, hallucinations are dangerous because users trust model outputs, and fabricated facts -- false API endpoints, nonexistent library functions, incorrect security configurations -- can lead to real vulnerabilities. This walkthrough builds a multi-method hallucination detection system that scores response reliability before content reaches the user.
Step 1: Categorize Hallucination Types
# hallucination/types.py
"""
Hallucination type taxonomy for detection targeting.
"""
from dataclasses import dataclass
from enum import Enum
class HallucinationType(str, Enum):
    """Taxonomy of hallucination categories used to target detection methods."""

    FACTUAL = "factual"          # Incorrect facts
    FABRICATION = "fabrication"  # Invented entities, URLs, citations
    ATTRIBUTION = "attribution"  # Misattributed quotes or sources
    TEMPORAL = "temporal"        # Incorrect dates or timelines
    TECHNICAL = "technical"      # Wrong technical details (APIs, code)
    LOGICAL = "logical"          # Self-contradictory statements


@dataclass
class HallucinationFlag:
    """A single detection finding tied to a span of model output."""

    hallucination_type: HallucinationType  # category from the taxonomy above
    text_span: str                         # excerpt of the output that triggered the flag
    confidence: float                      # detector confidence, expected in [0, 1]
    explanation: str                       # human-readable rationale for the flag
    source_check: str = ""                 # optional note from source verification

# Step 2: Implement Self-Consistency Checking
# hallucination/consistency.py
"""
Self-consistency check: generate multiple responses and compare.
Inconsistencies across samples suggest hallucination.
"""
import numpy as np
from dataclasses import dataclass
@dataclass
class ConsistencyResult:
    """Outcome of comparing claims across several sampled responses."""

    consistency_score: float        # fraction of claims repeated by >= 60% of samples
    num_samples: int                # number of responses compared (original + resamples)
    agreement_rate: float           # same value as consistency_score, kept for API symmetry
    inconsistent_claims: list[str]  # up to 5 claims repeated by fewer than half the samples


class SelfConsistencyChecker:
    """Detect likely hallucinations by re-sampling the LLM and comparing claims.

    Rationale: fabricated details tend to vary between stochastic samples,
    while grounded facts are restated consistently.
    """

    def __init__(self, llm_client, num_samples: int = 3, temperature: float = 0.7):
        # llm_client must expose generate(prompt, temperature=...) -> str.
        self.llm = llm_client
        self.num_samples = num_samples
        self.temperature = temperature  # > 0 so resamples actually vary

    def check(self, prompt: str, original_response: str) -> ConsistencyResult:
        """Re-sample the prompt and score how consistently claims recur."""
        responses = [original_response]
        for _ in range(self.num_samples - 1):
            responses.append(
                self.llm.generate(prompt, temperature=self.temperature)
            )

        # Extract sentence-level claims from every sample.
        claims_per_response = [self._extract_claims(r) for r in responses]

        # Union of all claims seen in any sample.
        all_claims = set()
        for claims in claims_per_response:
            all_claims.update(claims)

        # Count how many samples contain each claim (fuzzy word-overlap match).
        agreement_counts = {}
        for claim in all_claims:
            agreement_counts[claim] = sum(
                1 for claims in claims_per_response
                if self._claim_present(claim, claims)
            )

        total_claims = len(all_claims) if all_claims else 1
        # A claim "agrees" when at least 60% of samples repeat it.
        agreed = sum(
            1 for c in agreement_counts.values()
            if c >= self.num_samples * 0.6
        )
        agreement_rate = agreed / total_claims

        # Claims repeated by fewer than half the samples are suspect.
        inconsistent = [
            claim for claim, count in agreement_counts.items()
            if count < self.num_samples * 0.5
        ]
        return ConsistencyResult(
            consistency_score=agreement_rate,
            num_samples=self.num_samples,
            agreement_rate=agreement_rate,
            inconsistent_claims=inconsistent[:5],
        )

    def _extract_claims(self, text: str) -> list[str]:
        """Split text into sentence-level claims, dropping short fragments."""
        sentences = text.replace("\n", " ").split(".")
        return [s.strip() for s in sentences if len(s.strip()) > 20]

    def _claim_present(self, claim: str, claim_list: list[str]) -> bool:
        """Fuzzy membership: >60% word overlap with any claim in the list."""
        claim_words = set(claim.lower().split())
        for other in claim_list:
            other_words = set(other.lower().split())
            overlap = len(claim_words & other_words) / max(len(claim_words), 1)
            if overlap > 0.6:
                return True
        return False

# Step 3: Build Source Attribution Verification
# hallucination/attribution.py
"""
Verify that claims in model output are supported by source documents.
Essential for RAG applications.
"""
from dataclasses import dataclass
from typing import Optional
@dataclass
class AttributionResult:
    """Support verdict for one claim against the provided source documents."""

    claim: str
    supported: bool                        # True when best similarity >= 0.7
    support_score: float                   # dot-product similarity of best-matching source
    source_document: Optional[str] = None  # best source, attached when similarity >= 0.5
    source_snippet: Optional[str] = None   # most relevant sentence of that source


class AttributionVerifier:
    """Check that each claim in a response is grounded in source documents.

    Essential for RAG: unsupported claims are likely hallucinated.
    embedding_model must expose embed(text) -> vector and
    embed_batch(texts) -> matrix whose rows are comparable via dot product.
    """

    def __init__(self, embedding_model):
        self.model = embedding_model

    def verify_response(
        self, response: str, source_documents: list[str]
    ) -> list[AttributionResult]:
        """Return one AttributionResult per claim extracted from response.

        NOTE(review): assumes source_documents is non-empty; an empty list
        would make argmax fail -- confirm callers guarantee this.
        """
        claims = self._extract_claims(response)
        results = []
        # Embed sources once; every claim is compared against all of them.
        source_embeddings = self.model.embed_batch(source_documents)
        for claim in claims:
            claim_embedding = self.model.embed(claim)
            # Dot-product similarity against every source; pick the best.
            similarities = source_embeddings @ claim_embedding
            best_idx = int(similarities.argmax())
            best_score = float(similarities[best_idx])
            # Weaker 0.5 threshold for citing context even when unsupported.
            attach_source = best_score >= 0.5
            results.append(AttributionResult(
                claim=claim,
                supported=best_score >= 0.7,
                support_score=best_score,
                source_document=source_documents[best_idx] if attach_source else None,
                source_snippet=self._find_snippet(
                    claim, source_documents[best_idx]
                ) if attach_source else None,
            ))
        return results

    def _extract_claims(self, text: str) -> list[str]:
        """Split the response into sentence-level claims, dropping fragments."""
        sentences = text.replace("\n", " ").split(".")
        return [s.strip() for s in sentences if len(s.strip()) > 15]

    def _find_snippet(self, claim: str, document: str) -> str:
        """Return the document sentence with the most word overlap (<= 200 chars)."""
        sentences = document.split(".")
        if not sentences:  # defensive; str.split always yields >= 1 element
            return ""
        claim_words = set(claim.lower().split())
        best_sentence = max(
            sentences,
            key=lambda s: len(set(s.lower().split()) & claim_words),
        )
        return best_sentence.strip()[:200]

# Step 4: Create a Technical Claim Validator
# hallucination/technical.py
"""
Validate technical claims: URLs, package names, API endpoints, code.
"""
import re
from dataclasses import dataclass
@dataclass
class TechnicalValidation:
    """Verdict for a single technical artifact found in the output."""

    claim_type: str  # "url", "package", or "import"
    value: str       # the artifact text as matched
    valid: bool      # False only when a known-bad pattern matched
    reason: str      # human-readable explanation of the verdict


class TechnicalClaimValidator:
    """Scan output text for URLs, pip packages, and imports, and sanity-check them.

    Pattern-based only: it flags obviously fabricated artifacts and marks
    everything else "verify" rather than guaranteeing correctness.
    """

    def validate_response(self, text: str) -> list[TechnicalValidation]:
        """Return validations for every URL, pip package, and import found in text."""
        results = []
        # URLs.
        for url in re.findall(r'https?://[^\s<>"\']+', text):
            results.append(self._validate_url(url))
        # Python package names from "pip install ..." commands.
        for pkg in re.findall(r'pip install\s+(\S+)', text):
            results.append(self._validate_package(pkg))
        # Module names from import statements.
        for imp in re.findall(r'(?:from|import)\s+([\w.]+)', text):
            results.append(self._validate_import(imp))
        return results

    def _validate_url(self, url: str) -> TechnicalValidation:
        """Flag URLs matching patterns typical of fabricated endpoints."""
        suspicious = [
            "example-api.com", "fake-endpoint",
            "/v999/", "/api/nonexistent",
        ]
        for pattern in suspicious:
            if pattern in url:
                return TechnicalValidation(
                    "url", url, False,
                    f"Suspicious URL pattern: {pattern}"
                )
        return TechnicalValidation("url", url, True, "Pattern check passed")

    def _validate_package(self, package: str) -> TechnicalValidation:
        """Accept well-known packages; mark the rest for manual PyPI verification."""
        # Known valid packages (expand as needed).
        common_packages = {
            "numpy", "pandas", "scikit-learn", "torch", "tensorflow",
            "fastapi", "flask", "django", "requests", "openai",
        }
        if package in common_packages:
            return TechnicalValidation("package", package, True, "Known package")
        # Unknown is NOT invalid: absence from the allowlist is weak evidence.
        return TechnicalValidation(
            "package", package, True,
            "Unknown package -- verify on PyPI"
        )

    def _validate_import(self, module: str) -> TechnicalValidation:
        """Accept stdlib imports; mark anything else for availability checks."""
        top_level = module.split(".")[0]
        stdlib = {
            "os", "sys", "json", "re", "pathlib", "datetime",
            "typing", "dataclasses", "logging", "hashlib",
        }
        if top_level in stdlib:
            return TechnicalValidation("import", module, True, "Standard library")
        return TechnicalValidation("import", module, True, "Verify availability")

# Step 5: Build the Unified Hallucination Scorer
# hallucination/scorer.py
"""
Unified hallucination risk score combining all detection methods.
"""
from dataclasses import dataclass, field
@dataclass
class HallucinationScore:
    """Aggregated hallucination assessment for one response."""

    overall_risk: float              # 0.0 (reliable) .. 1.0 (likely hallucinated)
    consistency_score: float = 1.0   # detectors default to 1.0 == "not checked, assume fine"
    attribution_score: float = 1.0
    technical_score: float = 1.0
    flags: list = field(default_factory=list)  # human-readable findings
    recommendation: str = "allow"    # "allow" | "flag" | "block"


class HallucinationScorer:
    """Combine detector outputs into one weighted hallucination risk score."""

    def __init__(
        self,
        consistency_weight: float = 0.3,
        attribution_weight: float = 0.4,
        technical_weight: float = 0.3,
    ):
        self.weights = {
            "consistency": consistency_weight,
            "attribution": attribution_weight,
            "technical": technical_weight,
        }

    def score(
        self,
        consistency_result=None,
        attribution_results=None,
        technical_results=None,
    ) -> HallucinationScore:
        """Score whichever detector results are supplied; absent ones are skipped.

        NOTE(review): when NO results are supplied, the weighted average is
        0 / 0.01, so risk comes out 1.0 and the recommendation is "block" --
        i.e. the scorer fails closed. Confirm this is intended behavior.
        """
        scores = {}
        flags = []

        if consistency_result:
            scores["consistency"] = consistency_result.consistency_score
            if consistency_result.consistency_score < 0.5:
                flags.append("Low self-consistency across samples")

        if attribution_results:
            supported = sum(1 for r in attribution_results if r.supported)
            total = max(len(attribution_results), 1)
            scores["attribution"] = supported / total
            unsupported = [r for r in attribution_results if not r.supported]
            for u in unsupported[:3]:  # cap flag noise at three claims
                flags.append(f"Unsupported claim: {u.claim[:60]}...")

        if technical_results:
            valid = sum(1 for r in technical_results if r.valid)
            total = max(len(technical_results), 1)
            scores["technical"] = valid / total
            invalid = [r for r in technical_results if not r.valid]
            for i in invalid:
                flags.append(f"Invalid {i.claim_type}: {i.value[:60]}")

        # Weighted average over the detectors that actually ran.
        total_weight = 0.0
        weighted_sum = 0.0
        for key, weight in self.weights.items():
            if key in scores:
                weighted_sum += weight * scores[key]
                total_weight += weight
        # max(..., 0.01) guards the divide-by-zero when no detectors ran.
        reliability = weighted_sum / max(total_weight, 0.01)
        risk = 1.0 - reliability

        recommendation = "allow"
        if risk > 0.7:
            recommendation = "block"
        elif risk > 0.4:
            recommendation = "flag"

        return HallucinationScore(
            overall_risk=round(risk, 3),
            consistency_score=scores.get("consistency", 1.0),
            attribution_score=scores.get("attribution", 1.0),
            technical_score=scores.get("technical", 1.0),
            flags=flags,
            recommendation=recommendation,
        )

# Step 6: Deploy as an Output Postprocessor
# hallucination/api.py
from fastapi import FastAPI
from pydantic import BaseModel
from hallucination.technical import TechnicalClaimValidator
from hallucination.scorer import HallucinationScorer
# Module-level singletons shared across all requests.
app = FastAPI(title="Hallucination Detection")
validator = TechnicalClaimValidator()
scorer = HallucinationScorer()


class CheckRequest(BaseModel):
    """Request body for POST /check."""
    response_text: str  # the LLM output to screen
    # Optional grounding documents. Pydantic deep-copies field defaults per
    # instance, so the mutable [] default is safe here (unlike a plain
    # function default) -- but note this endpoint does not yet use them.
    source_documents: list[str] = []


class CheckResponse(BaseModel):
    """Response body for POST /check."""
    risk_score: float    # 0..1 overall hallucination risk from the scorer
    recommendation: str  # "allow" | "flag" | "block"
    flags: list[str]     # human-readable findings
@app.post("/check", response_model=CheckResponse)
async def check_hallucination(request: CheckRequest):
    """Screen a response for hallucination risk.

    Only the technical validator runs here (stateless, no LLM or embedding
    calls); consistency and attribution checks require an LLM client and an
    embedding model and are wired in separately. request.source_documents
    is accepted but not yet consumed by this endpoint.
    """
    tech_results = validator.validate_response(request.response_text)
    score = scorer.score(technical_results=tech_results)
    return CheckResponse(
        risk_score=score.overall_risk,
        recommendation=score.recommendation,
        flags=score.flags,
    )

# Run the service with: uvicorn hallucination.api:app --port 8520
# Step 7: Test Hallucination Detection
# tests/test_hallucination.py
import pytest
from hallucination.technical import TechnicalClaimValidator
from hallucination.scorer import HallucinationScorer
@pytest.fixture
def validator():
    """Provide a fresh TechnicalClaimValidator for each test."""
    return TechnicalClaimValidator()
def test_valid_import_accepted(validator):
results = validator.validate_response("import json\nimport os")
assert all(r.valid for r in results)
def test_suspicious_url_flagged(validator):
results = validator.validate_response(
"Use the API at https://fake-endpoint.com/v999/data"
)
assert any(not r.valid for r in results)
def test_scorer_high_risk():
    # Two invalid technical findings should push risk past the flag threshold.
    from hallucination.technical import TechnicalValidation

    failing = [
        TechnicalValidation("url", "https://fake.com", False, "Fake"),
        TechnicalValidation("package", "nonexistent-pkg", False, "Unknown"),
    ]
    outcome = HallucinationScorer().score(technical_results=failing)
    assert outcome.overall_risk > 0.5
    assert outcome.recommendation in ("flag", "block")
def test_scorer_low_risk():
    """All-valid technical findings should keep the risk near zero."""
    scorer = HallucinationScorer()
    from hallucination.technical import TechnicalValidation
    good_results = [
        TechnicalValidation("import", "json", True, "stdlib"),
        TechnicalValidation("import", "os", True, "stdlib"),
    ]
    score = scorer.score(technical_results=good_results)
    assert score.overall_risk < 0.3

# Run the suite with: pytest tests/test_hallucination.py -v

# Related Topics
- Output Content Classifier -- Broader output harm classification
- Structured Output Validation -- Schema-based output validation
- LLM Judge Implementation -- Using an LLM to judge output quality
- Model Drift Monitoring -- Tracking hallucination rates over time
Why is self-consistency checking effective at detecting confabulations -- details the model invents at random, which therefore vary between samples -- but not at detecting systematic factual errors that the model reproduces consistently across samples?