Hallucination Detection
Step-by-step walkthrough for detecting and flagging hallucinated content in LLM outputs, covering factual grounding checks, self-consistency verification, source attribution validation, and confidence scoring.
Hallucinations are confidently stated falsehoods generated by LLMs. In a security context, hallucinations are dangerous because users trust model outputs, and fabricated facts -- false API endpoints, nonexistent library functions, incorrect security configurations -- can lead to real vulnerabilities. This walkthrough builds a multi-method hallucination detection system that scores response reliability before content reaches the user.
Step 1: Categorize Hallucination Types
# hallucination/types.py
"""
Hallucination type taxonomy for detection targeting.
"""
from dataclasses import dataclass
from enum import Enum
class HallucinationType(str, Enum):
    """Taxonomy of hallucination categories used to target detection methods."""

    FACTUAL = "factual"          # Incorrect facts
    FABRICATION = "fabrication"  # Invented entities, URLs, citations
    ATTRIBUTION = "attribution"  # Misattributed quotes or sources
    TEMPORAL = "temporal"        # Incorrect dates or timelines
    TECHNICAL = "technical"      # Wrong technical details (APIs, code)
    LOGICAL = "logical"          # Self-contradictory statements


@dataclass
class HallucinationFlag:
    """A single detection finding tied to a span of model output."""

    hallucination_type: HallucinationType  # category from the taxonomy above
    text_span: str                         # excerpt of the output that triggered the flag
    confidence: float                      # detector confidence, expected in [0, 1]
    explanation: str                       # human-readable rationale for the flag
    source_check: str = ""                 # optional note from source verification

# Step 2: Implement Self-Consistency Checking
# hallucination/consistency.py
"""
Self-consistency check: generate multiple responses and compare.
Inconsistencies across samples suggest hallucination.
"""
import numpy as np
from dataclasses import dataclass
@dataclass
class ConsistencyResult:
    """Outcome of comparing claims across several sampled responses."""

    consistency_score: float        # fraction of claims repeated by >= 60% of samples
    num_samples: int                # number of responses compared (original + resamples)
    agreement_rate: float           # same value as consistency_score, kept for API symmetry
    inconsistent_claims: list[str]  # up to 5 claims repeated by fewer than half the samples


class SelfConsistencyChecker:
    """Detect likely hallucinations by re-sampling the LLM and comparing claims.

    Rationale: fabricated details tend to vary between stochastic samples,
    while grounded facts are restated consistently.
    """

    def __init__(self, llm_client, num_samples: int = 3, temperature: float = 0.7):
        # llm_client must expose generate(prompt, temperature=...) -> str.
        self.llm = llm_client
        self.num_samples = num_samples
        self.temperature = temperature  # > 0 so resamples actually vary

    def check(self, prompt: str, original_response: str) -> ConsistencyResult:
        """Re-sample the prompt and score how consistently claims recur."""
        responses = [original_response]
        for _ in range(self.num_samples - 1):
            responses.append(
                self.llm.generate(prompt, temperature=self.temperature)
            )

        # Extract sentence-level claims from every sample.
        claims_per_response = [self._extract_claims(r) for r in responses]

        # Union of all claims seen in any sample.
        all_claims = set()
        for claims in claims_per_response:
            all_claims.update(claims)

        # Count how many samples contain each claim (fuzzy word-overlap match).
        agreement_counts = {}
        for claim in all_claims:
            agreement_counts[claim] = sum(
                1 for claims in claims_per_response
                if self._claim_present(claim, claims)
            )

        total_claims = len(all_claims) if all_claims else 1
        # A claim "agrees" when at least 60% of samples repeat it.
        agreed = sum(
            1 for c in agreement_counts.values()
            if c >= self.num_samples * 0.6
        )
        agreement_rate = agreed / total_claims

        # Claims repeated by fewer than half the samples are suspect.
        inconsistent = [
            claim for claim, count in agreement_counts.items()
            if count < self.num_samples * 0.5
        ]
        return ConsistencyResult(
            consistency_score=agreement_rate,
            num_samples=self.num_samples,
            agreement_rate=agreement_rate,
            inconsistent_claims=inconsistent[:5],
        )

    def _extract_claims(self, text: str) -> list[str]:
        """Split text into sentence-level claims, dropping short fragments."""
        sentences = text.replace("\n", " ").split(".")
        return [s.strip() for s in sentences if len(s.strip()) > 20]

    def _claim_present(self, claim: str, claim_list: list[str]) -> bool:
        """Fuzzy membership: >60% word overlap with any claim in the list."""
        claim_words = set(claim.lower().split())
        for other in claim_list:
            other_words = set(other.lower().split())
            overlap = len(claim_words & other_words) / max(len(claim_words), 1)
            if overlap > 0.6:
                return True
        return False

# Step 3: Build Source Attribution Verification
# hallucination/attribution.py
"""
Verify that claims in model output are supported by source documents.
Essential for RAG applications.
"""
from dataclasses import dataclass
from typing import Optional
@dataclass
class AttributionResult:
    """Support verdict for one claim against the provided source documents."""

    claim: str
    supported: bool                        # True when best similarity >= 0.7
    support_score: float                   # dot-product similarity of best-matching source
    source_document: Optional[str] = None  # best source, attached when similarity >= 0.5
    source_snippet: Optional[str] = None   # most relevant sentence of that source


class AttributionVerifier:
    """Check that each claim in a response is grounded in source documents.

    Essential for RAG: unsupported claims are likely hallucinated.
    embedding_model must expose embed(text) -> vector and
    embed_batch(texts) -> matrix whose rows are comparable via dot product.
    """

    def __init__(self, embedding_model):
        self.model = embedding_model

    def verify_response(
        self, response: str, source_documents: list[str]
    ) -> list[AttributionResult]:
        """Return one AttributionResult per claim extracted from response.

        NOTE(review): assumes source_documents is non-empty; an empty list
        would make argmax fail -- confirm callers guarantee this.
        """
        claims = self._extract_claims(response)
        results = []
        # Embed sources once; every claim is compared against all of them.
        source_embeddings = self.model.embed_batch(source_documents)
        for claim in claims:
            claim_embedding = self.model.embed(claim)
            # Dot-product similarity against every source; pick the best.
            similarities = source_embeddings @ claim_embedding
            best_idx = int(similarities.argmax())
            best_score = float(similarities[best_idx])
            # Weaker 0.5 threshold for citing context even when unsupported.
            attach_source = best_score >= 0.5
            results.append(AttributionResult(
                claim=claim,
                supported=best_score >= 0.7,
                support_score=best_score,
                source_document=source_documents[best_idx] if attach_source else None,
                source_snippet=self._find_snippet(
                    claim, source_documents[best_idx]
                ) if attach_source else None,
            ))
        return results

    def _extract_claims(self, text: str) -> list[str]:
        """Split the response into sentence-level claims, dropping fragments."""
        sentences = text.replace("\n", " ").split(".")
        return [s.strip() for s in sentences if len(s.strip()) > 15]

    def _find_snippet(self, claim: str, document: str) -> str:
        """Return the document sentence with the most word overlap (<= 200 chars)."""
        sentences = document.split(".")
        if not sentences:  # defensive; str.split always yields >= 1 element
            return ""
        claim_words = set(claim.lower().split())
        best_sentence = max(
            sentences,
            key=lambda s: len(set(s.lower().split()) & claim_words),
        )
        return best_sentence.strip()[:200]

# Step 4: Create a Technical Claim Validator
# hallucination/technical.py
"""
Validate technical claims: URLs, package names, API endpoints, code.
"""
import re
from dataclasses import dataclass
@dataclass
class TechnicalValidation:
    """Verdict for a single technical artifact found in the output."""

    claim_type: str  # "url", "package", or "import"
    value: str       # the artifact text as matched
    valid: bool      # False only when a known-bad pattern matched
    reason: str      # human-readable explanation of the verdict


class TechnicalClaimValidator:
    """Scan output text for URLs, pip packages, and imports, and sanity-check them.

    Pattern-based only: it flags obviously fabricated artifacts and marks
    everything else "verify" rather than guaranteeing correctness.
    """

    def validate_response(self, text: str) -> list[TechnicalValidation]:
        """Return validations for every URL, pip package, and import found in text."""
        results = []
        # URLs.
        for url in re.findall(r'https?://[^\s<>"\']+', text):
            results.append(self._validate_url(url))
        # Python package names from "pip install ..." commands.
        for pkg in re.findall(r'pip install\s+(\S+)', text):
            results.append(self._validate_package(pkg))
        # Module names from import statements.
        for imp in re.findall(r'(?:from|import)\s+([\w.]+)', text):
            results.append(self._validate_import(imp))
        return results

    def _validate_url(self, url: str) -> TechnicalValidation:
        """Flag URLs matching patterns typical of fabricated endpoints."""
        suspicious = [
            "example-api.com", "fake-endpoint",
            "/v999/", "/api/nonexistent",
        ]
        for pattern in suspicious:
            if pattern in url:
                return TechnicalValidation(
                    "url", url, False,
                    f"Suspicious URL pattern: {pattern}"
                )
        return TechnicalValidation("url", url, True, "Pattern check passed")

    def _validate_package(self, package: str) -> TechnicalValidation:
        """Accept well-known packages; mark the rest for manual PyPI verification."""
        # Known valid packages (expand as needed).
        common_packages = {
            "numpy", "pandas", "scikit-learn", "torch", "tensorflow",
            "fastapi", "flask", "django", "requests", "openai",
        }
        if package in common_packages:
            return TechnicalValidation("package", package, True, "Known package")
        # Unknown is NOT invalid: absence from the allowlist is weak evidence.
        return TechnicalValidation(
            "package", package, True,
            "Unknown package -- verify on PyPI"
        )

    def _validate_import(self, module: str) -> TechnicalValidation:
        """Accept stdlib imports; mark anything else for availability checks."""
        top_level = module.split(".")[0]
        stdlib = {
            "os", "sys", "json", "re", "pathlib", "datetime",
            "typing", "dataclasses", "logging", "hashlib",
        }
        if top_level in stdlib:
            return TechnicalValidation("import", module, True, "Standard library")
        return TechnicalValidation("import", module, True, "Verify availability")

# Step 5: Build the Unified Hallucination Scorer
# hallucination/scorer.py
"""
Unified hallucination risk score combining all detection methods.
"""
from dataclasses import dataclass, field
@dataclass
class HallucinationScore:
    """Aggregated hallucination assessment for one response."""

    overall_risk: float              # 0.0 (reliable) .. 1.0 (likely hallucinated)
    consistency_score: float = 1.0   # detectors default to 1.0 == "not checked, assume fine"
    attribution_score: float = 1.0
    technical_score: float = 1.0
    flags: list = field(default_factory=list)  # human-readable findings
    recommendation: str = "allow"    # "allow" | "flag" | "block"


class HallucinationScorer:
    """Combine detector outputs into one weighted hallucination risk score."""

    def __init__(
        self,
        consistency_weight: float = 0.3,
        attribution_weight: float = 0.4,
        technical_weight: float = 0.3,
    ):
        self.weights = {
            "consistency": consistency_weight,
            "attribution": attribution_weight,
            "technical": technical_weight,
        }

    def score(
        self,
        consistency_result=None,
        attribution_results=None,
        technical_results=None,
    ) -> HallucinationScore:
        """Score whichever detector results are supplied; absent ones are skipped.

        NOTE(review): when NO results are supplied, the weighted average is
        0 / 0.01, so risk comes out 1.0 and the recommendation is "block" --
        i.e. the scorer fails closed. Confirm this is intended behavior.
        """
        scores = {}
        flags = []

        if consistency_result:
            scores["consistency"] = consistency_result.consistency_score
            if consistency_result.consistency_score < 0.5:
                flags.append("Low self-consistency across samples")

        if attribution_results:
            supported = sum(1 for r in attribution_results if r.supported)
            total = max(len(attribution_results), 1)
            scores["attribution"] = supported / total
            unsupported = [r for r in attribution_results if not r.supported]
            for u in unsupported[:3]:  # cap flag noise at three claims
                flags.append(f"Unsupported claim: {u.claim[:60]}...")

        if technical_results:
            valid = sum(1 for r in technical_results if r.valid)
            total = max(len(technical_results), 1)
            scores["technical"] = valid / total
            invalid = [r for r in technical_results if not r.valid]
            for i in invalid:
                flags.append(f"Invalid {i.claim_type}: {i.value[:60]}")

        # Weighted average over the detectors that actually ran.
        total_weight = 0.0
        weighted_sum = 0.0
        for key, weight in self.weights.items():
            if key in scores:
                weighted_sum += weight * scores[key]
                total_weight += weight
        # max(..., 0.01) guards the divide-by-zero when no detectors ran.
        reliability = weighted_sum / max(total_weight, 0.01)
        risk = 1.0 - reliability

        recommendation = "allow"
        if risk > 0.7:
            recommendation = "block"
        elif risk > 0.4:
            recommendation = "flag"

        return HallucinationScore(
            overall_risk=round(risk, 3),
            consistency_score=scores.get("consistency", 1.0),
            attribution_score=scores.get("attribution", 1.0),
            technical_score=scores.get("technical", 1.0),
            flags=flags,
            recommendation=recommendation,
        )

# Step 6: Deploy as an Output Postprocessor
# hallucination/api.py
from fastapi import FastAPI
from pydantic import BaseModel
from hallucination.technical import TechnicalClaimValidator
from hallucination.scorer import HallucinationScorer
# Module-level singletons shared across all requests.
app = FastAPI(title="Hallucination Detection")
validator = TechnicalClaimValidator()
scorer = HallucinationScorer()


class CheckRequest(BaseModel):
    """Request body for POST /check."""
    response_text: str  # the LLM output to screen
    # Optional grounding documents. Pydantic deep-copies field defaults per
    # instance, so the mutable [] default is safe here (unlike a plain
    # function default) -- but note this endpoint does not yet use them.
    source_documents: list[str] = []


class CheckResponse(BaseModel):
    """Response body for POST /check."""
    risk_score: float    # 0..1 overall hallucination risk from the scorer
    recommendation: str  # "allow" | "flag" | "block"
    flags: list[str]     # human-readable findings
@app.post("/check", response_model=CheckResponse)
async def check_hallucination(request: CheckRequest):
    """Screen a response for hallucination risk.

    Only the technical validator runs here (stateless, no LLM or embedding
    calls); consistency and attribution checks require an LLM client and an
    embedding model and are wired in separately. request.source_documents
    is accepted but not yet consumed by this endpoint.
    """
    tech_results = validator.validate_response(request.response_text)
    score = scorer.score(technical_results=tech_results)
    return CheckResponse(
        risk_score=score.overall_risk,
        recommendation=score.recommendation,
        flags=score.flags,
    )

# Run the service with: uvicorn hallucination.api:app --port 8520
# Step 7: Test Hallucination Detection
# tests/test_hallucination.py
import pytest
from hallucination.technical import TechnicalClaimValidator
from hallucination.scorer import HallucinationScorer
@pytest.fixture
def validator():
    """Provide a fresh TechnicalClaimValidator for each test."""
    return TechnicalClaimValidator()
def test_valid_import_accepted(validator):
results = validator.validate_response("import json\nimport os")
assert all(r.valid for r in results)
def test_suspicious_url_flagged(validator):
results = validator.validate_response(
"Use the API at https://fake-endpoint.com/v999/data"
)
assert any(not r.valid for r in results)
def test_scorer_high_risk():
    # Two invalid technical findings should push risk past the flag threshold.
    from hallucination.technical import TechnicalValidation

    failing = [
        TechnicalValidation("url", "https://fake.com", False, "Fake"),
        TechnicalValidation("package", "nonexistent-pkg", False, "Unknown"),
    ]
    outcome = HallucinationScorer().score(technical_results=failing)
    assert outcome.overall_risk > 0.5
    assert outcome.recommendation in ("flag", "block")
def test_scorer_low_risk():
    """All-valid technical findings should keep the risk near zero."""
    scorer = HallucinationScorer()
    from hallucination.technical import TechnicalValidation
    good_results = [
        TechnicalValidation("import", "json", True, "stdlib"),
        TechnicalValidation("import", "os", True, "stdlib"),
    ]
    score = scorer.score(technical_results=good_results)
    assert score.overall_risk < 0.3

# Run the suite with: pytest tests/test_hallucination.py -v

# Related Topics
- Output Content Classifier -- Broader output harm classification
- Structured Output Validation -- Schema-based output validation
- LLM Judge Implementation -- Using an LLM to judge output quality
- Model Drift Monitoring -- Tracking hallucination rates over time
Why is self-consistency checking effective at detecting confabulations -- details the model invents at random, which therefore vary between samples -- but not at detecting systematic factual errors that the model reproduces consistently across samples?