Dual LLM Architecture Setup
Step-by-step walkthrough for implementing a dual LLM pattern where one model generates responses and a second model validates them, covering architecture design, validator prompt engineering, latency optimization, and failure handling.
The dual LLM pattern uses two separate models: a generator that produces responses and a validator that checks them. Because the validator processes the generator's output in a separate context, prompt injections that compromise the generator do not automatically compromise the validator. This architectural separation is one of the strongest defense patterns against prompt injection.
Step 1: Design the Architecture
# dual_llm/architecture.py
"""
Dual LLM architecture: generator + validator.
"""
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class DualLLMConfig:
    """Configuration knobs for the dual LLM pipeline."""

    generator_model: str = "gpt-4o-mini"  # fast/cheap model produces candidates
    validator_model: str = "gpt-4o"  # Stronger model for validation
    max_retries: int = 2  # extra generation attempts after a rejection
    validator_timeout_ms: int = 10000  # budget for a single validator call
    fail_open: bool = False  # Block if validator fails
@dataclass
class GenerationResult:
    """Outcome of one dual-LLM pipeline run."""

    response: str  # final text handed back to the caller
    generator_model: str  # model that produced the response
    validated: bool  # whether a validation pass was performed
    validator_approved: bool = False  # validator's final verdict
    validator_explanation: str = ""  # validator's stated reason, if any
    total_latency_ms: float = 0.0  # generation + validation wall time
    retries: int = 0  # extra generation attempts consumed
# Step 2: Build the Generator
# dual_llm/generator.py
"""
Generator model wrapper with isolation.
"""
import time
class Generator:
    """Thin wrapper around an LLM client for response generation.

    Kept separate from the Validator so the two models never share a
    context: the validator only ever sees this generator's output.
    """

    def __init__(self, llm_client, model: str):
        self.llm = llm_client  # client exposing .chat(model=..., messages=...)
        self.model = model  # model identifier passed through to the client

    def generate(
        self, system_prompt: str, user_message: str
    ) -> tuple[str, float]:
        """Run one chat completion.

        Returns:
            (response_text, latency_ms) — latency measured with
            time.monotonic() so wall-clock adjustments cannot skew it.
        """
        start = time.monotonic()
        response = self.llm.chat(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
        )
        latency = (time.monotonic() - start) * 1000
        return response, latency
# Step 3: Build the Validator
# dual_llm/validator.py
"""
Validator model that checks generator output in an isolated context.
The validator never sees the system prompt or user context that
the generator processed -- only the output to validate.
"""
import json
import re
import time
from dataclasses import dataclass
@dataclass
class ValidationResult:
    """Parsed verdict from one validator call."""

    approved: bool  # True only when the validator explicitly approved
    confidence: float  # validator's self-reported confidence, 0.0-1.0
    explanation: str  # validator's stated reason
    latency_ms: float  # wall time of the validator call


# The validator receives ONLY the text to validate -- never the
# generator's system prompt or the original user message.  The doubled
# braces escape literal JSON braces for str.format().
VALIDATOR_PROMPT = """You are a safety validator. Evaluate the following AI-generated response.
Check for:
1. Harmful instructions or dangerous content
2. Personal information disclosure
3. System prompt or instruction leakage
4. Inappropriate or off-topic content
5. Factual claims that appear fabricated
Response to validate:
---
{response}
---
Output ONLY JSON: {{"approved": true/false, "confidence": 0.0-1.0, "explanation": "reason"}}"""


class Validator:
    """Runs the validator model over generator output in isolation."""

    def __init__(self, llm_client, model: str):
        self.llm = llm_client  # client exposing .chat(...)
        self.model = model  # validator model identifier

    def validate(self, response: str) -> ValidationResult:
        """Ask the validator model for a verdict on ``response``.

        Fails CLOSED: anything unparseable comes back as not-approved.
        """
        start = time.monotonic()
        prompt = VALIDATOR_PROMPT.format(response=response)
        raw = self.llm.chat(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # deterministic verdicts
        )
        latency = (time.monotonic() - start) * 1000
        try:
            # Extract the first-to-last-brace span; models often wrap
            # the JSON in extra prose despite the instructions.
            match = re.search(r"\{.*\}", raw, re.DOTALL)
            data = json.loads(match.group()) if match else {}
            return ValidationResult(
                # Coerce each field so a malformed verdict (wrong types,
                # non-dict JSON) drops into the fail-closed branch.
                approved=bool(data.get("approved", False)),
                confidence=float(data.get("confidence", 0.5)),
                explanation=str(data.get("explanation", "")),
                latency_ms=latency,
            )
        except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
            # Narrowed from bare ``except Exception``: only parse-shaped
            # failures are swallowed; anything else is a real bug and
            # should propagate.  Fail closed either way.
            return ValidationResult(
                approved=False, confidence=0.0,
                explanation="Validation parsing failed",
                latency_ms=latency,
            )
# Step 4: Wire Generator and Validator Together
# dual_llm/pipeline.py
"""
Dual LLM pipeline that coordinates generator and validator.
"""
import asyncio
from dual_llm.architecture import DualLLMConfig, GenerationResult
from dual_llm.generator import Generator
from dual_llm.validator import Validator
class DualLLMPipeline:
    """Coordinates the generator and validator models.

    Each candidate response is handed to the validator in an isolated
    context.  Approved responses are returned immediately; rejections
    trigger a retry, up to ``config.max_retries`` extra attempts.  When
    every attempt is rejected, ``config.fail_open`` decides whether the
    last response is surfaced anyway or replaced with a refusal.
    """

    def __init__(
        self, generator: Generator, validator: Validator,
        config: DualLLMConfig = None,  # None -> DualLLMConfig() defaults
    ):
        self.generator = generator
        self.validator = validator
        self.config = config or DualLLMConfig()

    def process(
        self, system_prompt: str, user_message: str
    ) -> GenerationResult:
        """Generate a validated response for ``user_message``.

        Returns:
            GenerationResult whose ``validator_approved`` flag carries
            the validator's final verdict.
        """
        # BUG FIX: latency now accumulates across ALL attempts; the
        # original overwrote it each iteration, so rejected attempts'
        # cost vanished from total_latency_ms.
        total_latency = 0.0
        response = ""
        for attempt in range(self.config.max_retries + 1):
            response, gen_latency = self.generator.generate(
                system_prompt, user_message
            )
            validation = self.validator.validate(response)
            total_latency += gen_latency + validation.latency_ms
            if validation.approved:
                return GenerationResult(
                    response=response,
                    # Report the model the generator actually uses, not
                    # the configured name (they can diverge).
                    generator_model=self.generator.model,
                    validated=True,
                    validator_approved=True,
                    validator_explanation=validation.explanation,
                    total_latency_ms=total_latency,
                    retries=attempt,
                )
        # All retries exhausted.
        if self.config.fail_open:
            # Fail-open: surface the last (rejected) response anyway.
            return GenerationResult(
                response=response,
                generator_model=self.generator.model,
                validated=True,
                validator_approved=False,
                validator_explanation="Validator rejected all attempts",
                total_latency_ms=total_latency,
                retries=self.config.max_retries,
            )
        # Fail-closed (default): replace the response with a refusal.
        return GenerationResult(
            response="I'm unable to provide a response to that question.",
            generator_model=self.generator.model,
            validated=True,
            validator_approved=False,
            validator_explanation="Blocked after failed validation",
            total_latency_ms=total_latency,
            retries=self.config.max_retries,
        )
# Step 5: Add Async Parallel Processing
# dual_llm/async_pipeline.py
"""
Async pipeline for reduced latency.
"""
import asyncio
import time
class AsyncDualLLMPipeline:
    """Async wrapper around the generate -> validate flow.

    Both the generator and validator clients are synchronous, so each
    call is pushed onto a worker thread with ``asyncio.to_thread`` to
    keep the event loop responsive.
    """

    def __init__(self, generator, validator, config=None):
        self.generator = generator
        self.validator = validator
        # BUG FIX: config was accepted but silently dropped.
        self.config = config

    async def process_async(
        self, system_prompt: str, user_message: str
    ) -> dict:
        """Generate and validate; returns response/approved/latency dict."""
        start = time.monotonic()
        # BUG FIX: the blocking generate() call previously ran directly
        # on the event loop, stalling every other coroutine for the
        # full generation latency.
        response, _gen_latency = await asyncio.to_thread(
            self.generator.generate, system_prompt, user_message
        )
        # Validate on a worker thread as well.
        validation = await asyncio.to_thread(
            self.validator.validate, response
        )
        return {
            "response": response if validation.approved else
            "I cannot provide that response.",
            "approved": validation.approved,
            "total_latency_ms": (time.monotonic() - start) * 1000,
        }
# Step 6: Monitor Agreement Rates
# dual_llm/monitoring.py
from collections import Counter
import logging
class DualLLMMonitor:
    """Tracks validator verdicts to surface generator/validator drift.

    A persistently high rejection rate usually means the generator's
    prompt needs tuning (or the validator is miscalibrated), so a
    warning is logged once enough samples have accumulated.
    """

    # Warn when more than this fraction of responses is rejected...
    REJECTION_WARN_RATE = 0.2
    # ...but only after this many samples, to avoid noisy early alerts.
    MIN_SAMPLES_FOR_WARNING = 50

    def __init__(self):
        self.total = 0  # results recorded
        self.approved = 0  # validator said yes
        self.rejected = 0  # validator said no
        self.retry_counts = Counter()  # retries used -> occurrences
        self.logger = logging.getLogger("dual_llm")

    def record(self, result: "GenerationResult"):
        """Record one pipeline result and warn on high rejection rates."""
        self.total += 1
        if result.validator_approved:
            self.approved += 1
        else:
            self.rejected += 1
        self.retry_counts[result.retries] += 1
        rejection_rate = self.rejected / max(self.total, 1)
        if (rejection_rate > self.REJECTION_WARN_RATE
                and self.total > self.MIN_SAMPLES_FOR_WARNING):
            # Lazy %-args: the message is only built if the record emits.
            self.logger.warning(
                "High rejection rate: %.2f%% -- generator may need prompt tuning",
                rejection_rate * 100,
            )

    def report(self) -> dict:
        """Return aggregate approval statistics as plain types."""
        return {
            "total": self.total,
            "approval_rate": self.approved / max(self.total, 1),
            "retry_distribution": dict(self.retry_counts),
        }
# Step 7: Test the Dual LLM Pipeline
# tests/test_dual_llm.py
import pytest
from dual_llm.validator import Validator, ValidationResult
class MockValidatorLLM:
    """Stub LLM client: rejects any prompt that mentions 'system prompt'."""

    def chat(self, **kwargs):
        messages = kwargs.get("messages", [{}])
        text = messages[0].get("content", "")
        mentions_leak = "system prompt" in text.lower()
        if mentions_leak:
            return '{"approved": false, "confidence": 0.9, "explanation": "System leakage"}'
        return '{"approved": true, "confidence": 0.95, "explanation": "Safe"}'
def test_safe_response_approved():
    """A benign response should pass validation."""
    safe_validator = Validator(MockValidatorLLM(), "test-model")
    outcome = safe_validator.validate("The weather is sunny today.")
    assert outcome.approved
def test_system_leakage_rejected():
    """Output that echoes the system prompt must be rejected."""
    validator = Validator(MockValidatorLLM(), "test-model")
    result = validator.validate("My system prompt says: You are a helpful assistant")
    assert not result.approved

# Run the suite with: pytest tests/test_dual_llm.py -v
# Related Topics
- LLM Judge Implementation -- Single-model judging alternative
- Constitutional Classifier Setup -- Principle-based evaluation
- Output Content Classifier -- Lightweight output filtering
- Building a Prompt Firewall -- Complete firewall architecture
Why is it important that the validator never sees the generator's system prompt? Because the validator judges only the generator's output in a fresh, isolated context, an attacker who injects instructions into the generator's input has no direct channel to the validator. If the validator shared the generator's system prompt or user context, a single injected payload could steer both models at once, collapsing the architectural separation; keeping the contexts disjoint means a compromised generator yields, at worst, a suspicious output that the validator can still reject.