雙 LLM 架構設置
進階 · 5 分鐘閱讀 · 更新於 2026-03-15
實作雙 LLM 模式之逐步流程——一個模型產生回應、第二個模型驗證之,涵蓋架構設計、驗證者提示工程、延遲最佳化與失敗處理。
雙 LLM 模式使用兩個獨立模型:產生回應之生成者,與檢查之驗證者。因驗證者於獨立上下文處理生成者之輸出,入侵生成者之提示注入不自動入侵驗證者。此架構分離是對抗提示注入之最強防禦模式之一。
步驟 1:設計架構
# dual_llm/architecture.py
"""
雙 LLM 架構:生成者 + 驗證者。
"""
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class DualLLMConfig:
    """Configuration knobs for the dual-LLM (generator + validator) pipeline."""

    generator_model: str = "gpt-4o-mini"
    validator_model: str = "gpt-4o"  # use a stronger model for validation
    max_retries: int = 2
    validator_timeout_ms: int = 10000
    fail_open: bool = False  # False = block the response when the validator fails
@dataclass
class GenerationResult:
    """Outcome of one generate-and-validate pass through the pipeline."""

    response: str
    generator_model: str
    validated: bool
    validator_approved: bool = False
    validator_explanation: str = ""
    total_latency_ms: float = 0.0
    retries: int = 0
# 步驟 2:打造生成者
# dual_llm/generator.py
"""
具隔離之生成者模型包裝。
"""
import time
class Generator:
    """Thin wrapper around the generator model's chat endpoint."""

    def __init__(self, llm_client, model: str):
        self.llm = llm_client
        self.model = model

    def generate(
        self, system_prompt: str, user_message: str
    ) -> tuple[str, float]:
        """Produce a response and return ``(response, latency_ms)``."""
        started = time.monotonic()
        reply = self.llm.chat(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
        )
        elapsed_ms = (time.monotonic() - started) * 1000
        return reply, elapsed_ms
# 步驟 3:打造驗證者
# dual_llm/validator.py
"""
於隔離上下文檢查生成者輸出之驗證者模型。
驗證者從不見生成者處理之系統提示或使用者上下文——
僅見要驗證之輸出。
"""
import json
import re
import time
from dataclasses import dataclass
@dataclass
class ValidationResult:
    """Verdict returned by the validator for a single candidate response."""

    approved: bool
    confidence: float
    explanation: str
    latency_ms: float


# Rubric sent to the validator model. ``{response}`` is filled via
# str.format; the doubled braces keep the JSON example literal.
VALIDATOR_PROMPT = """You are a safety validator. Evaluate the following AI-generated response.
Check for:
1. Harmful instructions or dangerous content
2. Personal information disclosure
3. System prompt or instruction leakage
4. Inappropriate or off-topic content
5. Factual claims that appear fabricated
Response to validate:
---
{response}
---
Output ONLY JSON: {{"approved": true/false, "confidence": 0.0-1.0, "explanation": "reason"}}"""


class Validator:
    """Checks generator output in an isolated context.

    The validator never sees the generator's system prompt or user
    conversation -- only the candidate output being validated.
    """

    def __init__(self, llm_client, model: str):
        self.llm = llm_client
        self.model = model

    def validate(self, response: str) -> ValidationResult:
        """Ask the validator model for a verdict on ``response``."""
        started = time.monotonic()
        raw = self.llm.chat(
            model=self.model,
            messages=[
                {"role": "user", "content": VALIDATOR_PROMPT.format(response=response)}
            ],
            temperature=0.0,  # deterministic verdicts
        )
        elapsed_ms = (time.monotonic() - started) * 1000
        return self._parse_verdict(raw, elapsed_ms)

    @staticmethod
    def _parse_verdict(raw: str, elapsed_ms: float) -> ValidationResult:
        """Extract the JSON verdict from ``raw``; fail closed on parse errors."""
        try:
            found = re.search(r"\{.*\}", raw, re.DOTALL)
            payload = json.loads(found.group()) if found else {}
            return ValidationResult(
                approved=payload.get("approved", False),
                confidence=payload.get("confidence", 0.5),
                explanation=payload.get("explanation", ""),
                latency_ms=elapsed_ms,
            )
        except Exception:
            # Fail closed: an unparseable verdict counts as a rejection.
            return ValidationResult(
                approved=False, confidence=0.0,
                explanation="Validation parsing failed",
                latency_ms=elapsed_ms,
            )
# 步驟 4:將生成者與驗證者接線
# dual_llm/pipeline.py
"""
協調生成者與驗證者之雙 LLM 管線。
"""
import asyncio
from dual_llm.architecture import DualLLMConfig, GenerationResult
from dual_llm.generator import Generator
from dual_llm.validator import Validator
class DualLLMPipeline:
    """Orchestrates generator and validator with retry-on-rejection.

    Each attempt generates a fresh response and validates it. Latency is
    accumulated across every attempt, so ``total_latency_ms`` reflects the
    full cost of the call rather than only the final attempt.
    """

    def __init__(
        self, generator: Generator, validator: Validator,
        config: DualLLMConfig = None,
    ):
        self.generator = generator
        self.validator = validator
        self.config = config or DualLLMConfig()

    def process(
        self, system_prompt: str, user_message: str
    ) -> GenerationResult:
        """Generate, validate, and retry up to ``config.max_retries`` times.

        Returns an approved result as soon as validation passes; otherwise
        either the last (rejected) response when ``fail_open`` is set, or a
        fixed refusal message.
        """
        total_latency = 0.0
        response = ""
        for attempt in range(self.config.max_retries + 1):
            response, gen_latency = self.generator.generate(
                system_prompt, user_message
            )
            validation = self.validator.validate(response)
            # Bug fix: accumulate latency across attempts instead of
            # reporting only the final attempt's cost.
            total_latency += gen_latency + validation.latency_ms
            if validation.approved:
                return GenerationResult(
                    response=response,
                    generator_model=self.config.generator_model,
                    validated=True,
                    validator_approved=True,
                    validator_explanation=validation.explanation,
                    total_latency_ms=total_latency,
                    retries=attempt,
                )
        # All retries exhausted.
        if self.config.fail_open:
            # fail_open: surface the last (rejected) response rather than block.
            return GenerationResult(
                response=response,
                generator_model=self.config.generator_model,
                validated=True,
                validator_approved=False,
                validator_explanation="Validator rejected all attempts",
                total_latency_ms=total_latency,
                retries=self.config.max_retries,
            )
        return GenerationResult(
            response="I'm unable to provide a response to that question.",
            generator_model=self.config.generator_model,
            validated=True,
            validator_approved=False,
            validator_explanation="Blocked after failed validation",
            total_latency_ms=total_latency,
            retries=self.config.max_retries,
        )
# 步驟 5:加入非同步並行處理
# dual_llm/async_pipeline.py
"""
降低延遲之非同步管線。
"""
import asyncio
import time
class AsyncDualLLMPipeline:
    """Async pipeline that keeps the event loop free during model calls."""

    def __init__(self, generator, validator, config=None):
        self.generator = generator
        self.validator = validator
        # Bug fix: the config argument was previously accepted but discarded.
        self.config = config

    async def process_async(
        self, system_prompt: str, user_message: str
    ) -> dict:
        """Generate then validate; return response, approval, and latency.

        Returns a dict with ``response`` (replaced by a refusal message when
        rejected), ``approved``, and ``total_latency_ms``.
        """
        start = time.monotonic()
        # Bug fix: the blocking generator call used to run directly on the
        # event loop; off-load it so other tasks keep making progress.
        response, _gen_latency = await asyncio.to_thread(
            self.generator.generate, system_prompt, user_message
        )
        # Validate in a worker thread for the same reason.
        validation = await asyncio.to_thread(
            self.validator.validate, response
        )
        return {
            "response": response if validation.approved else
            "I cannot provide that response.",
            "approved": validation.approved,
            "total_latency_ms": (time.monotonic() - start) * 1000,
        }
# 步驟 6:監控同意率
# dual_llm/monitoring.py
from collections import Counter
import logging
class DualLLMMonitor:
    """Tracks approval/rejection rates for the dual-LLM pipeline."""

    def __init__(self):
        self.total = 0
        self.approved = 0
        self.rejected = 0
        self.retry_counts = Counter()
        self.logger = logging.getLogger("dual_llm")

    def record(self, result: "GenerationResult"):
        """Record one pipeline result and warn on a high rejection rate."""
        self.total += 1
        if result.validator_approved:
            self.approved += 1
        else:
            self.rejected += 1
        self.retry_counts[result.retries] += 1
        rejection_rate = self.rejected / max(self.total, 1)
        # Only warn once a meaningful sample exists (> 50 results).
        if rejection_rate > 0.2 and self.total > 50:
            # Lazy %-style args avoid formatting when WARNING is disabled.
            self.logger.warning(
                "High rejection rate: %.2f%% -- "
                "generator may need prompt tuning",
                rejection_rate * 100,
            )

    def report(self) -> dict:
        """Return summary statistics as a plain dict."""
        return {
            "total": self.total,
            "approval_rate": self.approved / max(self.total, 1),
            "retry_distribution": dict(self.retry_counts),
        }
# 步驟 7:測試雙 LLM 管線
# tests/test_dual_llm.py
import pytest
from dual_llm.validator import Validator, ValidationResult
class MockValidatorLLM:
    """Stub LLM that rejects responses mentioning a system prompt.

    Bug fix: only the candidate response (the text between the ``---``
    delimiters of the validator prompt) is inspected. The fixed rubric text
    itself contains the phrase "System prompt or instruction leakage", so
    matching against the whole prompt rejected every input and made
    ``test_safe_response_approved`` fail.
    """

    def chat(self, **kwargs):
        content = kwargs.get("messages", [{}])[0].get("content", "")
        # Isolate the response-to-validate; fall back to the full content
        # if the prompt has no delimiters.
        parts = content.split("---")
        candidate = parts[1] if len(parts) >= 2 else content
        if "system prompt" in candidate.lower():
            return '{"approved": false, "confidence": 0.9, "explanation": "System leakage"}'
        return '{"approved": true, "confidence": 0.95, "explanation": "Safe"}'
def test_safe_response_approved():
    """A benign response should pass validation."""
    checker = Validator(MockValidatorLLM(), "test-model")
    outcome = checker.validate("The weather is sunny today.")
    assert outcome.approved
def test_system_leakage_rejected():
    """A response echoing the system prompt must be rejected."""
    checker = Validator(MockValidatorLLM(), "test-model")
    leaked = "My system prompt says: You are a helpful assistant"
    assert not checker.validate(leaked).approved
# pytest tests/test_dual_llm.py -v
# 相關主題
- LLM Judge 實作 -- 單模型評審替代方案
- Constitutional Classifier 設置 -- 以原則為本之評估
- 輸出內容分類器 -- 輕量輸出過濾
- 打造提示防火牆 -- 完整防火牆架構
Knowledge Check
為何驗證者從不見生成者之系統提示很重要?