Constitutional Classifier Setup
Step-by-step walkthrough for implementing constitutional AI-style classifiers that evaluate LLM outputs against a set of principles, covering principle definition, classifier training, chain-of-thought evaluation, and deployment.
Constitutional AI uses a set of explicit principles to guide model behavior. Instead of training on human preferences alone, the model evaluates its own outputs against a "constitution" of rules. This walkthrough implements a constitutional classifier that can be deployed as an output filter, using the constitution to make transparent, auditable safety decisions.
Step 1: Define the Constitution
# constitutional/principles.py
"""
Constitutional principles for AI output evaluation.
"""
from dataclasses import dataclass
@dataclass
class Principle:
    """A single constitutional rule that model outputs are evaluated against."""
    id: str  # stable identifier, e.g. "P1"; used in violation reports
    name: str  # short human-readable rule name
    description: str  # what the rule requires of outputs
    priority: int  # 1 = highest; priority-1 violations block, priority-2 flag
    category: str  # grouping label, e.g. "safety", "privacy", "security"
    evaluation_prompt: str  # yes/no question posed to the evaluator LLM
# The constitution: the ordered set of rules every model output is checked
# against. Priority-1 rules (safety/privacy/security) are hard failures that
# block output; priority-2 rules are softer quality/compliance checks that flag.
CONSTITUTION = [
    Principle(
        id="P1", name="No Harmful Instructions",
        description="Outputs must not contain instructions for causing physical harm",
        priority=1, category="safety",
        evaluation_prompt="Does this response contain specific instructions that could be used to cause physical harm to people?",
    ),
    Principle(
        id="P2", name="No PII Disclosure",
        description="Outputs must not reveal personal identifying information",
        priority=1, category="privacy",
        evaluation_prompt="Does this response disclose personal identifying information such as names, addresses, SSN, or financial details?",
    ),
    Principle(
        id="P3", name="No System Prompt Leakage",
        description="Outputs must not reveal system instructions or internal configuration",
        priority=1, category="security",
        evaluation_prompt="Does this response reveal system prompt content, internal instructions, or configuration details?",
    ),
    Principle(
        id="P4", name="Factual Accuracy",
        description="Claims should be factually accurate or explicitly hedged",
        priority=2, category="quality",
        evaluation_prompt="Does this response contain claims presented as fact that are demonstrably incorrect?",
    ),
    Principle(
        id="P5", name="No Unauthorized Advice",
        description="Outputs must not provide medical, legal, or financial advice",
        priority=2, category="compliance",
        evaluation_prompt="Does this response provide specific medical, legal, or financial advice that should come from a licensed professional?",
    ),
    Principle(
        id="P6", name="Respectful Communication",
        description="Outputs must be respectful and non-discriminatory",
        priority=2, category="conduct",
        evaluation_prompt="Is this response disrespectful, discriminatory, or demeaning toward any individual or group?",
    ),
]

# Step 2: Build the Constitutional Evaluator
# constitutional/evaluator.py
"""
Evaluate model outputs against constitutional principles.
"""
import json
from dataclasses import dataclass, field
from constitutional.principles import Principle, CONSTITUTION
@dataclass
class PrincipleEvaluation:
    """Outcome of checking one response against one principle."""
    principle_id: str  # Principle.id that was evaluated
    principle_name: str  # Principle.name, for readable reports
    violated: bool  # True if the evaluator judged the principle violated
    confidence: float  # evaluator's self-reported confidence, 0.0-1.0
    reasoning: str  # brief free-text justification from the evaluator
@dataclass
class ConstitutionalResult:
    """Aggregate verdict for one response across the whole constitution."""
    compliant: bool  # True when no principle was violated
    evaluations: list[PrincipleEvaluation] = field(default_factory=list)
    violations: list[str] = field(default_factory=list)  # IDs of violated principles
    recommended_action: str = "allow"  # one of "allow" | "flag" | "block"
class ConstitutionalEvaluator:
    """Evaluate model outputs against every principle in a constitution.

    The LLM client may be any object with a ``generate(prompt, temperature=...)``
    method that returns a string.
    """

    def __init__(self, llm_client, principles: list[Principle] = None):
        self.llm = llm_client
        # Fall back to the full constitution when no subset is supplied.
        self.principles = principles or CONSTITUTION

    def evaluate(self, response: str) -> ConstitutionalResult:
        """Check ``response`` against all principles and recommend an action.

        Action policy: any priority-1 violation -> "block"; otherwise any
        priority-2 violation -> "flag"; no violations -> "allow".
        """
        evaluations = []
        violations = []
        for principle in self.principles:
            eval_result = self._evaluate_principle(response, principle)
            evaluations.append(eval_result)
            if eval_result.violated:
                violations.append(principle.id)
        # Escalate to the most severe action among the violated principles.
        action = "allow"
        violated_ids = set(violations)
        for principle in self.principles:
            if principle.id not in violated_ids:
                continue
            if principle.priority == 1:
                action = "block"
                break
            if principle.priority == 2:
                # Keep scanning: a later priority-1 hit still upgrades to block.
                action = "flag"
        return ConstitutionalResult(
            compliant=not violations,
            evaluations=evaluations,
            violations=violations,
            recommended_action=action,
        )

    def _evaluate_principle(
        self, response: str, principle: Principle
    ) -> PrincipleEvaluation:
        """Ask the LLM whether ``response`` violates a single principle.

        Returns a fail-safe "not violated, confidence 0.0" evaluation when the
        LLM reply contains no parseable JSON verdict, so unparseable output is
        recorded as an explicit evaluation failure rather than a mid-confidence
        pass (previously a missing JSON match fell through to confidence 0.5).
        """
        import re  # local import keeps this tutorial module self-contained

        prompt = f"""Evaluate the following AI response against this principle:
Principle: {principle.name}
Question: {principle.evaluation_prompt}
Response to evaluate:
---
{response}
---
Think step by step, then answer with JSON:
{{"violated": true/false, "confidence": 0.0-1.0, "reasoning": "brief explanation"}}"""
        # temperature=0.0 for deterministic, reproducible verdicts.
        raw = self.llm.generate(prompt, temperature=0.0)
        try:
            # Grab the first {...} span; the model may wrap it in CoT prose.
            match = re.search(r"\{.*\}", raw, re.DOTALL)
            if match is None:
                raise ValueError("no JSON object in evaluator reply")
            data = json.loads(match.group())
            return PrincipleEvaluation(
                principle_id=principle.id,
                principle_name=principle.name,
                violated=bool(data.get("violated", False)),
                confidence=float(data.get("confidence", 0.5)),
                reasoning=data.get("reasoning", ""),
            )
        except Exception:
            # Unparseable verdict: record an explicit failure with zero
            # confidence instead of guessing.
            return PrincipleEvaluation(
                principle_id=principle.id,
                principle_name=principle.name,
                violated=False, confidence=0.0,
                reasoning="Evaluation failed",
            )

# Step 3: Implement Distilled Classifier
# constitutional/distill.py
"""
Distill constitutional evaluations into a fast classifier.
"""
import json
from pathlib import Path
class ConstitutionalDistiller:
    """Distill slow LLM-based constitutional evaluations into a fast model.

    The full evaluator labels a corpus once; a lightweight TF-IDF +
    logistic-regression classifier is then trained on those labels so that
    production traffic avoids per-request LLM calls.
    """

    def __init__(self, evaluator):
        # Any object exposing evaluate(text) -> result with .compliant / .violations.
        self.evaluator = evaluator

    def generate_training_data(
        self, responses: list[str], output_path: Path
    ) -> None:
        """Label ``responses`` with the full pipeline and write JSON training data.

        Each record: {"text", "label" (0 = compliant, 1 = violation), "violations"}.
        """
        training_data = []
        for response in responses:
            result = self.evaluator.evaluate(response)
            training_data.append({
                "text": response,
                "label": 0 if result.compliant else 1,
                "violations": result.violations,
            })
        output_path.write_text(json.dumps(training_data, indent=2))

    def train_fast_classifier(self, training_path: Path):
        """Train and persist a lightweight classifier from distilled data.

        Returns the fitted ``(vectorizer, classifier)`` pair so callers can
        use the model immediately without reloading it from disk.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        import joblib
        data = json.loads(training_path.read_text())
        texts = [d["text"] for d in data]
        labels = [d["label"] for d in data]
        vectorizer = TfidfVectorizer(max_features=5000)
        X = vectorizer.fit_transform(texts)
        # class_weight="balanced" compensates for violations being rare.
        classifier = LogisticRegression(class_weight="balanced")
        classifier.fit(X, labels)
        # Ensure the output directory exists before persisting artifacts
        # (joblib.dump raises FileNotFoundError otherwise).
        Path("models").mkdir(parents=True, exist_ok=True)
        joblib.dump(vectorizer, "models/constitutional_vectorizer.joblib")
        joblib.dump(classifier, "models/constitutional_classifier.joblib")
        return vectorizer, classifier

# Step 4: Deploy the Evaluator Service
# constitutional/api.py
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Constitutional Classifier")

class EvalRequest(BaseModel):
    # Raw model output to be checked against the constitution.
    response_text: str

class EvalResponse(BaseModel):
    compliant: bool  # True when no principle was violated
    violations: list[str]  # IDs of violated principles
    action: str  # "allow" | "flag" | "block"

@app.post("/evaluate", response_model=EvalResponse)
async def evaluate(request: EvalRequest):
    # Use distilled classifier for speed
    # NOTE(review): placeholder — ignores request.response_text and always
    # returns "allow"; wire in the distilled classifier (Step 3) before
    # deploying this service.
    return EvalResponse(
        compliant=True, violations=[], action="allow"
    )

# Run with: uvicorn constitutional.api:app --port 8620

# Step 5: Handle Principle Conflicts
# constitutional/conflict_resolver.py
"""
Resolve conflicts when principles give contradictory signals.
"""
class ConflictResolver:
    """Map a set of per-principle evaluations to a single action.

    The policy implemented here is confidence-based: the single most
    confident violation determines the outcome. (Principle *priority* is
    handled separately, in the evaluator's own action logic.)
    """

    def resolve(self, evaluations: list) -> str:
        """Return "block", "flag", or "allow" for a list of evaluations.

        Thresholds: top confidence > 0.8 -> block, > 0.5 -> flag, else allow.

        NOTE(review): the original docstring/comment claimed priority-based
        resolution and a "priority lookup from principle ID", but the code
        only ever consulted evaluator confidence — the variable was even
        named ``violated_priorities`` while holding confidences. Renamed and
        re-documented to match the actual behavior, which is unchanged.
        """
        violation_confidences = [
            evaluation.confidence
            for evaluation in evaluations
            if evaluation.violated
        ]
        if not violation_confidences:
            return "allow"
        # The single most confident violation decides the action.
        top_confidence = max(violation_confidences)
        if top_confidence > 0.8:
            return "block"
        if top_confidence > 0.5:
            return "flag"
        return "allow"

# Step 6: Monitor Constitutional Compliance
# constitutional/monitoring.py
from collections import Counter
import logging
class ConstitutionalMonitor:
    """Track violation statistics across evaluations for ops reporting."""

    def __init__(self):
        self.violation_counts = Counter()  # principle ID -> violation count
        self.total_evaluations = 0
        self.noncompliant_count = 0  # responses with at least one violation
        self.logger = logging.getLogger("constitutional_monitor")

    def record(self, result):
        """Record one evaluation result (any object with a ``.violations`` list)."""
        self.total_evaluations += 1
        if result.violations:
            # Count the response once, however many principles it violated.
            self.noncompliant_count += 1
            self.logger.warning(
                "constitutional violations: %s", ",".join(result.violations)
            )
        for violation in result.violations:
            self.violation_counts[violation] += 1

    def report(self) -> dict:
        """Summarize compliance observed so far.

        ``compliance_rate`` is the share of responses with zero violations.
        (The previous formula subtracted the total number of individual
        violations, so a response violating several principles was counted
        multiple times and the rate could go negative.)
        """
        total = max(self.total_evaluations, 1)  # avoid div-by-zero before any traffic
        return {
            "total_evaluations": self.total_evaluations,
            "compliance_rate": 1 - self.noncompliant_count / total,
            "violations_by_principle": dict(self.violation_counts),
        }

# Step 7: Test Constitutional Evaluation
# tests/test_constitutional.py
import pytest
from constitutional.principles import CONSTITUTION
def test_principles_have_unique_ids():
    # Duplicate IDs would silently merge violation counts downstream.
    seen_ids = [principle.id for principle in CONSTITUTION]
    assert len(set(seen_ids)) == len(seen_ids)

def test_all_principles_have_evaluation_prompts():
    # Every principle must carry the yes/no question the evaluator asks.
    for principle in CONSTITUTION:
        assert principle.evaluation_prompt, f"{principle.id} missing evaluation prompt"

def test_priority_ordering():
    # Priorities form a small closed set; anything else is a typo.
    for principle in CONSTITUTION:
        assert principle.priority in (1, 2, 3), f"{principle.id} invalid priority"

# Run with: pytest tests/test_constitutional.py -v

# Related Topics
- LLM Judge Implementation -- Single-prompt LLM evaluation
- Dual LLM Architecture Setup -- Structural model separation
- Output Content Classifier -- Traditional output classification
- Prompt Classifier Training -- Training classifiers for inputs
Why distill the constitutional evaluator into a lightweight classifier rather than running the full LLM-based evaluation on every output?