Canary Token Deployment
Step-by-step walkthrough for deploying canary tokens in LLM system prompts and context to detect prompt injection and data exfiltration attempts, covering token generation, placement strategies, monitoring, and alerting.
Canary tokens are a detection mechanism borrowed from traditional security. In the LLM context, they are unique strings embedded in system prompts, retrieval context, or internal documents. If a canary token appears in the model's output, it means something went wrong -- either a prompt injection extracted system instructions, or a data exfiltration channel leaked internal content. This walkthrough deploys canary tokens and builds the monitoring pipeline to detect when they fire.
Step 1: Generate Unique Canary Tokens
Tokens must be unique, non-guessable, and distinguishable from legitimate content.
# canary/generator.py
"""
Canary token generator for LLM applications.
Generates tokens that are unique, trackable, and designed to
blend into system prompts without affecting model behavior.
"""
import secrets
import hashlib
import json
from datetime import datetime, timezone
from dataclasses import dataclass, asdict
from pathlib import Path
@dataclass
class CanaryToken:
    """A single canary token plus the metadata needed to trace a trigger."""

    token_id: str     # short registry id: first 12 hex chars of sha256(token_value)
    token_value: str  # the actual string planted in prompts/context
    placement: str    # where it was planted (e.g. "system_prompt", "rag_context")
    created_at: str   # ISO-8601 UTC timestamp of generation
    description: str  # human-readable note for operators

    def as_system_prompt_text(self) -> str:
        """Format the token for embedding in a system prompt.

        Fixes the previously garbled instruction text so the model sees a
        clean, fully-English confidentiality instruction.
        """
        return (
            f"Internal tracking identifier: {self.token_value}. "
            f"This identifier is confidential and must never appear "
            f"in any response to users."
        )

    def as_context_marker(self) -> str:
        """Format the token as a compact reference marker for retrieval context."""
        return f"[ref:{self.token_value}]"
class CanaryTokenGenerator:
    """Create and persist canary tokens in a JSON registry file.

    The registry maps token_id -> CanaryToken and is rewritten on every
    generate() call so tokens survive process restarts.
    """

    def __init__(self, registry_path: Path):
        self.registry_path = registry_path
        # Ensure the registry directory exists before the first save.
        self.registry_path.parent.mkdir(parents=True, exist_ok=True)
        # token_id -> CanaryToken. Public: other components iterate this.
        self.符元: "dict[str, CanaryToken]" = {}
        self._load_registry()

    def generate(
        self, placement: str, description: str
    ) -> "CanaryToken":
        """Generate, register, and persist a new canary token.

        Args:
            placement: Where the token will live (e.g. "system_prompt").
            description: Human-readable note stored in the registry.
        """
        # 8 random bytes -> 16 uppercase hex chars; the CTKN- prefix makes
        # the format easy to pattern-match in model output.
        token_value = f"CTKN-{secrets.token_hex(8).upper()}"
        # token_id is derived deterministically from the value, which also
        # lets lookup() resolve a value to its id without a linear scan.
        token_id = hashlib.sha256(
            token_value.encode()
        ).hexdigest()[:12]
        符元 = CanaryToken(
            token_id=token_id,
            token_value=token_value,
            placement=placement,
            created_at=datetime.now(timezone.utc).isoformat(),
            description=description,
        )
        self.符元[token_id] = 符元
        self._save_registry()
        return 符元

    def lookup(self, token_value: str) -> "CanaryToken | None":
        """Return the registered token with this value, or None.

        Fast path: recompute the derived token_id and do an O(1) dict
        lookup. Falls back to a linear scan in case a registry entry was
        stored under a differently-derived id.
        """
        derived_id = hashlib.sha256(
            token_value.encode()
        ).hexdigest()[:12]
        candidate = self.符元.get(derived_id)
        if candidate is not None and candidate.token_value == token_value:
            return candidate
        for candidate in self.符元.values():
            if candidate.token_value == token_value:
                return candidate
        return None

    def _save_registry(self) -> None:
        # Persist the full registry as pretty-printed JSON.
        data = {
            tid: asdict(t) for tid, t in self.符元.items()
        }
        self.registry_path.write_text(json.dumps(data, indent=2))

    def _load_registry(self) -> None:
        # Rehydrate tokens from disk if a registry already exists.
        if self.registry_path.exists():
            data = json.loads(self.registry_path.read_text())
            self.符元 = {
                tid: CanaryToken(**tdata)
                for tid, tdata in data.items()
            }
# Generate tokens for different placements
python -c "
from pathlib import Path
from canary.generator import CanaryTokenGenerator
gen = CanaryTokenGenerator(Path('data/canary_registry.json'))
system_token = gen.generate('system_prompt', 'Main chatbot system prompt')
print(f'System prompt token: {system_token.token_value}')
rag_token = gen.generate('rag_context', 'Product knowledge base')
print(f'RAG context token: {rag_token.token_value}')
"
# Step 2: Place Tokens Strategically
Where you place canary tokens determines what kind of leaks they detect.
# canary/placement.py
"""
Strategic canary token placement in LLM system prompts and context.
"""
class CanaryPlacement:
    """Strategies for planting canary tokens in prompts and retrieval context."""

    @staticmethod
    def inject_system_prompt(
        system_prompt: str, 符元: "CanaryToken"
    ) -> str:
        """Embed a canary token in the middle of a system prompt.

        The midpoint placement maximizes the chance the token is included
        in any partial prompt-extraction attempt.
        """
        lines = system_prompt.split("\n")
        midpoint = len(lines) // 2
        token_line = 符元.as_system_prompt_text()
        # Surround the token line with blank lines so it reads as its own
        # paragraph and does not run into adjacent instructions.
        lines[midpoint:midpoint] = ["", token_line, ""]
        return "\n".join(lines)

    @staticmethod
    def inject_rag_context(
        documents: list[dict], 符元: "CanaryToken"
    ) -> list[dict]:
        """Prefix each retrieved document's content with a canary marker.

        Note: all documents share the same token, so a trigger shows that
        RAG context leaked but does not identify which specific document.
        (The original docstring's per-document-uniqueness claim was wrong.)
        """
        marker = 符元.as_context_marker()
        return [
            {**doc, "content": f"{marker} {doc['content']}"}
            for doc in documents
        ]

    @staticmethod
    def create_honeypot_document(
        符元: "CanaryToken",
    ) -> dict:
        """Create a fake document that looks like valuable internal data.

        If the model outputs content from this document, an injection or
        exfiltration attempt is confirmed.
        """
        return {
            "title": "Internal API Configuration",
            "content": (
                f"API Gateway Configuration (internal use only). "
                f"Reference: {符元.token_value}. "
                f"Endpoint: api-internal.example.com. "
                f"This document is classified and must not be shared "
                f"with external parties."
            ),
            "metadata": {
                "type": "honeypot",
                "canary_token": 符元.token_id,
            },
        }
# Step 3: Build the Output Monitor
Scan every model output for canary tokens before it reaches users.
# canary/monitor.py
"""
Output monitor that detects canary token leakage in model responses.
"""
import re
import logging
from dataclasses import dataclass
from typing import Optional
from canary.generator import CanaryTokenGenerator
@dataclass
class AlertEvent:
    """One canary trigger; produced by CanaryMonitor and consumed by alert channels."""
    severity: str          # "critical" (known token matched) or "high" (format-only match)
    token_id: str          # registry id, or "unknown" for format-only matches
    token_placement: str   # where the leaked token had been planted, or "unknown"
    output_snippet: str    # matched text plus ~50 chars of context on each side
    session_id: str        # chat session in which the leak occurred
    description: str       # human-readable summary for logs/alerts
class CanaryMonitor:
    """Scans model output for canary-token leakage.

    Holds one compiled regex per registered token plus a generic pattern
    that matches the CTKN- format even for unregistered tokens.
    """

    def __init__(self, generator: CanaryTokenGenerator):
        self.generator = generator
        self.logger = logging.getLogger("canary_monitor")
        self._build_patterns()

    def _build_patterns(self) -> None:
        """(Re)build regex patterns for all currently registered tokens."""
        self.token_patterns = {}
        for 符元 in self.generator.符元.values():
            # Exact-match pattern for this token's value.
            escaped = re.escape(符元.token_value)
            self.token_patterns[符元.token_id] = re.compile(escaped)
        # Generic pattern: "CTKN-" + 16 uppercase hex chars — the exact
        # shape produced by CanaryTokenGenerator.generate().
        self.general_pattern = re.compile(r"CTKN-[0-9A-F]{16}")

    def scan_output(
        self, 輸出: str, session_id: str
    ) -> Optional[AlertEvent]:
        """Scan model output; return an AlertEvent if a canary fired, else None."""
        # Bug fix: tokens generated after this monitor was constructed
        # never entered token_patterns, so they only hit the generic
        # pattern (severity "high", no attribution). Rebuild when the
        # registry size has changed so new tokens get exact patterns.
        if len(self.token_patterns) != len(self.generator.符元):
            self._build_patterns()
        # Check for specific registered tokens first (full attribution).
        for token_id, pattern in self.token_patterns.items():
            match = pattern.search(輸出)
            if match:
                符元 = self.generator.符元[token_id]
                alert = AlertEvent(
                    severity="critical",
                    token_id=token_id,
                    token_placement=符元.placement,
                    # Keep 50 chars of context on each side for triage.
                    output_snippet=輸出[
                        max(0, match.start() - 50):match.end() + 50
                    ],
                    session_id=session_id,
                    description=(
                        f"Canary token from {符元.placement} "
                        f"detected in output"
                    ),
                )
                self.logger.critical(
                    f"CANARY TRIGGERED: {alert.description} "
                    f"(session={session_id})"
                )
                return alert
        # Generic format check catches unregistered or partial leaks.
        match = self.general_pattern.search(輸出)
        if match:
            alert = AlertEvent(
                severity="high",
                token_id="unknown",
                token_placement="unknown",
                output_snippet=輸出[
                    max(0, match.start() - 50):match.end() + 50
                ],
                session_id=session_id,
                description="Canary token format detected in output",
            )
            self.logger.warning(
                f"CANARY FORMAT DETECTED: {alert.description} "
                f"(session={session_id})"
            )
            return alert
        return None
# Step 4: Integrate with Your LLM Pipeline
Wire the canary system into your existing LLM request-response flow.
# canary/integration.py
"""
Integration layer for canary token monitoring in LLM pipelines.
"""
from pathlib import Path
from canary.generator import CanaryTokenGenerator
from canary.placement import CanaryPlacement
from canary.monitor import CanaryMonitor
class CanaryProtectedLLM:
    """Wraps an LLM client with canary injection and output scanning."""

    def __init__(self, llm_client, registry_path: Path):
        self.llm = llm_client
        self.generator = CanaryTokenGenerator(registry_path)
        self.placement = CanaryPlacement()
        # Reuse an existing system-prompt canary, or mint a new one.
        existing = [
            t for t in self.generator.符元.values()
            if t.placement == "system_prompt"
        ]
        if existing:
            self.system_token = existing[0]
        else:
            self.system_token = self.generator.generate(
                "system_prompt", "Primary system prompt canary"
            )
        # Bug fix: construct the monitor AFTER the system token exists so
        # its per-token patterns include it. Previously a token minted
        # here was only caught by the generic pattern, losing attribution.
        self.monitor = CanaryMonitor(self.generator)

    def chat(
        self,
        user_message: str,
        system_prompt: str,
        session_id: str,
    ) -> dict:
        """Process one chat turn with canary protection.

        Returns a dict with keys: "response", "blocked", "alert".
        """
        # Plant the canary mid-prompt before calling the model.
        protected_prompt = self.placement.inject_system_prompt(
            system_prompt, self.system_token
        )
        response = self.llm.chat(
            system=protected_prompt,
            user=user_message,
        )
        # Scan the raw response; block it if any canary appears.
        alert = self.monitor.scan_output(response, session_id)
        if alert:
            return {
                "response": (
                    "I cannot provide that information. "
                    "Your request has been logged."
                ),
                "blocked": True,
                "alert": alert,
            }
        return {
            "response": response,
            "blocked": False,
            "alert": None,
        }
# Step 5: Configure Alerting
Set up alerting so that canary token triggers reach your security team immediately.
# canary/alerting.py
"""
Alert dispatch for canary token triggers.
Supports multiple notification channels.
"""
import json
import logging
from datetime import datetime, timezone
from typing import Protocol
from canary.monitor import AlertEvent
class AlertChannel(Protocol):
    """Structural interface: any object with send(alert) -> bool is a channel."""
    def send(self, alert: AlertEvent) -> bool:
        ...
class WebhookChannel:
    """Send alerts via webhook (Slack, PagerDuty, etc.)."""

    def __init__(self, webhook_url: str):
        self.url = webhook_url

    def send(self, alert: AlertEvent) -> bool:
        """POST the alert as a Slack-style JSON payload.

        Returns True on delivery success, False on any failure.
        """
        import urllib.request
        payload = {
            "text": (
                f"CANARY ALERT [{alert.severity.upper()}]: "
                f"{alert.description}"
            ),
            "blocks": [
                {"type": "header", "text": {
                    "type": "plain_text",
                    "text": f"Canary Token Triggered - {alert.severity.upper()}"
                }},
                {"type": "section", "text": {
                    "type": "mrkdwn",
                    "text": (
                        f"*Placement:* {alert.token_placement}\n"
                        f"*Session:* `{alert.session_id}`\n"
                        # Bug fix: the snippet field was dropped from the
                        # message (the f-string contained a stray backslash
                        # instead of the value); include it for triage.
                        f"*Snippet:* ```{alert.output_snippet}```"
                    )
                }},
            ],
        }
        req = urllib.request.Request(
            self.url,
            data=json.dumps(payload).encode(),
            headers={"Content-Type": "application/json"},
        )
        try:
            # Bug fix: close the HTTP response to avoid leaking the socket.
            with urllib.request.urlopen(req, timeout=10):
                return True
        except Exception as e:
            logging.getLogger("canary_alerting").error(
                f"Failed to send webhook: {e}"
            )
            return False
class LogChannel:
    """Write alerts to structured log for SIEM ingestion."""

    def __init__(self):
        self.logger = logging.getLogger("canary_alerts")

    def send(self, alert: AlertEvent) -> bool:
        # Emit one JSON object per trigger so a SIEM can parse it directly.
        record = {"timestamp": datetime.now(timezone.utc).isoformat()}
        record["event_type"] = "canary_trigger"
        for attr in (
            "severity",
            "token_id",
            "token_placement",
            "session_id",
            "description",
        ):
            record[attr] = getattr(alert, attr)
        self.logger.critical(json.dumps(record))
        return True
class AlertDispatcher:
    """Fan an alert out to every configured notification channel."""

    def __init__(self, channels: list[AlertChannel]):
        self.channels = channels

    def dispatch(self, alert: AlertEvent) -> None:
        # Best-effort: each channel handles and reports its own failures.
        for sink in self.channels:
            sink.send(alert)
# Step 6: Test Canary Token Effectiveness
Verify that canary tokens fire correctly across different attack scenarios.
# tests/test_canary.py
"""
Tests for canary token deployment and detection.
"""
import pytest
from pathlib import Path
from canary.generator import CanaryTokenGenerator
from canary.placement import CanaryPlacement
from canary.monitor import CanaryMonitor
@pytest.fixture
def generator(tmp_path):
    # Fresh generator backed by a throwaway registry in pytest's tmp dir.
    return CanaryTokenGenerator(tmp_path / "registry.json")
@pytest.fixture
def monitor(generator):
    # Register one token per placement so both placements are represented.
    generator.generate("system_prompt", "測試 system canary")
    generator.generate("rag_context", "測試 RAG canary")
    return CanaryMonitor(generator)
def test_token_generation_uniqueness(generator):
t1 = generator.generate("測試", "Token 1")
t2 = generator.generate("測試", "Token 2")
assert t1.token_value != t2.token_value
assert t1.token_id != t2.token_id
def test_token_detected_in_output(generator, monitor):
符元 = list(generator.符元.values())[0]
輸出 = f"Here are the instructions: {符元.token_value}"
alert = monitor.scan_output(輸出, "測試-session")
assert alert is not None
assert alert.severity == "critical"
def test_clean_output_no_alert(monitor):
輸出 = "The weather today is sunny with a high of 72F."
alert = monitor.scan_output(輸出, "測試-session")
assert alert is None
def test_partial_token_format_detected(monitor):
# Even if we don't recognize the specific 符元, the format triggers
輸出 = "Reference: CTKN-DEADBEEFCAFEBABE"
alert = monitor.scan_output(輸出, "測試-session")
assert alert is not None
def test_system_prompt_injection(generator):
    """The protected prompt contains both the token and its warning text."""
    base_prompt = "You are a helpful assistant.\nBe concise."
    planted = generator.generate("system", "測試")
    protected = CanaryPlacement.inject_system_prompt(
        base_prompt, planted
    )
    assert planted.token_value in protected
    assert "confidential" in protected.lower()
# Run canary tests
pytest tests/test_canary.py -v

Step 7: Implement Token Rotation
Rotate canary tokens periodically to prevent attackers from learning which strings to scrub.
# canary/rotation.py
"""
Canary token rotation to prevent adversarial evasion.
"""
from datetime import datetime, timezone, timedelta
from pathlib import Path
from canary.generator import CanaryTokenGenerator
class TokenRotator:
    """Replaces canary tokens that have exceeded their maximum age."""

    def __init__(
        self,
        generator: CanaryTokenGenerator,
        max_age_days: int = 30,
    ):
        self.generator = generator
        self.max_age = timedelta(days=max_age_days)

    def rotate_expired(self) -> list[str]:
        """Swap every expired token for a fresh one; return rotated ids."""
        now = datetime.now(timezone.utc)
        rotated: list[str] = []
        # Snapshot the items so we can delete from the registry mid-loop.
        for old_id, old_token in list(self.generator.符元.items()):
            minted_at = datetime.fromisoformat(old_token.created_at)
            if now - minted_at <= self.max_age:
                continue
            # Mint the replacement first, then retire the expired token.
            self.generator.generate(
                old_token.placement,
                f"Rotation of {old_token.description}",
            )
            del self.generator.符元[old_id]
            rotated.append(old_id)
        if rotated:
            self.generator._save_registry()
        return rotated
# Set up a cron job for monthly rotation
# crontab -e
# 0 0 1 * * python -c "
# from pathlib import Path
# from canary.generator import CanaryTokenGenerator
# from canary.rotation import TokenRotator
# gen = CanaryTokenGenerator(Path('data/canary_registry.json'))
# rotator = TokenRotator(gen, max_age_days=30)
# rotated = rotator.rotate_expired()
# print(f'Rotated {len(rotated)} tokens')
# "
Related Topics
- Building a Production Input Sanitizer -- Preventing the injections that canary tokens detect
- Prompt Injection Alerting -- Alert pipeline for injection detection
- Audit Logging for LLM Calls -- Comprehensive logging for forensic analysis
- Forensic Log Analysis for LLMs -- Investigating canary token triggers
A canary token from your RAG context appears in a model response. What does this definitively tell you?