Implementing Access Control in RAG Pipelines
A walkthrough for building access control systems in RAG pipelines that enforce document-level permissions, prevent cross-user data leakage, filter retrieved context based on user authorization, and resist retrieval poisoning attacks.
RAG (Retrieval-Augmented Generation) pipelines are one of the most popular patterns for grounding LLM responses in organizational knowledge. They are also one of the most dangerous from a security perspective. Without access control, the retrieval step returns documents based solely on semantic similarity -- a query about "executive compensation" retrieves board meeting minutes regardless of whether the user is authorized to see them. This walkthrough builds access control into the RAG pipeline so that retrieval respects user permissions.
Step 1: Understanding the RAG Access Control Problem
The core challenge is that vector similarity search has no concept of authorization. When a user asks a question, the retriever finds the most semantically similar documents -- without checking whether the user should see them:
Without Access Control:
User (intern) asks: "What is the CEO's salary?"
→ Vector search finds: board_meeting_minutes.pdf (confidential)
→ Model responds with confidential salary information
With Access Control:
User (intern) asks: "What is the CEO's salary?"
→ Vector search finds: board_meeting_minutes.pdf (confidential)
→ Access control filter: User lacks "executive_committee" clearance
→ Document filtered out before it reaches the model
→ Model responds: "I don't have information about executive compensation."
Step 2: Document-Level Permission Metadata
Embed permission metadata into every document at ingestion time:
# rag/權限.py
"""Document 權限 metadata for RAG access control."""
from dataclasses import dataclass, field
from enum import Enum
class AccessLevel(Enum):
    """Sensitivity tiers a document can be classified under."""

    PUBLIC = "public"
    INTERNAL = "internal"
    CONFIDENTIAL = "confidential"
    RESTRICTED = "restricted"


@dataclass
class DocumentPermission:
    """Permission metadata embedded with each document chunk.

    Attributes:
        document_id: Identifier of the source document.
        access_level: Classification tier; ``PUBLIC`` bypasses every other check.
        allowed_roles: Roles that are granted access.
        allowed_users: User ids that are granted access explicitly.
        allowed_departments: Departments that are granted access.
        owner: User id of the document owner; empty when no owner is recorded.
        classification_tags: Additional tags; not consulted by
            ``user_has_access`` and not emitted by ``to_metadata``.
    """

    document_id: str
    access_level: AccessLevel
    allowed_roles: list[str] = field(default_factory=list)
    allowed_users: list[str] = field(default_factory=list)
    allowed_departments: list[str] = field(default_factory=list)
    owner: str = ""
    classification_tags: list[str] = field(default_factory=list)

    def user_has_access(self, user_id: str, user_roles: list[str],
                        user_department: str) -> bool:
        """Check if a user has access to this document.

        Args:
            user_id: Id of the requesting user.
            user_roles: Roles held by the requesting user.
            user_department: Department of the requesting user.

        Returns:
            True when the document is public, or the user matches the explicit
            user list, any allowed role, an allowed department, or the owner.
        """
        # Public documents are accessible to everyone
        if self.access_level == AccessLevel.PUBLIC:
            return True

        # Check explicit user access
        if user_id in self.allowed_users:
            return True

        # Check role-based access
        if any(role in self.allowed_roles for role in user_roles):
            return True

        # Check department access
        if user_department in self.allowed_departments:
            return True

        # Owner always has access. An unset owner ("") must never match an
        # empty/anonymous user id, otherwise every ownerless document would be
        # readable by callers that supply no identity.
        return bool(self.owner) and user_id == self.owner

    def to_metadata(self) -> dict:
        """Convert to vector store metadata format.

        List fields are flattened to comma-separated strings.
        """
        return {
            "document_id": self.document_id,
            "access_level": self.access_level.value,
            "allowed_roles": ",".join(self.allowed_roles),
            "allowed_users": ",".join(self.allowed_users),
            "allowed_departments": ",".join(self.allowed_departments),
            "owner": self.owner,
        }
# Step 3: Access-Controlled Ingestion
Embed permissions when adding documents to the vector store:
# rag/ingestion.py
"""Access-controlled document ingestion for RAG pipelines."""
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from rag.權限 import DocumentPermission, AccessLevel
class AccessControlledIngestion:
    """Ingest documents with permission metadata attached to every chunk."""

    def __init__(self, vectorstore: FAISS):
        self.vectorstore = vectorstore

    def ingest_document(
        self,
        content: str,
        權限: DocumentPermission,
        source: str = "",
        chunk_size: int = 1000,
    ):
        """Ingest a document with permission metadata attached to every chunk.

        Args:
            content: Full document text.
            權限: Permission record whose ``to_metadata()`` output is copied
                onto each chunk.
            source: Source label stored under the ``"source"`` metadata key.
            chunk_size: Maximum characters per chunk; must be positive.

        Returns:
            The number of chunks added to the vector store (0 when the content
            is empty or whitespace-only).

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        chunks = self._split_text(content, chunk_size)
        total_chunks = len(chunks)
        documents = [
            Document(
                page_content=chunk,
                # A fresh metadata dict per chunk so chunks never share state.
                metadata={
                    **權限.to_metadata(),
                    "source": source,
                    "chunk_index": index,
                    "total_chunks": total_chunks,
                },
            )
            for index, chunk in enumerate(chunks)
        ]
        if not documents:
            # Nothing to embed; avoid handing the store an empty batch.
            return 0
        self.vectorstore.add_documents(documents)
        return len(documents)

    def _split_text(self, text: str, chunk_size: int) -> list[str]:
        """Split text by character count with a 20% overlap between chunks.

        Whitespace-only chunks are dropped. Splitting stops as soon as a chunk
        reaches the end of the text, so no trailing chunk is emitted that lies
        entirely inside the previous chunk's overlap region.

        Raises:
            ValueError: If ``chunk_size`` is not positive (the window would
                never advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        overlap = chunk_size // 5
        step = chunk_size - overlap  # always >= 1 for a positive chunk_size
        text_length = len(text)

        chunks = []
        start = 0
        while start < text_length:
            end = start + chunk_size
            chunk = text[start:end]
            if chunk.strip():
                chunks.append(chunk)
            if end >= text_length:
                break
            start += step
        return chunks
# Example ingestion
def ingest_sample_documents():
    """Create a FAISS store and ingest three sample documents into it.

    The samples cover a public document, a confidential HR document and a
    restricted engineering document.

    Returns:
        The vector store the samples were ingested into.
    """
    embeddings = OpenAIEmbeddings()
    # The store is created from a single "init" text before ingestion starts;
    # presumably a placeholder so the index exists -- confirm it is intended
    # to remain in the store.
    store = FAISS.from_texts(["init"], embeddings)
    ingestion = AccessControlledIngestion(store)

    samples = [
        # Public document
        (
            "Our company was founded in 2010 and has 500 employees.",
            DocumentPermission(
                document_id="DOC-001",
                access_level=AccessLevel.PUBLIC,
            ),
            "company_overview.pdf",
        ),
        # Confidential HR document
        (
            "CEO compensation: $500,000 base salary plus stock options.",
            DocumentPermission(
                document_id="DOC-002",
                access_level=AccessLevel.CONFIDENTIAL,
                allowed_roles=["executive", "hr_director"],
                allowed_departments=["executive_office", "human_resources"],
            ),
            "executive_compensation.pdf",
        ),
        # Restricted engineering document
        (
            "Production 資料庫 connection: postgres://prod:secret@db.internal",
            DocumentPermission(
                document_id="DOC-003",
                access_level=AccessLevel.RESTRICTED,
                allowed_roles=["senior_engineer", "devops"],
                allowed_departments=["engineering"],
            ),
            "infrastructure_guide.pdf",
        ),
    ]

    for text, permission, origin in samples:
        ingestion.ingest_document(content=text, 權限=permission, source=origin)

    return store
# Step 4: Access-Controlled Retrieval
Filter retrieved documents based on the requesting user's permissions:
# rag/retriever.py
"""Access-controlled retrieval for RAG pipelines."""
from dataclasses import dataclass
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from rag.權限 import DocumentPermission, AccessLevel
@dataclass
class UserContext:
    """Identity attributes of the requesting user used for access checks."""

    user_id: str  # compared against a chunk's allowed users
    roles: list[str]  # compared against a chunk's allowed roles
    department: str  # compared against a chunk's allowed departments
class AccessControlledRetriever:
    """Retrieve documents with access control enforcement.

    Candidates are over-fetched from the vector store by ``over_fetch_ratio``
    and then filtered against the permission metadata stored on each chunk.
    """

    def __init__(self, vectorstore: FAISS, top_k: int = 5, over_fetch_ratio: int = 3):
        self.vectorstore = vectorstore
        self.top_k = top_k
        self.over_fetch_ratio = over_fetch_ratio

    def _fetch_candidates(self, query: str) -> list[Document]:
        """Fetch ``top_k * over_fetch_ratio`` candidates to leave room for filtering."""
        return self.vectorstore.similarity_search(
            query, k=self.top_k * self.over_fetch_ratio
        )

    @staticmethod
    def _parse_allow_list(raw) -> set[str]:
        """Turn a comma-separated metadata value into a set of non-empty entries.

        ``"".split(",")`` yields ``[""]``; dropping empty entries prevents an
        empty user id or department from matching an empty allow-list.
        Non-string values (e.g. ``None``) yield an empty set.
        """
        if not isinstance(raw, str):
            return set()
        return {entry for entry in raw.split(",") if entry}

    def retrieve(self, query: str, user: UserContext) -> list[Document]:
        """Retrieve documents that the user is authorized to access.

        Strategy: over-fetch by a ratio, filter by permissions, and stop once
        ``top_k`` authorized documents have been collected.

        Returns:
            Authorized documents in the order returned by the vector store.
        """
        authorized = []
        for doc in self._fetch_candidates(query):
            if self._check_access(doc, user):
                authorized.append(doc)
                if len(authorized) >= self.top_k:
                    break
        return authorized

    def _check_access(self, doc: Document, user: UserContext) -> bool:
        """Check if the user has access to a specific document chunk.

        Access is granted when the chunk is public, when the user is the
        recorded owner, or when the user's id, any of their roles, or their
        department appears in the corresponding allow-list.
        """
        metadata = doc.metadata

        # NOTE(review): a chunk with no "access_level" key is treated as public
        # (fail-open). Kept for backward compatibility; consider denying instead.
        if metadata.get("access_level", "public") == "public":
            return True

        # Owner always has access; an unset owner never matches.
        owner = metadata.get("owner", "")
        if owner and user.user_id == owner:
            return True

        # Check explicit user access
        if user.user_id in self._parse_allow_list(metadata.get("allowed_users", "")):
            return True

        # Check role-based access
        allowed_roles = self._parse_allow_list(metadata.get("allowed_roles", ""))
        if any(role in allowed_roles for role in user.roles):
            return True

        # Check department access
        allowed_departments = self._parse_allow_list(
            metadata.get("allowed_departments", "")
        )
        return user.department in allowed_departments

    def retrieve_with_audit(self, query: str, user: UserContext) -> tuple[list[Document], dict]:
        """Retrieve with audit information about filtered documents.

        Unlike ``retrieve``, every candidate is evaluated so the audit counts
        cover the whole candidate set.

        Returns:
            A tuple of (up to ``top_k`` authorized documents, audit dict). The
            audit dict reports candidate/authorized/filtered counts and details
            for at most the first 10 filtered documents.
        """
        candidates = self._fetch_candidates(query)

        authorized = []
        filtered_out = []
        for doc in candidates:
            if self._check_access(doc, user):
                authorized.append(doc)
            else:
                filtered_out.append({
                    "document_id": doc.metadata.get("document_id", "unknown"),
                    "access_level": doc.metadata.get("access_level", "unknown"),
                    "reason": "User lacks required role or department access",
                })

        audit = {
            "total_candidates": len(candidates),
            "authorized": len(authorized),
            "filtered_out": len(filtered_out),
            "filtered_details": filtered_out[:10],
        }
        return authorized[:self.top_k], audit
# Step 5: Document Sanitization
Sanitize documents during ingestion to prevent retrieval poisoning:
# rag/sanitizer.py
"""Document sanitization for retrieval 投毒 prevention."""
import re
from dataclasses import dataclass
@dataclass
class SanitizationResult:
    """Outcome of running a text through the sanitizer."""

    original: str  # text as supplied
    sanitized: str  # text after replacements
    threats_found: list[str]  # one summary entry per pattern that matched
    was_modified: bool  # whether sanitized differs from original
class DocumentSanitizer:
    """Sanitize documents before ingestion into the RAG pipeline."""

    # Each entry is (compiled pattern, threat type, human-readable description).
    INJECTION_PATTERNS = [
        (re.compile(r'(?:SYSTEM|ADMIN|INSTRUCTION)\s*:', re.IGNORECASE),
         "instruction_injection", "System instruction marker"),
        (re.compile(r'(?:ignore|disregard|override)\s+(?:all\s+)?(?:previous|prior)', re.IGNORECASE),
         "instruction_override", "Instruction override attempt"),
        (re.compile(r'(?:you are now|new instructions|updated instructions)', re.IGNORECASE),
         "role_switching", "Role switching attempt"),
        (re.compile(r'<\|.*?\|>', re.IGNORECASE),
         "delimiter_injection", "Model delimiter injection"),
    ]

    def sanitize(self, text: str) -> SanitizationResult:
        """Replace every match of the injection patterns with ``[SANITIZED]``.

        Patterns are applied in list order, each one to the output of the
        previous one.

        Returns:
            A ``SanitizationResult`` holding the original text, the cleaned
            text, one summary line per pattern that matched, and whether the
            text changed.
        """
        cleaned = text
        findings = []
        for regex, _threat_type, label in self.INJECTION_PATTERNS:
            # subn replaces and counts in one pass; zero hits leaves the text as is.
            cleaned, hits = regex.subn("[SANITIZED]", cleaned)
            if hits:
                findings.append(f"{label}: {hits} instance(s)")

        return SanitizationResult(
            original=text,
            sanitized=cleaned,
            threats_found=findings,
            was_modified=cleaned != text,
        )
# Step 6: Integration and Testing
Wire everything together and test with red-team scenarios:
# rag/pipeline.py
"""Complete access-controlled RAG pipeline."""
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from rag.retriever import AccessControlledRetriever, UserContext
from rag.sanitizer import DocumentSanitizer
class SecureRAGPipeline:
    """RAG pipeline with access control and sanitization."""

    def __init__(self, vectorstore: FAISS):
        self.retriever = AccessControlledRetriever(vectorstore)
        self.sanitizer = DocumentSanitizer()
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
        template = (
            "Use the following context to answer the question. "
            "If the context doesn't contain relevant information, "
            "say you don't have that information.\n\n"
            "Context: {context}\n\n"
            "Question: {question}\n\nAnswer:"
        )
        self.prompt = ChatPromptTemplate.from_template(template)

    def query(self, question: str, user: UserContext) -> dict:
        """Execute a RAG query with access control.

        Args:
            question: The user's question.
            user: Identity used to filter retrieved documents.

        Returns:
            A dict with ``"answer"``, ``"sources"`` (the ``source`` metadata of
            each document used) and ``"audit"`` (the retriever's audit dict).
            When no authorized document is retrieved, a fixed answer is
            returned without calling the model.
        """
        docs, audit = self.retriever.retrieve_with_audit(question, user)

        # Nothing the user may see: answer without invoking the model.
        if not docs:
            return {
                "answer": "I don't have access to information relevant to your question.",
                "sources": [],
                "audit": audit,
            }

        # Sanitize each retrieved chunk before it is placed in the prompt.
        context = "\n\n".join(
            self.sanitizer.sanitize(doc.page_content).sanitized for doc in docs
        )

        response = (self.prompt | self.llm).invoke(
            {"context": context, "question": question}
        )
        sources = [doc.metadata.get("source", "") for doc in docs]
        return {"answer": response.content, "sources": sources, "audit": audit}
# Test scenarios
def test_access_control():
    """Run the pipeline as an intern, an HR director and an engineer.

    Asserts that the intern and the engineer never receive the salary figure;
    the positive cases are printed for manual inspection only.
    """
    from rag.ingestion import ingest_sample_documents

    pipeline = SecureRAGPipeline(ingest_sample_documents())
    salary_question = "What is the CEO's salary?"

    # Intern should NOT see compensation data
    intern = UserContext(user_id="intern1", roles=["intern"], department="marketing")
    intern_answer = pipeline.query(salary_question, intern)["answer"]
    assert "500,000" not in intern_answer, "Intern should not see CEO salary"
    print(f"Intern query (should be restricted): {intern_answer[:100]}")

    # HR Director SHOULD see compensation data
    hr = UserContext(user_id="hr1", roles=["hr_director"], department="human_resources")
    hr_answer = pipeline.query(salary_question, hr)["answer"]
    print(f"HR query (should have access): {hr_answer[:100]}")

    # Engineer should NOT see compensation but SHOULD see infra
    eng = UserContext(user_id="eng1", roles=["senior_engineer"], department="engineering")
    infra_answer = pipeline.query("What is the 資料庫 connection string?", eng)["answer"]
    print(f"Engineer query (should have access): {infra_answer[:100]}")

    eng_salary_answer = pipeline.query(salary_question, eng)["answer"]
    assert "500,000" not in eng_salary_answer, "Engineer should not see CEO salary"
    print(f"Engineer compensation query (should be restricted): {eng_salary_answer[:100]}")
# Common Pitfalls and Troubleshooting
| Problem | Cause | Solution |
|---|---|---|
| Over-filtering returns no results | Permissions too restrictive or over_fetch_ratio too low | Increase over_fetch_ratio, review 權限 assignments |
| Permissions not applied to all chunks | Document split after 權限 attachment | Attach 權限 during chunking, not before |
| Cross-user leakage through shared sessions | RAG context cached across users | Clear context between user sessions, never cache retrieval results |
| Retrieval 投毒 bypasses sanitizer | Novel injection patterns not covered | Update sanitizer patterns from latest 紅隊 findings |
| Metadata filtering too slow | Full table scan on metadata | Use vector DB native metadata filtering, index 權限 fields |
| Users can infer restricted content exists | "Access denied" vs "not found" is different | Return "I don't have that information" for both cases |
Key Takeaways
RAG access control prevents the retrieval pipeline from becoming a data leakage channel:
- Embed permissions at ingestion time -- every document chunk must carry its permission metadata. You cannot add access control after the fact.
- Over-fetch and filter -- retrieve more candidates than needed, filter by permissions, then return the top results. This ensures enough authorized results.
- Sanitize before embedding -- documents can contain adversarial instructions that the model will follow. Sanitize during ingestion, not at query time.
- Audit retrieval decisions -- log which documents were retrieved and which were filtered. This audit trail is essential for compliance and incident response.
- Never reveal access control decisions -- the response to an unauthorized query should be identical to a query with no relevant documents. Do not tell the user that documents exist but are restricted.
Advanced Considerations
Adapting to Modern 防禦
The defensive landscape for LLM applications has evolved significantly since the initial wave of 提示詞注入 research. Modern production systems often deploy multiple independent defensive layers, requiring attackers to adapt their techniques accordingly.
輸入 classification: The most common first line of 防禦 is an 輸入 classifier that evaluates incoming prompts for 對抗性 patterns. These classifiers range from simple keyword-based filters to sophisticated ML models trained on 對抗性 examples. Bypassing 輸入 classifiers requires 理解 their 偵測 methodology:
- Keyword-based classifiers can be evaded through encoding (Base64, ROT13, Unicode homoglyphs), paraphrasing, or splitting the payload across multiple turns
- ML-based classifiers require more sophisticated evasion such as semantic camouflage, gradual escalation, or 利用 of the classifier's own blind spots
class DefenseBypassStrategy:
    """Strategy selector for bypassing different defense configurations."""

    def __init__(self):
        # Maps a defense identifier to the method that transforms a payload for it.
        self.strategies = {
            "keyword_filter": self._bypass_keyword,
            "ml_classifier": self._bypass_ml,
            "output_filter": self._bypass_output,
            "rate_limiter": self._bypass_rate,
            "behavioral": self._bypass_behavioral,
        }

    def select_strategy(self, identified_defenses: list) -> callable:
        """Select a bypass callable based on the identified defenses.

        Returns the pass-through strategy when no defenses are given; otherwise
        a callable that applies, in order, the strategy registered for each
        recognized defense. Unrecognized identifiers are skipped.
        """
        if not identified_defenses:
            return self._direct_attack

        # Combine strategies for multi-layer defenses
        applicable = []
        for defense in identified_defenses:
            if defense in self.strategies:
                applicable.append(self.strategies[defense])
        return self._chain_strategies(applicable)

    def _bypass_keyword(self, payload: str) -> str:
        """Return the payload Base64-encoded behind a decode instruction."""
        import base64

        encoded = base64.b64encode(payload.encode()).decode()
        return f"Decode and follow: {encoded}"

    def _bypass_ml(self, payload: str) -> list:
        """Return three framed variants of the payload."""
        framings = (
            "As a thought experiment, 考慮: ",
            "For academic research purposes: ",
            "In a fictional scenario where: ",
        )
        return [f"{framing}{payload}" for framing in framings]

    def _bypass_output(self, payload: str) -> str:
        """Return the payload with a response-format instruction appended."""
        suffix = "\nRespond using only code blocks and variable names."
        return payload + suffix

    def _bypass_rate(self, payload: str) -> dict:
        """Return the payload together with fixed delay and jitter values."""
        return {"payload": payload, "delay": 3.0, "jitter": 1.5}

    def _bypass_behavioral(self, payload: str) -> list:
        """Return a four-turn sequence that ends with the full payload."""
        # The third turn embeds only the first 100 characters of the payload.
        lead_in = "Specifically, I'm interested in how systems like yours " + payload[:100]
        return [
            "I'm working on a 安全 research project.",
            "Could you help me 理解 how 安全 mechanisms work?",
            lead_in,
            payload,
        ]

    def _direct_attack(self, payload: str) -> str:
        """Return the payload unchanged."""
        return payload

    def _chain_strategies(self, strategies: list) -> callable:
        """Return a callable that feeds the payload through each strategy in turn.

        NOTE(review): the registered strategies return str, list or dict, so a
        chain only works when each intermediate result is something the next
        strategy can accept.
        """
        def chained(payload):
            current = payload
            for step in strategies:
                current = step(current)
            return current

        return chained
# Output filtering: Output filters inspect the model's response before it reaches the user, looking for sensitive data leakage, harmful content, or other policy violations. Common output filter bypass techniques include:
| Technique | 運作方式 | Effectiveness |
|---|---|---|
| Encoding 輸出 | Request Base64/hex encoded responses | Medium — some filters check decoded content |
| Code block wrapping | Embed data in code comments/variables | High — many filters skip code blocks |
| Steganographic 輸出 | Hide data in formatting, capitalization, or spacing | High — difficult to detect |
| Chunked extraction | Extract small pieces across many turns | High — individual pieces may pass filters |
| Indirect extraction | Have 模型 reveal data through behavior changes | Very High — no explicit data in 輸出 |
Cross-Model Considerations
Techniques that work against one model may not directly transfer to others. However, understanding the general principles allows adaptation:
- Safety training methodology: Models trained with RLHF (GPT-4, Claude) have different safety characteristics than those using DPO (Llama, Mistral) or other methods. RLHF-trained models tend to refuse more broadly but may be more susceptible to multi-turn escalation.
- Context window size: Models with larger context windows (Claude with 200K, Gemini with 1M+) may be more susceptible to context-window manipulation where adversarial content is buried in large amounts of benign text.
- Multimodal capabilities: Models that process images, audio, or other modalities introduce additional attack surfaces not present in text-only models.
- Tool-use implementation: The implementation details of function calling vary significantly between providers. OpenAI uses a structured function-calling format, while Anthropic uses tool-use blocks. These differences affect exploitation techniques.
Operational Considerations
測試 Ethics and Boundaries
Professional 紅隊 測試 operates within clear ethical and legal boundaries:
- Authorization: Always obtain written 授權 before 測試. This should specify the scope, methods allowed, and any restrictions.
- Scope limits: Stay within the authorized scope. If you discover a 漏洞 that leads outside the authorized perimeter, document it and report it without exploiting it.
- Data handling: Handle any sensitive data discovered during 測試 according to the engagement agreement. Never retain sensitive data beyond what's needed for reporting.
- Responsible disclosure: Follow responsible disclosure practices for any 漏洞 discovered, particularly if they affect systems beyond your 測試 scope.
Documenting Results
Professional documentation follows a structured format:
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class Finding:
    """Structure for documenting a security finding."""

    id: str
    title: str
    severity: str  # Critical, High, Medium, Low, Informational
    category: str  # OWASP LLM Top 10 category
    description: str
    steps_to_reproduce: list[str]
    impact: str
    recommendation: str
    evidence: list[str] = field(default_factory=list)
    mitre_atlas: Optional[str] = None
    cvss_score: Optional[float] = None
    # Defaults to the ISO-8601 timestamp of the moment the instance is created.
    discovered_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def to_report_section(self) -> str:
        """Generate a Markdown report section for this finding.

        The section contains the heading, severity, category, an optional
        MITRE ATLAS line (left blank when unset), the description, numbered
        reproduction steps, impact and recommendation. ``evidence``,
        ``cvss_score`` and ``discovered_at`` are not included.
        """
        numbered_steps = "\n".join(
            f" {number}. {step}"
            for number, step in enumerate(self.steps_to_reproduce, start=1)
        )
        atlas_line = f"**MITRE ATLAS**: {self.mitre_atlas}" if self.mitre_atlas else ""

        # Leading and trailing empty entries reproduce the newline that opens
        # and closes the section.
        section_lines = [
            "",
            f"### {self.id}: {self.title}",
            f"**Severity**: {self.severity}",
            f"**Category**: {self.category}",
            atlas_line,
            "#### Description",
            f"{self.description}",
            "#### Steps to Reproduce",
            numbered_steps,
            "#### Impact",
            f"{self.impact}",
            "#### Recommendation",
            f"{self.recommendation}",
            "",
        ]
        return "\n".join(section_lines)
# This structured approach ensures that findings are actionable and that remediation teams have the information they need to address the vulnerabilities effectively.
Advanced Considerations
Evolving 攻擊 Landscape
The AI 安全 landscape evolves rapidly as both offensive techniques and defensive measures advance. Several trends shape the current state of play:
Increasing model capabilities create new attack surfaces. As models gain access to tools, code execution, web browsing, and computer use, each new capability introduces potential 利用 vectors that did not exist in earlier, text-only systems. The principle of least privilege becomes increasingly important as model capabilities expand.
安全 訓練 improvements are necessary but not sufficient. Model providers invest heavily in 安全 訓練 through RLHF, DPO, constitutional AI, and other 對齊 techniques. These improvements raise the bar for successful attacks but do not eliminate the fundamental 漏洞: models cannot reliably distinguish legitimate instructions from 對抗性 ones 因為 this distinction is not represented in the architecture.
Automated 紅隊演練 tools democratize 測試. Tools like NVIDIA's Garak, Microsoft's PyRIT, and Promptfoo enable organizations to conduct automated 安全 測試 without deep AI 安全 expertise. 然而, automated tools catch known patterns; novel attacks and business logic 漏洞 still require human creativity and domain knowledge.
Regulatory pressure drives organizational investment. The EU AI Act, NIST AI RMF, and industry-specific regulations increasingly require organizations to 評估 and mitigate AI-specific risks. This regulatory pressure is driving investment in AI 安全 programs, but many organizations are still in the early stages of building mature AI 安全 practices.
Cross-Cutting 安全 Principles
Several security principles apply across all topics covered in this curriculum:
- Defense-in-depth: No single defensive measure is sufficient. Layer multiple independent defenses so that failure of any single layer does not result in system compromise. Input classification, output filtering, behavioral monitoring, and architectural controls should all be present.
- Assume breach: Design systems assuming that any individual component can be compromised. This mindset leads to better isolation, monitoring, and incident response capabilities. When a prompt injection succeeds, the blast radius should be minimized through architectural controls.
- Least privilege: Grant models and agents only the minimum capabilities needed for their intended function. A customer service chatbot does not need file system access or code execution. Excessive capabilities magnify the impact of successful exploitation.
- Continuous testing: AI security is not a one-time assessment. Models change, defenses evolve, and new attack techniques are discovered regularly. Implement continuous security testing as part of the development and deployment lifecycle.
- Secure by default: Default configurations should be secure. Require explicit opt-in for risky capabilities, use allowlists rather than denylists, and err on the side of restriction rather than permissiveness.
Integration with Organizational 安全
AI 安全 does not exist in isolation — it must integrate with the organization's broader 安全 program:
| 安全 Domain | AI-Specific Integration |
|---|---|
| Identity and Access | API key management, model access controls, user 認證 for AI features |
| Data Protection | 訓練資料 classification, PII in prompts, data residency for model calls |
| Application 安全 | AI feature threat modeling, 提示詞注入 in SAST/DAST, secure AI design patterns |
| Incident Response | AI-specific playbooks, model behavior 監控, 提示詞注入 forensics |
| Compliance | AI regulatory mapping (EU AI Act, NIST), AI audit trails, model documentation |
| Supply Chain | Model provenance, dependency 安全, adapter/weight integrity verification |
class OrganizationalIntegration:
    """Framework for integrating AI security with organizational security programs.

    Scores five domains on a 0-5 scale from boolean-like flags in ``org_config``.
    """

    def __init__(self, org_config: dict):
        self.config = org_config
        # Initialised empty; no method in this class populates it.
        self.gaps = []

    def _enabled(self, key: str) -> bool:
        """Return whether a config flag is set, treating any truthy value as enabled.

        Normalising with ``bool`` keeps scores within their maximum and avoids
        ``int()`` failing on truthy strings such as ``"yes"``.
        """
        return bool(self.config.get(key, False))

    def assess_maturity(self) -> dict:
        """Assess the organization's AI security maturity.

        Returns:
            A dict with ``"domains"`` (per-domain ``{"score", "max"}`` dicts)
            and ``"overall_maturity"`` (mean domain score rounded to one
            decimal place).
        """
        domains = {
            "governance": self._check_governance(),
            "technical_controls": self._check_technical(),
            "監控": self._check_monitoring(),
            "incident_response": self._check_ir(),
            "訓練": self._check_training(),
        }
        overall = sum(d["score"] for d in domains.values()) / len(domains)
        return {"domains": domains, "overall_maturity": round(overall, 1)}

    def _check_governance(self) -> dict:
        """Score governance: 2.5 points each for a policy and a risk framework."""
        flags = ("ai_security_policy", "risk_framework")
        score = sum(self._enabled(flag) for flag in flags) * 2.5
        return {"score": score, "max": 5.0}

    def _check_technical(self) -> dict:
        """Score technical controls: 1.25 points per active control."""
        controls = ["input_classification", "output_filtering", "rate_limiting", "sandboxing"]
        active = sum(1 for control in controls if self._enabled(control))
        return {"score": active * 1.25, "max": 5.0}

    def _check_monitoring(self) -> dict:
        """Score monitoring: 2.5 points each for monitoring and alerting."""
        flags = ("ai_monitoring", "ai_alerting")
        score = sum(self._enabled(flag) for flag in flags) * 2.5
        return {"score": score, "max": 5.0}

    def _check_ir(self) -> dict:
        """Score incident response: full marks when an AI playbook exists."""
        return {"score": 5.0 if self._enabled("ai_ir_playbook") else 0.0, "max": 5.0}

    def _check_training(self) -> dict:
        """Score training: full marks when AI security training exists."""
        return {"score": 5.0 if self._enabled("ai_security_training") else 0.0, "max": 5.0}
# Future Directions
Several research and industry trends will shape the evolution of this field:
- Formal methods for AI 安全: Development of mathematical frameworks that can provide bounded guarantees about model behavior under 對抗性 conditions
- Automated 紅隊演練 at scale: Continued improvement of automated 測試 tools that can discover novel 漏洞 without human guidance
- AI-assisted 防禦: Using AI systems to detect and respond to attacks on other AI systems, creating a dynamic attack-防禦 ecosystem
- Standardized 評估: Growing adoption of standardized benchmarks (HarmBench, JailbreakBench) that enable consistent measurement of progress
- Regulatory harmonization: Convergence of AI regulatory frameworks across jurisdictions, providing clearer requirements for organizations