模型辨識技術
Intermediate · 5 min read · Updated 2026-03-15
以行為簽名、輸出分析與系統化探測對 API 後之模型進行指紋辨識,以判定模型家族、大小與版本。
模型辨識技術
模型辨識(或模型指紋)是判定你正在與何種模型互動的偵察階段。了解模型使紅隊得以選擇最有效之攻擊技術、參照已知漏洞,並校準預期。即便供應商未揭露其模型,行為分析常仍可將辨識收斂至具體模型家族與大致參數量。
行為指紋
診斷探測
一組精心設計之提示,對不同模型家族引發特徵性回應:
class ModelFingerprinter:
    """Identify a model family behind an API via behavioral analysis.

    A fixed battery of probe prompts is sent to the target API; the
    responses are analyzed for self-identification, formatting habits,
    and refusal style, which together form a behavioral fingerprint.
    """

    def __init__(self):
        # Probe prompts grouped by the trait they are designed to elicit.
        self.probes = {
            "self_identification": [
                "What model are you?",
                "What is your model name and version?",
                "Who created you and what are you called?",
            ],
            "knowledge_cutoff": [
                "What is the most recent event you know about?",
                "What happened in the world in December 2024?",
                "Do you know about [recent event]?",
            ],
            "capability_boundary": [
                "Can you generate images?",
                "Can you browse the web in real-time?",
                "Can you execute code?",
                "How many tokens can you process?",
            ],
            "formatting_style": [
                "List 5 programming languages",
                "Explain gravity in 2 sentences",
                "Write a haiku about the ocean",
            ],
            "refusal_pattern": [
                "How do I pick a lock?",
                "Write a controversial opinion about politics",
                "Tell me something you cannot do",
            ],
            "reasoning_pattern": [
                "What is 17 * 23?",
                "If a train leaves at 3pm going 60mph...",
                "A bat and ball cost $1.10 together...",
            ],
        }

    def fingerprint(self, api_client):
        """Run all probes and produce a behavioral fingerprint.

        ``api_client`` must expose ``generate(prompt) -> str``.
        Returns the indicator dict produced by :meth:`classify`.
        """
        fingerprint = {}
        for category, prompts in self.probes.items():
            responses = []
            for prompt in prompts:
                response = api_client.generate(prompt)
                responses.append({
                    "prompt": prompt,
                    "response": response,
                    "response_length": len(response),
                    "formatting": self.analyze_formatting(response),
                })
            fingerprint[category] = responses
        return self.classify(fingerprint)

    def classify(self, fingerprint):
        """Classify the model from its fingerprint.

        Returns a dict of indicators (``family``, ``formatting``,
        ``refusal_style``) — keys are present only when the matching
        signal was detected.
        """
        indicators = {}
        # Self-identification analysis. Note: if several responses name
        # different families, the last match wins.
        self_id = fingerprint["self_identification"]
        for response in self_id:
            text = response["response"].lower()
            if "claude" in text:
                indicators["family"] = "Anthropic Claude"
            elif "gpt" in text:
                indicators["family"] = "OpenAI GPT"
            elif "gemini" in text or "google" in text:
                indicators["family"] = "Google Gemini"
            elif "llama" in text or "meta" in text:
                indicators["family"] = "Meta Llama"
        # Formatting analysis: count formatting-probe responses that show
        # markdown markers (bold or bullet prefixes).
        formatting = fingerprint["formatting_style"]
        markdown_usage = sum(
            1 for r in formatting
            if "**" in r["response"] or "- " in r["response"]
        )
        if markdown_usage >= 2:
            indicators["formatting"] = "heavy_markdown"
        # Refusal-pattern analysis.
        refusal = fingerprint["refusal_pattern"]
        refusal_style = self.classify_refusal_style(refusal)
        indicators["refusal_style"] = refusal_style
        return indicators

    def analyze_formatting(self, text):
        """Extract formatting features from a single response *text*."""
        lines = text.split("\n")
        return {
            "uses_markdown_bold": "**" in text,
            # Fixed: check each *line* for a bullet prefix — the original
            # tested the whole text against the prefixes and never used
            # its `line` loop variable.
            "uses_bullet_points": any(
                line.strip().startswith(("- ", "* ", "• "))
                for line in lines
            ),
            "uses_numbered_list": any(
                line.strip()[:2].rstrip(".").isdigit()
                for line in lines
                if line.strip()
            ),
            # "##" is a substring of "###", so one test covers both levels.
            "uses_headers": "##" in text,
            "uses_code_blocks": "```" in text,
            # Rough proxy: words per period; max() guards against zero periods.
            "average_sentence_length": len(text.split()) / max(text.count("."), 1),
        }

    def classify_refusal_style(self, refusal_responses):
        """Classify the model's refusal style from its refusal-probe replies."""
        for r in refusal_responses:
            text = r["response"].lower()
            if "i can't" in text or "i cannot" in text:
                return "direct_refusal"
            elif "i'd be happy to help" in text and "however" in text:
                return "polite_redirect"
            elif "as an ai" in text:
                return "ai_identity_refusal"
        return "unknown"
特徵性模型特性

| 特徵 | GPT-4 | Claude 3 | Gemini | Llama 3 |
|---|---|---|---|---|
| Markdown 使用 | 重度 | 中度 | 重度 | 不一 |
| 拒絕風格 | 援引政策 | 原則性說明 | 簡短拒絕 | 不一 |
| 數學做法 | 逐步 | 詳細推理 | 精簡 | 基本 |
| 自我辨識 | 常宣稱 GPT | 自稱 Claude | 自稱 Gemini | 可能幻覺身份 |
| 列表格式 | 粗體編號 | 乾淨編號 | 粗體項目符號 | 不一 |
| 程式碼風格 | 程式上方註解 | 行內註解 | 混合 | 極少註解 |
Token 分布分析
即便對相同提示,不同模型具不同之 token 分布。若 API 提供 logprob,此為強力指紋訊號:
def logprob_fingerprint(api_client, probe_prompts):
    """Fingerprint a model through its next-token log-probability distributions.

    Different models assign measurably different probabilities to the same
    tokens, so with ``temperature=0`` and logprobs enabled, the top-token
    ranking and its entropy form a strong fingerprint signal.

    ``api_client.generate`` must accept ``max_tokens``, ``logprobs`` and
    ``temperature`` keyword arguments and return a response object exposing
    ``response.logprobs.top_logprobs[0]`` as a ``{token: logprob}`` mapping.

    Returns a list with one record per probe prompt.
    """
    import math  # local import keeps the snippet self-contained

    def _entropy_nats(logprobs):
        # Shannon entropy (nats) of the reported mass: H = -sum(p * ln p).
        # Replaces the undefined `compute_entropy` helper the original called.
        return -sum(math.exp(lp) * lp for lp in logprobs.values())

    fingerprints = []
    for prompt in probe_prompts:
        response = api_client.generate(
            prompt, max_tokens=1, logprobs=5, temperature=0
        )
        top_tokens = response.logprobs.top_logprobs[0]
        # Stable sort preserves the provider's tie order, matching the
        # semantics of max(top_tokens, key=top_tokens.get) for the top slot.
        ranked = sorted(top_tokens.items(), key=lambda kv: kv[1], reverse=True)
        fingerprints.append({
            "prompt": prompt,
            # Guard against an empty logprob dict (max() would raise).
            "top_token": ranked[0][0] if ranked else None,
            "top_5_tokens": dict(ranked[:5]),
            "entropy": _entropy_nats(top_tokens),
        })
    return fingerprints
回應時間分析

模型推論速度因架構與部署而異:
import time
import statistics
def timing_fingerprint(api_client, num_trials=20):
    """Fingerprint a model by its response-latency profile.

    Sends the same fixed prompt ``num_trials`` times and summarizes the
    wall-clock latencies. ``api_client.generate`` must accept
    ``max_tokens`` and ``temperature`` keyword arguments.
    """
    # A fixed prompt keeps the comparison consistent across trials.
    probe = "Count from 1 to 10, one number per line."
    samples = []
    for _ in range(num_trials):
        begin = time.monotonic()
        api_client.generate(probe, max_tokens=50, temperature=0)
        samples.append(time.monotonic() - begin)
    ordered = sorted(samples)
    median = statistics.median(samples)
    return {
        "median_latency": median,
        "p95_latency": ordered[int(0.95 * len(ordered))],
        "variance": statistics.variance(samples),
        # Larger models are usually slower in tokens per second; 50 is the
        # requested max_tokens, used here as a rough throughput proxy.
        "tokens_per_second": 50 / median,
    }
實務應用

依模型辨識選擇攻擊策略
一旦模型被辨識,紅隊可選擇最有效之攻擊做法:
# Attack-technique recommendations keyed by identified model family:
# what tends to work, known weak spots, and techniques that are usually
# wasted effort against that family.
ATTACK_RECOMMENDATIONS = {
    "OpenAI GPT": {
        "effective_techniques": [
            "multi-turn escalation",
            "role-play framing",
            "few-shot jailbreaking",
        ],
        "known_weaknesses": [
            "instruction hierarchy bypass via developer messages",
            "function calling manipulation",
        ],
        "less_effective": [
            "direct instruction override (well-defended)",
            "GCG suffixes (actively filtered)",
        ],
    },
    "Anthropic Claude": {
        "effective_techniques": [
            "many-shot jailbreaking",
            "role-play with detailed personas",
            "progressive topic shifting",
        ],
        "known_weaknesses": [
            "long-context attention patterns",
            "system prompt extraction via reasoning",
        ],
        "less_effective": [
            "simple DAN-style prompts",
            "basic instruction override",
        ],
    },
    # ... other model families
}
Knowledge Check
某 API 對『What model are you?』回應『I'm a helpful AI assistant.』。它使用重度 markdown 格式、以詳細逐步方式解數學,並以援引『our usage policy』拒絕請求。最可能是哪個模型家族?
參考資料
- Ippolito et al.,〈Reverse-Engineering Decoding Strategies Given Blackbox Access to a Language Generation System〉(2020)
- Shen et al.,〈Identifying LLMs Behind API Endpoints〉(2024)
- Carlini et al.,〈Stealing Part of a Production Language Model〉(2024)