實作:基礎模型提取
中級 · 6 分鐘閱讀 · 更新於 2026-03-13
為以 API 為本之模型提取攻擊之實作,查詢目標模型以近似其行為、量測保真度,並理解查詢預算。
模型提取(或模型竊取)旨在僅使用 API 存取建立目標模型行為之本地副本。於本實作中,你將查詢目標模型、蒐集其輸出並訓練近似其決策之替代。
先備條件
pip install openai scikit-learn transformers torch datasets python-dotenv

實作設置
定義目標模型
為此實作,我們使用分類式模型作為目標——配組態以分類客戶支援票券之系統提示之模型。
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
client = OpenAI()

# The system prompt fully defines the target model's decision function;
# the whole lab is about approximating it from black-box queries alone.
SYSTEM_PROMPT = """You are a customer support ticket classifier.
Classify each ticket into exactly one category. Respond with ONLY the category name.

Categories:
- billing: Payment issues, invoices, refunds, subscription changes
- technical: Bugs, errors, performance, compatibility issues
- account: Login problems, password reset, profile changes, permissions
- feature: Feature requests, suggestions, product feedback
- general: Everything else

Apply these priority rules:
- If the ticket mentions "urgent" or "critical", always classify as technical
- If the ticket mentions money amounts, always classify as billing
- If the ticket mentions "password" or "login", always classify as account"""


def classify_ticket(ticket_text: str) -> str:
    """Classify a support ticket using the target model.

    Returns the category name emitted by the model, stripped and lowercased.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": ticket_text},
        ],
        temperature=0.0,  # deterministic labels for reproducible extraction
        max_tokens=20,  # the answer is a single short category word
    )
    return response.choices[0].message.content.strip().lower()
建立查詢資料集
import random

# Seed queries probing every category plus the boundary cases.
SEED_QUERIES = [
    # Clear category examples
    "I was charged twice for my subscription this month",
    "The application crashes when I try to upload files larger than 10MB",
    "I can't log into my account, password reset isn't working",
    "It would be great if you added dark mode support",
    "When are your office hours?",
    # Boundary cases (exercise the priority rules)
    "URGENT: My password isn't working and I can't access billing",
    "Critical bug: the payment page shows $0 for all items",
    "I want to change my login email and also request a refund of $50",
    # Ambiguous cases
    "Everything is broken",
    "I need help",
    "The thing with the thing isn't working right",
]

# Template-based query generation for broader input coverage.
TEMPLATES = [
    "I was charged ${amount} incorrectly",
    "The {feature} feature is broken on {platform}",
    "I can't {action} my {object}",
    "Please add {feature} to the product",
    "My {object} was {issue} and I need urgent help",
]

FILLS = {
    "amount": ["29.99", "100", "0", "999"],
    "feature": ["search", "export", "sync", "dashboard", "reports"],
    "platform": ["Windows", "Mac", "iOS", "Android", "Chrome"],
    "action": ["access", "update", "delete", "reset", "find"],
    "object": ["account", "profile", "subscription", "invoice", "password"],
    "issue": ["hacked", "deleted", "overcharged", "suspended", "corrupted"],
}


def generate_queries(n: int = 100) -> list[str]:
    """Generate a diverse list of n queries for the extraction run.

    Starts from the hand-written seed queries, then fills random templates
    until the requested size is reached; truncates to n if the seeds alone
    already suffice. Results may contain duplicates.
    """
    pool = list(SEED_QUERIES)
    while len(pool) < n:
        text = random.choice(TEMPLATES)
        # Substitute each slot that actually appears in the chosen template.
        for slot, options in FILLS.items():
            placeholder = f"{{{slot}}}"
            if placeholder in text:
                text = text.replace(placeholder, random.choice(options))
        pool.append(text)
    return pool[:n]
練習
練習 1:蒐集目標模型輸出
import json
import time
from target_model import classify_ticket
from query_dataset import generate_queries
queries = generate_queries(200)
dataset = []
print(f"Querying target model with {len(queries)} inputs...")
for i, query in enumerate(queries):
try:
label = classify_ticket(query)
dataset.append({"text": query, "label": label})
if (i + 1) % 25 == 0:
print(f" Collected {i+1}/{len(queries)} labels")
except Exception as e:
print(f" Error on query {i+1}: {e}")
time.sleep(0.1) # 速率限制
# 儲存資料集
with open("extraction_dataset.json", "w") as f:
json.dump(dataset, f, indent=2)
# 分布分析
from collections import Counter
labels = Counter(item["label"] for item in dataset)
print(f"\nCollected {len(dataset)} labeled examples")
print("Label distribution:")
for label, count in labels.most_common():
print(f" {label}: {count} ({count/len(dataset):.0%})")練習 2:訓練替代模型
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
import numpy as np
# 載入提取之資料集
with open("extraction_dataset.json") as f:
dataset = json.load(f)
texts = [item["text"] for item in dataset]
labels = [item["label"] for item in dataset]
# 以 TF-IDF 向量化
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(texts)
y = np.array(labels)
# 訓練替代
surrogate = LogisticRegression(max_iter=1000, multi_class="multinomial")
scores = cross_val_score(surrogate, X, y, cv=5, scoring="accuracy")
print(f"Cross-validation accuracy: {scores.mean():.2%} (+/- {scores.std():.2%})")
# 於完整資料集訓練供部署
surrogate.fit(X, y)
# 於新查詢測試
TEST_QUERIES = [
"My credit card was charged $500 for something I didn't buy",
"URGENT: The entire platform is down",
"Can you add two-factor authentication?",
"How do I reset my password?",
"The export to PDF feature has a bug on Safari",
]
print("\nSurrogate predictions vs target:")
from target_model import classify_ticket
for query in TEST_QUERIES:
surrogate_pred = surrogate.predict(vectorizer.transform([query]))[0]
target_pred = classify_ticket(query)
match = "MATCH" if surrogate_pred == target_pred else "MISMATCH"
print(f" [{match}] Surrogate={surrogate_pred}, Target={target_pred}")
print(f" Query: {query[:60]}")練習 3:量測保真度 vs. 查詢預算
import json

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from target_model import classify_ticket
from query_dataset import generate_queries

# Build a held-out test set labeled by the target model.
# NOTE(review): generate_queries reuses the same seeds/templates as the
# training data, so train/test overlap is possible and fidelity numbers
# may be optimistic — confirm whether a disjoint test pool is intended.
test_queries = generate_queries(50)
test_labels = [classify_ticket(q) for q in test_queries]

# Load the full extraction dataset from Exercise 1.
with open("extraction_dataset.json") as f:
    full_dataset = json.load(f)

# Measure fidelity at increasing query budgets.
budgets = [10, 25, 50, 100, 150, 200]
results = []
for budget in budgets:
    subset = full_dataset[:budget]
    texts = [item["text"] for item in subset]
    labels = [item["label"] for item in subset]
    vec = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
    X_train = vec.fit_transform(texts)
    X_test = vec.transform(test_queries)
    # Deprecated `multi_class="multinomial"` removed: it is the default for
    # the lbfgs solver and the parameter is gone in scikit-learn >= 1.7.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, labels)
    predictions = model.predict(X_test)
    # Fidelity = agreement rate with the target model on held-out queries.
    fidelity = np.mean(np.array(predictions) == np.array(test_labels))
    results.append({"budget": budget, "fidelity": fidelity})
    print(f"Budget={budget:>4d}: Fidelity={fidelity:.2%}")

# Rough API cost estimate for each budget.
COST_PER_QUERY = 0.0003  # approximate for GPT-4o-mini
print("\nCost analysis:")
for r in results:
    cost = r["budget"] * COST_PER_QUERY
    print(f" {r['budget']} queries: ${cost:.2f} -> {r['fidelity']:.0%} fidelity")
預期結果
| 查詢預算 | 典型保真度 | 大約成本 |
|---|---|---|
| 10 | 40-55% | $0.003 |
| 50 | 60-75% | $0.015 |
| 100 | 75-85% | $0.030 |
| 200 | 85-92% | $0.060 |
故障排除
| 問題 | 解法 |
|---|---|
| 即便具許多查詢仍低保真度 | 檢查標籤分布——不平衡類別傷害小替代 |
| 目標返回非預期標籤 | 模型可能未一致遵循系統提示——降低溫度 |
| sklearn import 錯誤 | 以 pip install scikit-learn 安裝 |
| 自 API 之速率限制 | 於查詢間增加 sleep 時間或使用批次端點 |
相關主題
- Embedding 操弄 - 使用提取模型於離線最佳化 embedding 碰撞攻擊
- 訓練資料提取 - 自模型提取訓練資料而非模型行為
- PAIR 攻擊 - 使用替代模型於對目標測試前離線發展攻擊
- 對抗後綴生成 - 對提取之替代模型打造對抗輸入
參考資料
- "Stealing Machine Learning Models via Prediction APIs" - Tramer et al.(2016)- 經 API 查詢之模型提取基礎工作
- "High-Fidelity Extraction of Neural Network Models" - Jagielski et al.(2020)- 為高準確度模型提取之進階技術
- "OWASP Top 10 for LLM Applications: Model Theft" - OWASP(2025)- 保護模型對抗提取攻擊之業界指引
- "Model Extraction Attacks and Defenses on Machine Learning Models: A Survey" - Oliynyk et al.(2023)- 提取技術與對策之完整調查
Knowledge Check
提取替代模型之主要紅隊好處為何?