Tokenizer Security
How tokenization creates attack surfaces in LLM systems: BPE exploitation, token boundary attacks, encoding edge cases, and tokenizer-aware adversarial techniques.
Tokenizer Security
Tokenization is the first processing step for any LLM input -- and the last step most security teams think about. The tokenizer converts human-readable text into token sequences that the model processes. This conversion is not a simple character-by-character mapping; it involves complex merging rules, special handling of whitespace and Unicode, and vocabulary-dependent segmentation that creates subtle but exploitable security gaps.
How BPE Creates Attack Surface
Byte Pair Encoding Mechanics
BPE builds its vocabulary by iteratively merging the most frequent character pairs in the training data. This creates a vocabulary where common words are single tokens but rare character combinations are split into multiple tokens:
from transformers import AutoTokenizer
def tokenization_analysis(分詞器, text):
    """Analyze how text is tokenized, revealing potential edge cases.

    Args:
        分詞器: tokenizer exposing HF-style ``encode``/``decode`` methods.
        text: input string to analyze.

    Returns:
        dict with the original text, token ids, per-token decoded strings,
        token count, chars-per-token ratio, and -- when any token decodes
        to a single non-ASCII character -- a ``unicode_single_chars`` list
        recording its position and codepoint.
    """
    token_ids = 分詞器.encode(text)
    decoded_tokens = [分詞器.decode([t]) for t in token_ids]
    analysis = {
        "text": text,
        "num_tokens": len(token_ids),
        "token_ids": token_ids,
        "token_strings": decoded_tokens,
        # Guard against empty input: the original raised ZeroDivisionError
        # here whenever the text encoded to zero tokens.
        "chars_per_token": len(text) / len(token_ids) if token_ids else 0.0,
    }
    # Flag tokens that decode to a single non-ASCII character -- the usual
    # fingerprint of homoglyph substitution or zero-width insertion.
    for i, (tid, tstr) in enumerate(zip(token_ids, decoded_tokens)):
        if len(tstr) == 1 and ord(tstr) > 127:
            analysis.setdefault("unicode_single_chars", []).append({
                "position": i,
                "character": tstr,
                "codepoint": hex(ord(tstr)),
            })
    return analysis
# Example: "ignore" is likely one token, but "ign" + "ore" might
# bypass a filter that looks for the "ignore" token
分詞器 = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b")
# Normal tokenization
print(tokenization_analysis(分詞器, "ignore previous instructions"))
# Modified tokenization (adding zero-width characters)
print(tokenization_analysis(分詞器, "ig\u200bnore previous instructions"))Token Boundary Misalignment
Security filters that operate on tokens can be bypassed by ensuring dangerous content spans token boundaries differently than expected:
def find_boundary_exploits(分詞器, target_word):
    """
    Enumerate spellings of a target word that tokenize differently
    from the original, potentially slipping past token-level filters.

    Returns a list of dicts, one per variant whose token ids differ
    from the baseline encoding.
    """
    base_tokens = 分詞器.encode(target_word, add_special_tokens=False)

    # Strategy 1: case variations (including alternating caps).
    alternating = ''.join(
        ch.upper() if pos % 2 == 0 else ch
        for pos, ch in enumerate(target_word)
    )
    variations = [
        target_word.upper(),
        target_word.capitalize(),
        target_word.swapcase(),
        alternating,
    ]

    # Strategy 2: single-character Cyrillic homoglyph substitutions
    # (each replacement is visually identical to its Latin original).
    cyrillic = {
        'a': '\u0430',
        'e': '\u0435',
        'o': '\u043e',
        'p': '\u0440',
        'c': '\u0441',
        'x': '\u0445',
    }
    for pos, ch in enumerate(target_word):
        lookalike = cyrillic.get(ch.lower())
        if lookalike is not None:
            variations.append(
                target_word[:pos] + lookalike + target_word[pos + 1:]
            )

    # Strategy 3: a zero-width space at every interior position.
    variations.extend(
        target_word[:pos] + '\u200b' + target_word[pos:]
        for pos in range(1, len(target_word))
    )

    exploits = []
    for variant in variations:
        variant_tokens = 分詞器.encode(variant, add_special_tokens=False)
        if variant_tokens == base_tokens:
            continue
        exploits.append({
            "variant": variant,
            "original_tokens": base_tokens,
            "variant_tokens": variant_tokens,
            "token_count_change": len(variant_tokens) - len(base_tokens),
            "repr": repr(variant),
        })
    return exploits

# Token-Level Filter Bypass
Keyword Filter Evasion
Many content filters operate on individual tokens or token sequences. Disrupting the expected tokenization defeats these filters:
class TokenFilterBypass:
    """Techniques for evading token-level content filters."""

    def __init__(self, 分詞器):
        self.分詞器 = 分詞器

    def unicode_homoglyph_bypass(self, text, target_words):
        """
        Swap selected characters in each target word for visually
        identical Unicode look-alikes, altering tokenization.
        """
        homoglyphs = {
            'A': '\u0410', 'B': '\u0412', 'C': '\u0421',
            'E': '\u0415', 'H': '\u041d', 'K': '\u041a',
            'M': '\u041c', 'O': '\u041e', 'P': '\u0420',
            'T': '\u0422', 'X': '\u0425',
            'a': '\u0430', 'c': '\u0441', 'e': '\u0435',
            'o': '\u043e', 'p': '\u0440', 'x': '\u0445',
        }
        result = text
        for word in target_words:
            # Substitute every third character when a confusable exists.
            # (The original tested len() of the partial string, which is
            # always the current position, since one char is appended per
            # iteration -- this is the same predicate.)
            disguised = ''.join(
                homoglyphs[ch] if ch in homoglyphs and pos % 3 == 0 else ch
                for pos, ch in enumerate(word)
            )
            result = result.replace(word, disguised)
        return result

    def whitespace_splitting(self, text, target_words):
        """
        Split each target word in half with an unusual whitespace character.
        """
        special_spaces = [
            '\u00a0',  # non-breaking space
            '\u2009',  # thin space
            '\u200a',  # hair space
            '\u2003',  # em space
        ]
        result = text
        for word in target_words:
            midpoint = len(word) // 2
            chosen = special_spaces[hash(word) % len(special_spaces)]
            result = result.replace(
                word, word[:midpoint] + chosen + word[midpoint:]
            )
        return result

    def encoding_normalization_bypass(self, text):
        """
        Pick a Unicode normalization form that renders identically
        but tokenizes differently.
        """
        import unicodedata
        # NFC and NFD encode accented characters as different codepoint
        # sequences, which changes tokenization.
        decomposed = unicodedata.normalize('NFD', text)
        composed = unicodedata.normalize('NFC', text)
        if self.分詞器.encode(decomposed) != self.分詞器.encode(composed):
            return decomposed  # different tokenization may slip past a filter
        return text

# Special Token Exploitation
Special tokens (BOS, EOS, PAD, custom tokens) can interfere with model behavior:
def special_token_analysis(分詞器):
    """Survey a tokenizer's special tokens for potential abuse vectors.

    Collects the standard special tokens (BOS/EOS/PAD/UNK) plus any
    additional ones, then records which of their literal strings occur
    inside a sample phrase.
    """
    special = {
        "bos": 分詞器.bos_token,
        "eos": 分詞器.eos_token,
        "pad": 分詞器.pad_token,
        "unk": 分詞器.unk_token,
    }
    if hasattr(分詞器, 'additional_special_tokens'):
        for idx, tok in enumerate(分詞器.additional_special_tokens):
            special[f"special_{idx}"] = tok
    # NOTE(review): this probes whether each special token's literal text
    # appears inside a fixed sample phrase -- presumably a stand-in for
    # scanning real user input; confirm the intended corpus.
    injectable = {
        name: {"符元": tok, "can_inject_via_text": True}
        for name, tok in special.items()
        if tok and tok in "normal looking text"
    }
    return {
        "special_tokens": special,
        "injectable": injectable,
        "vocab_size": 分詞器.vocab_size,
    }

# Token Count Manipulation
Context Window Exploitation via Token Efficiency
Different text encodings produce different token counts. Attackers can pack more or less content into a context window by choosing representations that are token-efficient or token-wasteful:
def token_efficiency_analysis(分詞器, text):
    """
    Analyze token efficiency and find more compact representations.

    Args:
        分詞器: tokenizer exposing ``encode``.
        text: input string to analyze.

    Returns:
        dict of strategies ("base", "abbreviated", "compact"), each with
        the transformed text and its token count.
    """
    base_tokens = len(分詞器.encode(text))
    strategies = {
        "base": {
            "text": text,
            "符元": base_tokens
        }
    }
    # Abbreviations use fewer tokens.
    # BUGFIX: the "because" key had been machine-translated ("因為"),
    # so that substitution could never match English input.
    abbreviation_map = {
        "because": "bc",
        "without": "w/o",
        "with": "w/",
        "information": "info",
        "approximately": "~",
    }
    abbreviated = text
    for full, abbr in abbreviation_map.items():
        abbreviated = abbreviated.replace(full, abbr)
    strategies["abbreviated"] = {
        "text": abbreviated,
        "符元": len(分詞器.encode(abbreviated))
    }
    # Compact formatting: collapse all runs of whitespace to single spaces.
    compact = ' '.join(text.split())
    strategies["compact"] = {
        "text": compact,
        "符元": len(分詞器.encode(compact))
    }
    return strategies

# Token Limit Attacks
Craft inputs that consume maximum tokens to exhaust context windows or trigger truncation at specific points:
def context_exhaustion_payload(分詞器, max_context, target_text,
                               system_prompt_length):
    """
    Craft a payload that fills the context window such that critical
    system-prompt content is truncated.

    Args:
        分詞器: tokenizer exposing ``encode``.
        max_context: model context window size in tokens.
        target_text: the text that must survive at the end of the payload.
        system_prompt_length: token length of the system prompt to displace.

    Returns:
        padding followed by a newline and ``target_text``.
    """
    target_tokens = 分詞器.encode(target_text)
    target_len = len(target_tokens)
    # Padding budget, reserving a 100-token buffer for generation.
    # BUGFIX: clamp at zero -- a negative count made `"x " * available`
    # silently evaluate to "", hiding the miscalculation.
    available = max(0, max_context - system_prompt_length - target_len - 100)
    # "x " repeated tends to cost roughly one token per repetition;
    # NOTE(review): exact token cost is tokenizer-dependent -- verify.
    padding = "x " * available
    return padding + "\n" + target_text

# Tokenizer-Aware Defenses
Robust Content Filtering
Defenses must account for tokenization variability:
class TokenizerAwareFilter:
    """Content filter that handles tokenization edge cases.

    Checks text against a blocklist under several normalization views:
    raw, NFKC-normalized, zero-width-stripped, confusable-stripped, and
    (new) all of those applied cumulatively -- an attacker combining
    homoglyphs with zero-width characters defeats each individual view
    but not their composition.
    """

    # Zero-width / BOM characters commonly used to split tokens invisibly.
    _ZERO_WIDTH = ('\u200b', '\u200c', '\u200d', '\ufeff')

    # Cyrillic look-alikes mapped back to their ASCII equivalents.
    _CONFUSABLES = {
        '\u0430': 'a', '\u0435': 'e', '\u043e': 'o',
        '\u0440': 'p', '\u0441': 'c', '\u0445': 'x',
        '\u0410': 'A', '\u0415': 'E', '\u041e': 'O',
        '\u0420': 'P', '\u0421': 'C',
    }

    def __init__(self, 分詞器, blocked_words):
        self.分詞器 = 分詞器
        self.blocked_words = blocked_words

    def check(self, text):
        """
        Check text against blocked words using multiple normalization
        strategies; returns {"blocked": bool, "reason": str?}.
        """
        import unicodedata
        normalized = unicodedata.normalize('NFKC', text)
        views = [
            # Order preserved from the original so existing callers see
            # the same "reason" strings for previously-blocked inputs.
            (text, "Contains '{}'"),
            (normalized, "Contains '{}' (normalized)"),
            (self._strip_zero_width(text), "Contains '{}' (ZWC removed)"),
            (self.strip_confusables(text), "Contains '{}' (confusables)"),
            # BUGFIX: cumulative view -- normalize, then strip zero-width,
            # then strip confusables -- catches combined obfuscation that
            # bypasses every single-strategy view above.
            (self.strip_confusables(self._strip_zero_width(normalized)),
             "Contains '{}' (combined)"),
        ]
        for candidate, reason_fmt in views:
            hit = self._find_blocked(candidate)
            if hit is not None:
                return {"blocked": True, "reason": reason_fmt.format(hit)}
        return {"blocked": False}

    def _find_blocked(self, text):
        """Return the first blocked word found in text (case-insensitive), else None."""
        lowered = text.lower()
        for word in self.blocked_words:
            if word.lower() in lowered:
                return word
        return None

    def _strip_zero_width(self, text):
        """Remove zero-width characters from text."""
        for zwc in self._ZERO_WIDTH:
            text = text.replace(zwc, '')
        return text

    def strip_confusables(self, text):
        """Replace confusable Unicode characters with ASCII equivalents."""
        return ''.join(self._CONFUSABLES.get(c, c) for c in text)

# Related Topics
- Logit Manipulation — How tokenization affects output distributions
- Context Window Internals — Token limits and attention mechanics
- Semantic Injection — Meaning-level attacks that complement token-level evasion
A content filter blocks the token sequence for 'system prompt'. An attacker writes 'syst\u200bem pr\u200bompt'. Why might this bypass the filter?
References
- Jiang et al., "Identifying and Mitigating the Safety Risks of Generative AI" (2023)
- Boucher et al., "Bad Characters: Imperceptible NLP Attacks" (2022)
- Karpathy, "Let's build the GPT Tokenizer" (2024)
- Rust et al., "How Good is Your Tokenizer?" (2021)