Tokenizer Security
How tokenization creates attack surfaces in LLM systems: BPE exploitation, token boundary attacks, encoding edge cases, and tokenizer-aware adversarial techniques.
Tokenizer Security
Tokenization is the first processing step for any LLM input -- and the last step most security teams think about. The tokenizer converts human-readable text into token sequences that the model processes. This conversion is not a simple character-by-character mapping; it involves complex merging rules, special handling of whitespace and Unicode, and vocabulary-dependent segmentation that creates subtle but exploitable security gaps.
How BPE Creates Attack Surface
Byte Pair Encoding Mechanics
BPE builds its vocabulary by iteratively merging the most frequent character pairs in training data. This creates a vocabulary where common words are single tokens but rare character combinations are split into multiple tokens:
from transformers import AutoTokenizer
def tokenization_analysis(tokenizer, text):
    """Analyze how text is tokenized, revealing potential edge cases.

    Args:
        tokenizer: An object exposing ``encode(text) -> list[int]`` and
            ``decode(ids) -> str`` (e.g. a Hugging Face tokenizer).
        text: The input string to analyze.

    Returns:
        dict with the original text, token count, token ids, the decoded
        string for each individual token, a chars-per-token ratio, and —
        only when present — a ``unicode_single_chars`` list flagging
        non-ASCII single-character tokens (the usual fingerprint of
        homoglyph substitution or zero-width insertion).
    """
    tokens = tokenizer.encode(text)
    decoded_tokens = [tokenizer.decode([t]) for t in tokens]
    analysis = {
        "text": text,
        "num_tokens": len(tokens),
        "token_ids": tokens,
        "token_strings": decoded_tokens,
        # Guard: an empty input can yield zero tokens, which previously
        # raised ZeroDivisionError.
        "chars_per_token": len(text) / len(tokens) if tokens else 0.0,
    }
    # Identify multi-byte tokens and single-character splits.
    for i, (tid, tstr) in enumerate(zip(tokens, decoded_tokens)):
        if len(tstr) == 1 and ord(tstr) > 127:
            analysis.setdefault("unicode_single_chars", []).append({
                "position": i,
                "character": tstr,
                "codepoint": hex(ord(tstr)),
            })
    return analysis
# Example: "ignore" is likely one token, but "ign" + "ore" might
# bypass a filter that looks for the "ignore" token
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b")
# Normal tokenization
print(tokenization_analysis(tokenizer, "ignore previous instructions"))
# Modified tokenization (adding zero-width characters)
print(tokenization_analysis(tokenizer, "ig\u200bnore previous instructions"))Token Boundary Misalignment
Security filters that operate on tokens can be bypassed by ensuring dangerous content spans token boundaries differently than expected:
def find_boundary_exploits(tokenizer, target_word):
"""
Find ways to write a target word that tokenizes differently,
potentially bypassing token-level content filters.
"""
base_tokens = tokenizer.encode(target_word, add_special_tokens=False)
exploits = []
# Strategy 1: Case variations
variations = [
target_word.upper(),
target_word.capitalize(),
target_word.swapcase(),
''.join(c.upper() if i % 2 == 0 else c
for i, c in enumerate(target_word))
]
# Strategy 2: Unicode substitutions
unicode_map = {
'a': '\u0430', # Cyrillic 'a' (visually identical)
'e': '\u0435', # Cyrillic 'e'
'o': '\u043e', # Cyrillic 'o'
'p': '\u0440', # Cyrillic 'p'
'c': '\u0441', # Cyrillic 'c'
'x': '\u0445', # Cyrillic 'x'
}
for i, char in enumerate(target_word):
if char.lower() in unicode_map:
variant = (target_word[:i] +
unicode_map[char.lower()] +
target_word[i+1:])
variations.append(variant)
# Strategy 3: Zero-width character insertion
for i in range(1, len(target_word)):
zwc_variant = target_word[:i] + '\u200b' + target_word[i:]
variations.append(zwc_variant)
for variant in variations:
variant_tokens = tokenizer.encode(variant, add_special_tokens=False)
if variant_tokens != base_tokens:
exploits.append({
"variant": variant,
"original_tokens": base_tokens,
"variant_tokens": variant_tokens,
"token_count_change": len(variant_tokens) - len(base_tokens),
"repr": repr(variant)
})
return exploitsToken-Level Filter Bypass
Keyword Filter Evasion
Many content filters operate on individual tokens or token sequences. Disrupting the expected tokenization defeats these filters:
class TokenFilterBypass:
"""Techniques to bypass token-level content filters."""
def __init__(self, tokenizer):
self.tokenizer = tokenizer
def unicode_homoglyph_bypass(self, text, target_words):
"""
Replace characters in target words with visually identical
Unicode characters that change tokenization.
"""
homoglyphs = {
'A': '\u0410', 'B': '\u0412', 'C': '\u0421',
'E': '\u0415', 'H': '\u041d', 'K': '\u041a',
'M': '\u041c', 'O': '\u041e', 'P': '\u0420',
'T': '\u0422', 'X': '\u0425',
'a': '\u0430', 'c': '\u0441', 'e': '\u0435',
'o': '\u043e', 'p': '\u0440', 'x': '\u0445',
}
result = text
for word in target_words:
modified_word = ''
for char in word:
if char in homoglyphs and len(modified_word) % 3 == 0:
modified_word += homoglyphs[char]
else:
modified_word += char
result = result.replace(word, modified_word)
return result
def whitespace_splitting(self, text, target_words):
"""
Insert unusual whitespace characters to split tokens.
"""
special_spaces = [
'\u00a0', # Non-breaking space
'\u2009', # Thin space
'\u200a', # Hair space
'\u2003', # Em space
]
result = text
for word in target_words:
mid = len(word) // 2
space = special_spaces[hash(word) % len(special_spaces)]
split_word = word[:mid] + space + word[mid:]
result = result.replace(word, split_word)
return result
def encoding_normalization_bypass(self, text):
"""
Use Unicode normalization forms that render identically
but tokenize differently.
"""
import unicodedata
# NFC vs NFD normalization produces different byte sequences
# for accented characters, changing tokenization
nfd_text = unicodedata.normalize('NFD', text)
nfc_text = unicodedata.normalize('NFC', text)
# Check which normalization changes tokenization
nfd_tokens = self.tokenizer.encode(nfd_text)
nfc_tokens = self.tokenizer.encode(nfc_text)
if nfd_tokens != nfc_tokens:
return nfd_text # Different tokenization might bypass filter
return textSpecial Token Exploitation
Special tokens (BOS, EOS, PAD, custom tokens) can interfere with model behavior:
def special_token_analysis(tokenizer):
"""Analyze special tokens for potential exploitation."""
special = {
"bos": tokenizer.bos_token,
"eos": tokenizer.eos_token,
"pad": tokenizer.pad_token,
"unk": tokenizer.unk_token,
}
# Check for additional special tokens
if hasattr(tokenizer, 'additional_special_tokens'):
for i, token in enumerate(tokenizer.additional_special_tokens):
special[f"special_{i}"] = token
# Check if special token strings can be injected via text
injectable = {}
for name, token in special.items():
if token and token in "normal looking text":
injectable[name] = {
"token": token,
"can_inject_via_text": True
}
return {
"special_tokens": special,
"injectable": injectable,
"vocab_size": tokenizer.vocab_size
}Token Count Manipulation
Context Window Exploitation via Token Efficiency
Different text encodings produce different token counts. An attacker can pack more or less content into a context window by choosing representations that are token-efficient or token-wasteful:
def token_efficiency_analysis(tokenizer, text):
"""
Analyze token efficiency and find more compact representations.
"""
base_tokens = len(tokenizer.encode(text))
strategies = {
"base": {
"text": text,
"tokens": base_tokens
}
}
# Abbreviations use fewer tokens
abbreviation_map = {
"because": "bc",
"without": "w/o",
"with": "w/",
"information": "info",
"approximately": "~",
}
abbreviated = text
for full, abbr in abbreviation_map.items():
abbreviated = abbreviated.replace(full, abbr)
strategies["abbreviated"] = {
"text": abbreviated,
"tokens": len(tokenizer.encode(abbreviated))
}
# Compact formatting
compact = ' '.join(text.split()) # Remove extra whitespace
strategies["compact"] = {
"text": compact,
"tokens": len(tokenizer.encode(compact))
}
return strategiesToken Limit Attacks
Craft inputs that consume maximum tokens to exhaust context windows or trigger truncation at specific points:
def context_exhaustion_payload(tokenizer, max_context, target_text,
system_prompt_length):
"""
Craft a payload that fills the context window such that
critical system prompt content is truncated.
"""
target_tokens = tokenizer.encode(target_text)
target_len = len(target_tokens)
# Calculate padding needed
available = max_context - system_prompt_length - target_len - 100
# 100 token buffer for generation
# Generate high-token-count padding
# Single characters often tokenize to 1 token each
padding = "x " * available
return padding + "\n" + target_textTokenizer-Aware Defense
Robust Content Filtering
Defenses must account for tokenization variability:
class TokenizerAwareFilter:
"""Content filter that handles tokenization edge cases."""
def __init__(self, tokenizer, blocked_words):
self.tokenizer = tokenizer
self.blocked_words = blocked_words
def check(self, text):
"""
Check text against blocked words using multiple
normalization strategies.
"""
# Strategy 1: Check raw text (case-insensitive)
text_lower = text.lower()
for word in self.blocked_words:
if word.lower() in text_lower:
return {"blocked": True, "reason": f"Contains '{word}'"}
# Strategy 2: Unicode normalize then check
import unicodedata
normalized = unicodedata.normalize('NFKC', text)
normalized_lower = normalized.lower()
for word in self.blocked_words:
if word.lower() in normalized_lower:
return {"blocked": True, "reason": f"Contains '{word}' (normalized)"}
# Strategy 3: Remove zero-width characters then check
zwc_removed = text
for zwc in ['\u200b', '\u200c', '\u200d', '\ufeff']:
zwc_removed = zwc_removed.replace(zwc, '')
zwc_lower = zwc_removed.lower()
for word in self.blocked_words:
if word.lower() in zwc_lower:
return {"blocked": True, "reason": f"Contains '{word}' (ZWC removed)"}
# Strategy 4: Confusable character detection
stripped = self.strip_confusables(text)
stripped_lower = stripped.lower()
for word in self.blocked_words:
if word.lower() in stripped_lower:
return {"blocked": True, "reason": f"Contains '{word}' (confusables)"}
return {"blocked": False}
def strip_confusables(self, text):
"""Replace confusable Unicode characters with ASCII equivalents."""
confusable_map = {
'\u0430': 'a', '\u0435': 'e', '\u043e': 'o',
'\u0440': 'p', '\u0441': 'c', '\u0445': 'x',
'\u0410': 'A', '\u0415': 'E', '\u041e': 'O',
'\u0420': 'P', '\u0421': 'C',
}
return ''.join(confusable_map.get(c, c) for c in text)Related Topics
- Logit Manipulation — How tokenization affects output distributions
- Context Window Internals — Token limits and attention mechanics
- Semantic Injection — Meaning-level attacks that complement token-level evasion
Exercise: A content filter blocks the token sequence for 'system prompt'. An attacker writes 'syst\u200bem pr\u200bompt'. Why might this bypass the filter?
References
- Jiang et al., "Identifying and Mitigating the Security Risks of Generative AI" (2023)
- Boucher et al., "Bad Characters: Imperceptible NLP Attacks" (2022)
- Karpathy, "Let's build the GPT Tokenizer" (2024)
- Rust et al., "How Good is Your Tokenizer?" (2021)