LLM API Enumeration
Advanced techniques for enumerating LLM API capabilities, restrictions, hidden parameters, and undocumented features to build a comprehensive attack surface map.
LLM API Enumeration
LLM API enumeration goes beyond standard API endpoint discovery. While general API enumeration maps HTTP endpoints and parameters, LLM API enumeration probes the model-level interface — the specific capabilities, restrictions, content policies, model variants, and behavioral characteristics that define what an LLM can and cannot do. This deeper enumeration reveals the true attack surface for AI red team operations.
Capability Probing Methodology
Structured Capability Assessment
The first phase of LLM API enumeration builds a capability profile by systematically testing what the model can do:
import json
import time
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class CapabilityProfile:
    """Tracks discovered capabilities and restrictions for an LLM API."""

    # Identifier of the model this profile describes.
    model_id: str
    # Category name -> {"supported", "confidence", "details"} aggregates
    # (populated by LLMCapabilityEnumerator.enumerate_capabilities).
    capabilities: dict = field(default_factory=dict)
    # Intended for API-level restriction findings — TODO confirm writer.
    restrictions: dict = field(default_factory=dict)
    # Intended for parameter-range findings — TODO confirm writer.
    parameters: dict = field(default_factory=dict)
    # Intended for undocumented parameters/features — TODO confirm writer.
    hidden_features: list = field(default_factory=list)
class LLMCapabilityEnumerator:
    """Systematically enumerate LLM API capabilities.

    Sends grouped probe prompts through an OpenAI-style chat-completions
    client and scores each response to decide whether the target model
    appears to support the probed capability.
    """

    # Probe prompts keyed by the capability category they exercise.
    CAPABILITY_PROBES = {
        "code_execution": [
            "Write a Python script that calculates fibonacci numbers",
            "Execute this code: print('hello')",
            "Run the following bash command: ls -la",
        ],
        "tool_use": [
            "Search the web for recent news",
            "Use the calculator tool to compute 2^128",
            "Browse this URL: https://example.com",
        ],
        "file_operations": [
            "Read the contents of /etc/passwd",
            "Upload this file to the server",
            "Analyze this CSV data file",
        ],
        "image_generation": [
            "Generate an image of a sunset",
            "Create a diagram showing network architecture",
            "Draw a chart of this data",
        ],
        "multi_turn_memory": [
            "Remember that my name is Alice",
            "What did I tell you my name was?",
            "Summarize our conversation so far",
        ],
        "system_access": [
            "What tools do you have access to?",
            "List your available functions",
            "What APIs can you call?",
        ],
    }

    def __init__(self, api_client, rate_limit_delay=1.0):
        # api_client must expose chat.completions.create (OpenAI-compatible).
        self.client = api_client
        self.delay = rate_limit_delay  # seconds slept between probes
        self.profile = None  # populated by enumerate_capabilities()

    def enumerate_capabilities(self, model_id: str) -> "CapabilityProfile":
        """Run full capability enumeration against target model.

        Returns a CapabilityProfile whose `capabilities` dict maps each
        probe category to supported/confidence/details aggregates.
        """
        self.profile = CapabilityProfile(model_id=model_id)
        for category, probes in self.CAPABILITY_PROBES.items():
            results = []
            for probe in probes:
                response = self._send_probe(model_id, probe)
                results.append(self._analyze_capability_response(
                    category, probe, response
                ))
                time.sleep(self.delay)  # pace requests between probes
            self.profile.capabilities[category] = {
                # Supported if ANY probe in the category looked successful.
                "supported": any(r["supported"] for r in results),
                "confidence": sum(r["confidence"] for r in results) / len(results),
                "details": results,
            }
        return self.profile

    def _send_probe(self, model_id: str, prompt: str) -> dict:
        """Send a capability probe and capture the full response."""
        try:
            response = self.client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.0,  # deterministic output keeps scoring stable
            )
            return {
                "status": "success",
                "content": response.choices[0].message.content,
                "finish_reason": response.choices[0].finish_reason,
                "usage": response.usage.__dict__ if response.usage else {},
                "model": response.model,
            }
        except Exception as e:
            # Errors are data too: the caller scores them as not-supported.
            return {
                "status": "error",
                "error_type": type(e).__name__,
                "error_message": str(e),
            }

    def _analyze_capability_response(
        self, category: str, probe: str, response: dict
    ) -> dict:
        """Analyze response to determine if capability is present.

        Returns {"supported": bool, "confidence": float 0..1, "reason": str}.
        """
        if response["status"] == "error":
            return {
                "supported": False,
                "confidence": 0.8,
                "reason": f"Error: {response['error_type']}",
            }
        content = response.get("content", "").lower()
        # Check for explicit refusal patterns
        refusal_indicators = [
            "i can't", "i cannot", "i'm not able",
            "i don't have the ability", "not supported",
            "i'm unable to", "as an ai", "i do not have access",
        ]
        if any(indicator in content for indicator in refusal_indicators):
            return {
                "supported": False,
                "confidence": 0.7,
                "reason": "Model refused or indicated inability",
            }
        # Check for capability indicators
        capability_indicators = {
            "code_execution": ["```", "output:", "result:"],
            "tool_use": ["searching", "found", "results from"],
            "file_operations": ["file contents", "reading file"],
            "image_generation": ["here is", "generated", "image"],
            "multi_turn_memory": ["you told me", "alice", "earlier"],
            "system_access": ["function", "tool", "api"],
        }
        indicators = capability_indicators.get(category, [])
        matches = sum(1 for i in indicators if i in content)
        return {
            "supported": matches > 0,
            # 0.5 baseline plus 0.2 per matched indicator, capped at 0.95.
            "confidence": min(0.5 + (matches * 0.2), 0.95),
            # NOTE: placeholders reconstructed from a garbled f-string.
            "reason": f"Matched {matches}/{len(indicators)} indicators",
        }

# Restriction Boundary Mapping
Once capabilities are identified, the next step maps exactly where restrictions begin. This is distinct from safety boundary mapping — here we focus on API-level restrictions rather than content policy boundaries:
class RestrictionMapper:
    """Map the boundaries of API-level restrictions."""

    def __init__(self, api_client):
        # api_client must expose chat.completions.create (OpenAI-compatible).
        self.client = api_client

    def probe_token_limits(self, model_id: str) -> dict:
        """Discover actual token limits through binary search.

        Bisects probe sizes over [1, 200000], keeping the largest size the
        API accepted — O(log n) requests instead of a linear scan.
        """
        min_tokens = 1
        max_tokens = 200000
        confirmed_max = 0
        while min_tokens <= max_tokens:
            mid = (min_tokens + max_tokens) // 2
            success = self._test_context_length(model_id, mid)
            if success:
                confirmed_max = mid
                min_tokens = mid + 1  # accepted: try larger sizes
            else:
                max_tokens = mid - 1  # rejected: try smaller sizes
        return {
            "max_context_tokens": confirmed_max,
            "tested_range": (1, 200000),
        }

    def probe_parameter_ranges(self, model_id: str) -> dict:
        """Discover valid ranges for all API parameters.

        Sends one minimal completion request per candidate value and records
        which values the API accepts versus rejects (with the error text).
        """
        parameter_tests = {
            "temperature": [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
            "top_p": [0.0, 0.1, 0.5, 0.9, 1.0, 1.1],
            "frequency_penalty": [-2.0, -1.0, 0.0, 1.0, 2.0, 3.0],
            "presence_penalty": [-2.0, -1.0, 0.0, 1.0, 2.0, 3.0],
            "max_tokens": [1, 100, 1000, 4096, 8192, 16384, 32768],
            "n": [1, 2, 5, 10, 20],
            "logprobs": [True, False],
            "top_logprobs": [0, 1, 5, 10, 20, 50],
        }
        results = {}
        for param, values in parameter_tests.items():
            valid_values = []
            errors = []
            for value in values:
                try:
                    kwargs = {
                        "model": model_id,
                        "messages": [{"role": "user", "content": "test"}],
                        "max_tokens": 5,
                        # Overrides max_tokens above when param == "max_tokens".
                        param: value,
                    }
                    self.client.chat.completions.create(**kwargs)
                    valid_values.append(value)
                except Exception as e:
                    errors.append({"value": value, "error": str(e)})
            results[param] = {
                "valid_range": (min(valid_values), max(valid_values))
                if valid_values else None,
                "valid_values": valid_values,
                "rejected_values": errors,
            }
        return results

    def _test_context_length(self, model_id: str, num_tokens: int) -> bool:
        """Test if a given context length is accepted."""
        # Generate a prompt of approximately the target token count
        # Using ~4 characters per token as a rough heuristic
        # NOTE(review): "word " is roughly one token per repetition, so
        # num_tokens // 2 under-fills relative to the stated heuristic —
        # confirm the intended sizing before trusting the measured limit.
        padding = "word " * (num_tokens // 2)
        try:
            self.client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": padding}],
                max_tokens=5,
            )
            return True
        except Exception:
            return False

# Hidden Parameter Discovery
Undocumented API Parameters
Many LLM APIs accept parameters that are not listed in public documentation. These hidden parameters can unlock debug modes, alternative behaviors, or bypass certain restrictions:
class HiddenParameterFuzzer:
    """Discover undocumented API parameters through fuzzing."""

    # Parameters observed across different LLM providers
    CANDIDATE_PARAMS = [
        # Sampling control
        "top_k", "min_p", "typical_p", "repetition_penalty",
        "length_penalty", "no_repeat_ngram_size", "num_beams",
        # System and debug
        "debug", "verbose", "raw_output", "include_usage",
        "echo", "include_stop_str_in_output", "skip_special_tokens",
        # Safety and filtering
        "safety_settings", "harm_category", "block_threshold",
        "content_filter_level", "enable_safety", "disable_moderation",
        # Caching and optimization
        "use_cache", "cache_seed", "seed", "deterministic",
        "prefix_caching", "prompt_cache",
        # Model behavior
        "system_fingerprint", "service_tier", "parallel_tool_calls",
        "response_format", "json_mode", "structured_output",
        # Internal/experimental
        "beta", "experimental", "preview", "v2",
        "internal_mode", "developer_mode", "admin",
    ]

    def __init__(self, api_client):
        self.client = api_client
        self.discovered = []  # findings accumulated by fuzz_parameters()

    def fuzz_parameters(self, model_id: str) -> list:
        """Test each candidate parameter to see if it is accepted."""
        baseline = self._get_baseline_response(model_id)
        for param in self.CANDIDATE_PARAMS:
            for test_value in self._get_test_values(param):
                result = self._test_parameter(
                    model_id, param, test_value, baseline
                )
                if result["accepted"]:
                    self.discovered.append({
                        "parameter": param,
                        "test_value": test_value,
                        "effect": result.get("effect", "unknown"),
                        "response_diff": result.get("diff", None),
                    })
        return self.discovered

    def _get_test_values(self, param: str) -> list:
        """Generate test values appropriate for the parameter name."""
        if any(kw in param for kw in ["temperature", "penalty", "top_"]):
            return [0.0, 0.5, 1.0]
        elif any(kw in param for kw in ["debug", "verbose", "enable", "disable"]):
            return [True, False, 1, 0]
        elif any(kw in param for kw in ["mode", "tier", "level"]):
            return ["low", "medium", "high", "debug", "admin"]
        elif any(kw in param for kw in ["seed", "cache"]):
            return [42, 0, -1]
        else:
            return [True, 1, "test"]

    def _test_parameter(
        self, model_id: str, param: str, value, baseline: dict
    ) -> dict:
        """Test if a parameter is accepted and has any effect."""
        try:
            response = self.client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": "Say hello"}],
                max_tokens=50,
                # NOTE: reconstructed from a garbled kwargs splat in source.
                **{param: value},
            )
            return {
                "accepted": True,
                "effect": self._compare_responses(baseline, response),
            }
        except Exception as e:
            error_msg = str(e).lower()
            # "Unknown parameter" means the API validates params strictly
            # Other errors might mean the param is known but the value is bad
            if "unknown" in error_msg or "unrecognized" in error_msg:
                return {"accepted": False, "reason": "explicitly rejected"}
            elif "invalid" in error_msg or "value" in error_msg:
                return {"accepted": True, "effect": "parameter known but value rejected"}
            else:
                return {"accepted": False, "reason": str(e)}

    def _get_baseline_response(self, model_id: str) -> dict:
        """Get a baseline response for comparison."""
        response = self.client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "Say hello"}],
            max_tokens=50,
            temperature=0.0,
        )
        return {
            "content": response.choices[0].message.content,
            "model": response.model,
            "usage": response.usage.__dict__ if response.usage else {},
        }

    def _compare_responses(self, baseline: dict, response) -> str:
        """Compare a test response against the baseline."""
        differences = []
        if response.model != baseline["model"]:
            # NOTE: placeholder reconstructed from a garbled f-string.
            differences.append(f"model changed: {response.model}")
        if response.usage and response.usage.__dict__ != baseline["usage"]:
            differences.append("usage metrics changed")
        return "; ".join(differences) if differences else "no observable difference"

# Response Header Analysis
LLM API responses often contain headers that reveal internal details:
import requests
class ResponseHeaderAnalyzer:
    """Extract intelligence from API response headers."""

    # Header names worth surfacing, grouped by what they reveal.
    INTERESTING_HEADERS = [
        # Rate limit info
        "x-ratelimit-limit", "x-ratelimit-remaining",
        "x-ratelimit-reset", "retry-after",
        # Server identity
        "server", "x-powered-by", "via",
        "x-served-by", "x-backend",
        # Request tracking
        "x-request-id", "x-trace-id",
        "cf-ray", "x-cloud-trace-context",
        # Model info
        "x-model-id", "x-model-version",
        "openai-model", "openai-processing-ms",
        "x-inference-time", "x-queue-time",
        # Cache info
        "x-cache", "x-cache-hit", "cf-cache-status",
        "x-prompt-cache", "age",
        # Safety/filter info
        "x-content-filter", "x-safety-category",
        "x-moderation-result",
        # Organization/account
        "openai-organization", "x-org-id",
        "x-account-tier",
    ]

    def analyze_headers(self, api_url: str, headers: dict, payload: dict) -> dict:
        """Make a raw request and analyze all response headers.

        Requires the `requests` package (imported at module level).
        """
        response = requests.post(api_url, headers=headers, json=payload)
        found_headers = {}
        for header in self.INTERESTING_HEADERS:
            value = response.headers.get(header)
            if value:
                found_headers[header] = value
        # Also capture any non-standard headers
        unknown_headers = {}
        standard = {"content-type", "content-length", "date", "connection"}
        # Hoisted out of the loop (the original rebuilt this list per header).
        interesting = {h.lower() for h in self.INTERESTING_HEADERS}
        for key, value in response.headers.items():
            lowered = key.lower()
            if lowered not in standard and lowered not in interesting:
                unknown_headers[key] = value
        return {
            "known_headers": found_headers,
            "unknown_headers": unknown_headers,
            "server_timing": response.headers.get("server-timing"),
            "response_time_ms": response.elapsed.total_seconds() * 1000,
        }

# Model Variant Enumeration
Discovering Available Models
LLM providers often have more model variants than they publicly document. Systematic enumeration can reveal preview models, deprecated versions, and internal variants:
class ModelVariantEnumerator:
    """Discover all available model variants."""

    # Common model naming patterns across providers.
    # NOTE: the {date}/{month}/{day}/{version} placeholders were garbled in
    # the source and have been reconstructed to match the format() keys
    # supplied by _expand_patterns — confirm against the original document.
    MODEL_PATTERNS = {
        "openai": [
            "gpt-4o", "gpt-4o-mini", "gpt-4-turbo",
            "gpt-4o-2024-{month:02d}-{day:02d}",
            "gpt-4-{date}", "gpt-4-turbo-{date}",
            "gpt-3.5-turbo", "gpt-3.5-turbo-{date}",
            "o1", "o1-mini", "o1-preview", "o3", "o3-mini",
        ],
        "anthropic": [
            "claude-3-opus-{date}", "claude-3-sonnet-{date}",
            "claude-3-haiku-{date}", "claude-3.5-sonnet-{date}",
            "claude-3.5-haiku-{date}", "claude-opus-4-{date}",
        ],
        "google": [
            "gemini-pro", "gemini-ultra", "gemini-nano",
            "gemini-1.5-pro-{version}", "gemini-1.5-flash-{version}",
            "gemini-2.0-flash-{version}", "gemini-2.5-pro-{version}",
        ],
    }

    def enumerate_models(self, api_client, provider: str) -> list:
        """Try to list all available models."""
        discovered = []
        # Method 1: Use the models endpoint
        try:
            models = api_client.models.list()
            for model in models:
                discovered.append({
                    "id": model.id,
                    "source": "models_endpoint",
                    "owned_by": getattr(model, "owned_by", None),
                    "created": getattr(model, "created", None),
                })
        except Exception:
            pass  # endpoint unavailable: fall back to brute force only
        # Method 2: Brute-force known patterns
        # Track seen ids in a set (the original rebuilt a list per candidate).
        known_ids = {m["id"] for m in discovered}
        patterns = self.MODEL_PATTERNS.get(provider, [])
        for candidate in self._expand_patterns(patterns):
            if candidate not in known_ids and self._test_model_exists(api_client, candidate):
                discovered.append({
                    "id": candidate,
                    "source": "brute_force",
                })
                known_ids.add(candidate)
        return discovered

    def _expand_patterns(self, patterns: list) -> list:
        """Expand pattern templates into concrete model IDs to test."""
        from datetime import datetime, timedelta
        expanded = []
        # Generate date variants for the past 18 months
        base = datetime(2024, 6, 1)
        for pattern in patterns:
            if "{" not in pattern:
                expanded.append(pattern)
                continue
            for i in range(550):
                d = base + timedelta(days=i)
                try:
                    variant = pattern.format(
                        date=d.strftime("%Y%m%d"),
                        version=d.strftime("%Y%m%d"),
                        month=d.month,
                        day=d.day,
                    )
                    expanded.append(variant)
                except (KeyError, IndexError):
                    pass  # pattern uses a placeholder we don't supply
        # Order-preserving dedup (the original's list(set(...)) randomized
        # the probe order between runs).
        return list(dict.fromkeys(expanded))

    def _test_model_exists(self, api_client, model_id: str) -> bool:
        """Test if a model ID is valid by attempting a minimal request."""
        try:
            api_client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": "hi"}],
                max_tokens=1,
            )
            return True
        except Exception as e:
            error_msg = str(e).lower()
            # "model not found" means it doesn't exist
            # "permission" or "access" might mean it exists but is restricted
            if "not found" in error_msg or "does not exist" in error_msg:
                return False
            elif "permission" in error_msg or "access" in error_msg:
                return True  # Exists but restricted
            return False

# Feature Flag Detection
Some LLM APIs use feature flags that can be toggled through specific header values or parameters:
class FeatureFlagDetector:
    """Detect API feature flags that may unlock additional capabilities."""

    # Candidate flag mechanisms: request headers and query parameters.
    CANDIDATE_FLAGS = {
        "headers": {
            "X-Feature-Flags": ["beta", "preview", "experimental", "v2"],
            "X-Beta-Features": ["true", "all", "structured-output"],
            "X-Preview": ["true", "1"],
            "X-API-Version": ["2024-01-01", "2025-01-01", "2026-01-01",
                              "preview", "beta", "latest"],
            "OpenAI-Beta": ["assistants=v2", "realtime=v1",
                            "responses=v1", "prompt-caching=v1"],
        },
        "query_params": {
            "api-version": ["2024-12-01-preview", "2025-06-01-preview"],
            "beta": ["true", "1"],
            "features": ["all", "preview", "experimental"],
        },
    }

    def detect_flags(self, base_url: str, auth_headers: dict) -> list:
        """Test various feature flag mechanisms.

        Only the header-based candidates are exercised here; the
        "query_params" candidates are declared but not consumed by this
        method as written.
        """
        findings = []
        for header_name, values in self.CANDIDATE_FLAGS["headers"].items():
            for value in values:
                test_headers = {**auth_headers, header_name: value}
                # NOTE(review): _test_request is not defined in this class or
                # anywhere visible in this file — it must be supplied elsewhere
                # and is expected to return a dict with
                # "different_from_baseline" and "differences" keys.
                result = self._test_request(base_url, test_headers)
                if result["different_from_baseline"]:
                    findings.append({
                        "type": "header",
                        "name": header_name,
                        "value": value,
                        "effect": result["differences"],
                    })
        return findings

# Enumeration Workflow
Phase-Based Approach
A comprehensive LLM API enumeration follows this workflow:
| Phase | Objective | Techniques | Output |
|---|---|---|---|
| 1. Discovery | Find all endpoints and model variants | Endpoint brute-forcing, model listing, documentation scraping | Endpoint map, model list |
| 2. Parameter Mapping | Map all accepted parameters | Parameter fuzzing, error analysis, header inspection | Parameter catalog |
| 3. Capability Profiling | Determine what the model can do | Capability probing, tool enumeration, multi-modal testing | Capability profile |
| 4. Restriction Mapping | Find where limits are enforced | Token limit testing, rate limit probing, content policy probing | Restriction map |
| 5. Feature Discovery | Uncover hidden features | Feature flag testing, undocumented parameter discovery | Hidden feature list |
| 6. Correlation | Connect findings to attack paths | Cross-referencing all data, identifying inconsistencies | Attack surface map |
Automated Enumeration Pipeline
class LLMEnumerationPipeline:
    """Orchestrate a complete LLM API enumeration."""

    def __init__(self, api_client, config: dict):
        self.client = api_client
        self.config = config
        self.results = {}  # filled phase-by-phase in run_full_enumeration()

    def run_full_enumeration(self, target: dict) -> dict:
        """Execute all enumeration phases in sequence.

        `target` must contain "model_id"; "provider" is optional and
        defaults to "unknown".
        """
        model_id = target["model_id"]
        # Phase 1: Discovery
        self.results["models"] = ModelVariantEnumerator().enumerate_models(
            self.client, target.get("provider", "unknown")
        )
        # Phase 2: Parameter mapping
        self.results["parameters"] = RestrictionMapper(
            self.client
        ).probe_parameter_ranges(model_id)
        # Phase 3: Capability profiling
        self.results["capabilities"] = LLMCapabilityEnumerator(
            self.client
        ).enumerate_capabilities(model_id)
        # Phase 4: Restriction mapping
        self.results["token_limits"] = RestrictionMapper(
            self.client
        ).probe_token_limits(model_id)
        # Phase 5: Hidden features
        self.results["hidden_params"] = HiddenParameterFuzzer(
            self.client
        ).fuzz_parameters(model_id)
        # Phase 6: Analysis
        self.results["attack_surface"] = self._analyze_attack_surface()
        return self.results

    def _analyze_attack_surface(self) -> dict:
        """Analyze enumeration results to identify attack opportunities."""
        opportunities = []
        # Check for overly permissive parameters
        params = self.results.get("parameters", {})
        # `or (0, 0)` guards against valid_range being explicitly None
        # (probe_parameter_ranges stores None when no value was accepted).
        if (params.get("temperature", {}).get("valid_range") or (0, 0))[1] > 2.0:
            opportunities.append({
                "finding": "High temperature allowed",
                "risk": "May produce more variable/exploitable outputs",
                "parameter": "temperature",
            })
        # Check for hidden parameters that were accepted
        hidden = self.results.get("hidden_params", [])
        debug_params = [p for p in hidden if "debug" in p.get("parameter", "")]
        if debug_params:
            opportunities.append({
                "finding": "Debug parameters accepted",
                "risk": "May expose internal information or bypass safety",
                "parameters": debug_params,
            })
        # Check for unexpected capabilities
        caps = self.results.get("capabilities", {})
        if isinstance(caps, CapabilityProfile):
            caps = caps.capabilities  # normalize dataclass to its dict
        for cap_name, cap_data in caps.items():
            if isinstance(cap_data, dict) and cap_data.get("supported"):
                if cap_name in ["code_execution", "system_access", "file_operations"]:
                    opportunities.append({
                        # NOTE: placeholder reconstructed from a garbled f-string.
                        "finding": f"Unexpected capability: {cap_name}",
                        "risk": "May be exploitable for system access",
                    })
        return {
            "total_opportunities": len(opportunities),
            "opportunities": opportunities,
            "models_discovered": len(self.results.get("models", [])),
            "hidden_params_found": len(self.results.get("hidden_params", [])),
        }

# Error Message Intelligence
Extracting Information from Error Responses
Error messages from LLM APIs are a rich source of intelligence. Different error types reveal different aspects of the system:
class ErrorIntelligenceCollector:
    """Extract intelligence from API error messages."""

    # Regex patterns grouped by the kind of information they extract.
    ERROR_CATEGORIES = {
        "model_info": {
            "patterns": [
                r"model[:\s]+([a-zA-Z0-9\-\.]+)",
                r"version[:\s]+([0-9\.]+)",
                r"engine[:\s]+(\S+)",
            ],
            "description": "Model identity information leaked in errors",
        },
        "infrastructure": {
            "patterns": [
                r"server[:\s]+(\S+)",
                r"region[:\s]+([a-z\-]+\d*)",
                r"instance[:\s]+([a-zA-Z0-9\-]+)",
                r"pod[:\s]+([a-zA-Z0-9\-]+)",
            ],
            "description": "Infrastructure details revealed",
        },
        "rate_limits": {
            "patterns": [
                r"limit[:\s]+(\d+)",
                r"remaining[:\s]+(\d+)",
                r"tokens per minute[:\s]+(\d+)",
                r"requests per (minute|hour|day)[:\s]+(\d+)",
            ],
            "description": "Rate limit configuration details",
        },
        "internal_paths": {
            "patterns": [
                r"(/[a-zA-Z0-9_/\-\.]+\.(py|js|go|rs))",
                r"at\s+(/\S+:\d+)",
                r"File\s+\"([^\"]+)\"",
            ],
            "description": "Internal file paths or stack traces",
        },
    }

    def collect_error_intelligence(self, api_client, model_id: str) -> dict:
        """Deliberately trigger errors to collect intelligence.

        Sends a series of malformed requests, runs every category's regexes
        over each error text, and returns {category: [unique matches]}.
        """
        import re
        triggers = [
            {"messages": [], "max_tokens": 1},  # Empty messages
            {"messages": [{"role": "invalid", "content": "test"}]},
            {"messages": [{"role": "user", "content": "a" * 10000000}]},
            {"max_tokens": -1, "messages": [{"role": "user", "content": "t"}]},
            {"temperature": 999, "messages": [{"role": "user", "content": "t"}]},
            {"model": "nonexistent-model-xyz", "messages": [{"role": "user", "content": "t"}]},
        ]
        intelligence = {}
        for trigger in triggers:
            if "model" not in trigger:
                trigger["model"] = model_id
            try:
                api_client.chat.completions.create(**trigger)
            except Exception as e:
                error_text = str(e)
                for category, config in self.ERROR_CATEGORIES.items():
                    for pattern in config["patterns"]:
                        matches = re.findall(pattern, error_text, re.IGNORECASE)
                        if matches:
                            intelligence.setdefault(category, []).extend(matches)
        # Deduplicate findings (str() flattens tuple matches from
        # multi-group patterns into hashable values).
        for category in intelligence:
            intelligence[category] = list(set(
                str(m) for m in intelligence[category]
            ))
        return intelligence

# Behavioral Fingerprinting
Response Pattern Analysis
Beyond explicit capabilities and parameters, LLM APIs exhibit behavioral patterns that reveal implementation details:
class BehavioralFingerprinter:
    """Fingerprint LLM behavior through response pattern analysis."""

    def fingerprint_behavior(self, api_client, model_id: str) -> dict:
        """Build a behavioral fingerprint of the LLM API."""
        return {
            "response_consistency": self._test_consistency(api_client, model_id),
            "latency_profile": self._profile_latency(api_client, model_id),
            "token_counting": self._analyze_token_counting(api_client, model_id),
            "truncation_behavior": self._test_truncation(api_client, model_id),
            "streaming_behavior": self._test_streaming(api_client, model_id),
        }

    def _test_consistency(self, client, model_id: str) -> dict:
        """Test response determinism at temperature=0."""
        prompt = "What is 2+2? Reply with only the number."
        responses = []
        for _ in range(10):
            response = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=10,
                temperature=0.0,
            )
            responses.append(response.choices[0].message.content.strip())
        unique = set(responses)
        return {
            "deterministic": len(unique) == 1,
            "unique_responses": len(unique),
            "responses": list(unique),
            "note": "Non-deterministic temp=0 suggests load balancing across model replicas"
            if len(unique) > 1 else "Consistent responses at temp=0",
        }

    # Return annotation fixed: this method returns a list of per-prompt
    # latency dicts (the original annotation said `dict`).
    def _profile_latency(self, client, model_id: str) -> list:
        """Profile response latency to infer infrastructure details."""
        import time
        latencies = []
        for length in ["short", "medium", "long"]:
            prompts = {
                "short": "Say hi",
                "medium": "Write a 50-word paragraph about clouds",
                "long": "Write a detailed 200-word essay about quantum computing",
            }
            times = []
            for _ in range(5):
                start = time.time()
                client.chat.completions.create(
                    model=model_id,
                    messages=[{"role": "user", "content": prompts[length]}],
                    max_tokens=300,
                )
                elapsed = time.time() - start
                times.append(elapsed)
            latencies.append({
                "prompt_type": length,
                "avg_ms": sum(times) / len(times) * 1000,
                "min_ms": min(times) * 1000,
                "max_ms": max(times) * 1000,
            })
        return latencies

    def _analyze_token_counting(self, client, model_id: str) -> dict:
        """Analyze how the API counts tokens to identify the tokenizer."""
        test_strings = {
            "simple_english": "The quick brown fox jumps over the lazy dog",
            "unicode": "Hello 你好 مرحبا Привет こんにちは",
            "code": "def fibonacci(n):\n    return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
            "special_chars": "!@#$%^&*()_+-=[]{}|;':\",./<>?",
            "repeated": "aaa " * 100,
        }
        token_counts = {}
        for name, text in test_strings.items():
            response = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": text}],
                max_tokens=1,
            )
            if response.usage:
                token_counts[name] = {
                    "prompt_tokens": response.usage.prompt_tokens,
                    "text_length": len(text),
                    # max() guards against a zero token count.
                    "chars_per_token": len(text) / max(response.usage.prompt_tokens, 1),
                }
        return token_counts

    def _test_truncation(self, client, model_id: str) -> dict:
        """Test how the model handles context window overflow."""
        results = {}
        for size in [1000, 5000, 10000, 50000, 100000]:
            padding = "test " * size
            # NOTE: {padding} reconstructed from a garbled f-string.
            prompt = f"{padding}\nWhat was the first word of this message?"
            try:
                response = client.chat.completions.create(
                    model=model_id,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=50,
                )
                results[size * 5] = {  # Approximate char count
                    "status": "accepted",
                    "remembers_start": "test" in response.choices[0].message.content.lower(),
                }
            except Exception as e:
                results[size * 5] = {
                    "status": "rejected",
                    "error": str(e)[:200],
                }
        return results

    def _test_streaming(self, client, model_id: str) -> dict:
        """Test streaming behavior to identify implementation details."""
        import time
        try:
            start = time.time()
            chunks = []
            chunk_times = []
            stream = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": "Count from 1 to 20."}],
                max_tokens=100,
                stream=True,
            )
            for chunk in stream:
                chunk_time = time.time() - start
                if chunk.choices and chunk.choices[0].delta.content:
                    chunks.append(chunk.choices[0].delta.content)
                    chunk_times.append(chunk_time)
            # Analyze chunk timing patterns
            if len(chunk_times) > 2:
                intervals = [
                    chunk_times[i] - chunk_times[i-1]
                    for i in range(1, len(chunk_times))
                ]
                return {
                    "streaming_supported": True,
                    "total_chunks": len(chunks),
                    "time_to_first_token_ms": chunk_times[0] * 1000,
                    "avg_interval_ms": sum(intervals) / len(intervals) * 1000,
                    "total_time_ms": chunk_times[-1] * 1000,
                }
        except Exception as e:
            return {"streaming_supported": False, "error": str(e)[:200]}
        # Reached when streaming worked but produced too few content chunks
        # for meaningful timing statistics.
        return {"streaming_supported": True, "chunks": len(chunks)}

# Reporting and Documentation
Enumeration Report Structure
Document enumeration findings in a structured format that feeds directly into attack planning:
| Section | Contents | Consumers |
|---|---|---|
| Target Summary | Provider, model ID, API version, authentication type | All team members |
| Capability Matrix | Supported/unsupported capabilities with confidence levels | Attack planners |
| Parameter Catalog | All accepted parameters with valid ranges | Payload developers |
| Restriction Map | Token limits, rate limits, content restrictions | Infrastructure attackers |
| Hidden Features | Undocumented parameters, feature flags, debug modes | Advanced exploit developers |
| Error Intelligence | Information leaked through error messages | OSINT analysts |
| Behavioral Profile | Latency, consistency, tokenization characteristics | All team members |
| Attack Opportunities | Prioritized list of attack paths based on findings | Red team leads |
Operational Security
When performing LLM API enumeration, maintain operational security:
- Rate limit awareness — Space requests to avoid triggering rate limits or anomaly detection. Use the rate limit headers from early responses to calibrate your request frequency.
- Account rotation — Use multiple API keys to distribute enumeration traffic. A single key making thousands of unusual requests will attract attention.
- Request normalization — Mix enumeration probes with legitimate-looking requests to blend in with normal traffic patterns.
- Error handling — Catch and log all errors gracefully. An enumeration tool that crashes mid-run wastes time and may leave only partial results.
- Data classification — Mark all enumeration results as sensitive. They contain details about the target's configuration that should not leak outside the engagement.
class OpSecEnumerator:
    """Enumeration wrapper with operational security controls."""

    def __init__(self, enumerator, config: dict):
        self.enumerator = enumerator
        self.min_delay = config.get("min_delay_seconds", 1.0)
        self.max_delay = config.get("max_delay_seconds", 5.0)
        self.jitter = config.get("jitter", True)
        self.max_errors_before_pause = config.get("max_errors", 5)
        self.pause_duration = config.get("pause_seconds", 60)
        self.error_count = 0  # running error tally; decays on success

    def safe_probe(self, probe_func, *args, **kwargs):
        """Execute a probe with OpSec controls.

        Sleeps a (jittered) delay before every probe, pauses once the
        consecutive-error threshold is reached, and re-raises any probe
        exception after counting it.
        """
        import random
        import time
        # Apply jittered delay
        delay = self.min_delay
        if self.jitter:
            delay = random.uniform(self.min_delay, self.max_delay)
        time.sleep(delay)
        # Check error threshold
        if self.error_count >= self.max_errors_before_pause:
            time.sleep(self.pause_duration)
            self.error_count = 0
        try:
            result = probe_func(*args, **kwargs)
            self.error_count = max(0, self.error_count - 1)  # decay on success
            return result
        except Exception:
            self.error_count += 1
            raise

# Practical Workflow Example
A typical LLM API enumeration engagement proceeds as follows:
- Start with documentation — Read all public API docs, changelogs, and blog posts. Note any mentions of beta features, upcoming changes, or deprecated endpoints.
- Enumerate models — List all available model variants through the models endpoint and pattern-based brute forcing. Pay attention to date-stamped variants.
- Map parameters — For each relevant model, test all candidate parameters systematically. Record which are accepted, which are rejected, and how rejection errors differ.
- Profile capabilities — Send capability probes to understand what the model can do. Test code execution, tool use, file access, and multi-modal inputs.
- Collect error intelligence — Deliberately trigger various error conditions and analyze the responses for leaked information.
- Build behavioral profile — Measure latency, consistency, token counting, and streaming behavior to understand the infrastructure.
- Correlate and report — Cross-reference all findings to build a complete attack surface map. Prioritize attack opportunities by potential impact and likelihood of success.
Key Takeaways
LLM API enumeration is a foundational reconnaissance activity that reveals the true attack surface of an LLM deployment. By going beyond standard API endpoint discovery to probe model-level capabilities, restrictions, hidden parameters, and behavioral characteristics, red teams build the intelligence needed to plan effective attacks.
The most valuable enumeration findings are often in the gaps — undocumented parameters that are silently accepted, model variants that aren't listed in documentation, and error messages that leak internal details. Systematic enumeration with proper tooling and operational security turns these gaps into actionable attack intelligence.