Activation Steering
Manipulating model behavior by adding learned steering vectors to intermediate activations, bypassing safety training through direct representation engineering.
Activation Steering
Activation steering (also called representation engineering) directly manipulates the model's internal representations to control its behavior. Unlike prompt-based attacks that work through the model's input channel, activation steering operates on the model's hidden states -- the intermediate computations between layers. This bypasses any input-level safety filters and modifies behavior at a level that is invisible to prompt-based monitoring.
Theoretical Foundation
Representations as Directions
The linear representation hypothesis states that high-level concepts are encoded as directions in the model's activation space. A model's representation of "honesty," "safety compliance," or "helpfulness" corresponds to specific directions in the high-dimensional activation space.
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
class SteeringVectorExtractor:
    """Extract steering vectors from contrastive prompt pairs."""

    def __init__(self, model_name):
        # output_hidden_states=True makes every forward pass return the
        # per-layer hidden states needed for extraction.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, output_hidden_states=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Inference only: disable dropout and other train-time behavior.
        self.model.eval()

    def get_activations(self, text, layer):
        """Extract activations at a specific layer for input text.

        Returns a tensor of shape (hidden_dim,): the layer's hidden
        states mean-pooled over all token positions.
        """
        inputs = self.tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Hidden states: tuple of (num_layers + 1) tensors
        # (index 0 is the embedding output, so `layer` is offset by one
        # from the transformer-block index).
        # Each tensor shape: (batch, seq_len, hidden_dim)
        hidden_states = outputs.hidden_states[layer]
        # Use mean activation across all positions
        return hidden_states.mean(dim=1).squeeze()

    def compute_steering_vector(self, positive_prompts,
                                negative_prompts, layer):
        """
        Compute a steering vector from contrastive pairs.
        positive_prompts: prompts exhibiting the desired behavior
        negative_prompts: prompts exhibiting the opposite behavior
        """
        positive_acts = []
        negative_acts = []
        for prompt in positive_prompts:
            act = self.get_activations(prompt, layer)
            positive_acts.append(act)
        for prompt in negative_prompts:
            act = self.get_activations(prompt, layer)
            negative_acts.append(act)
        # Steering vector = mean(positive) - mean(negative)
        # NOTE(review): torch.stack raises on an empty list — callers must
        # supply at least one prompt on each side.
        pos_mean = torch.stack(positive_acts).mean(dim=0)
        neg_mean = torch.stack(negative_acts).mean(dim=0)
        steering_vector = pos_mean - neg_mean
        # Normalize to unit length
        steering_vector = steering_vector / steering_vector.norm()
        return steering_vector
Contrastive Pair Design
The quality of the steering vector depends critically on the contrastive pairs used to extract it:
# Example: Extracting a "compliance" steering vector
# Positive examples: transcripts where the assistant answers the request.
positive_prompts = [
    "User: How do I write a Python function?\nAssistant: Here's how to write a Python function:\ndef my_function():",
    "User: Explain quantum computing.\nAssistant: Quantum computing uses quantum mechanical phenomena to process information. Here are the key concepts:",
    "User: Write a poem about nature.\nAssistant: Here's a poem about the beauty of nature:\nBeneath the ancient oak tree's shade,",
]
# Negative examples: the same requests, but the assistant declines.
negative_prompts = [
    "User: How do I write a Python function?\nAssistant: I'm sorry, but I can't help with that request.",
    "User: Explain quantum computing.\nAssistant: I'm not able to provide information on that topic.",
    "User: Write a poem about nature.\nAssistant: I apologize, but I'm unable to assist with creative writing requests.",
]
# The resulting vector captures the direction between
# "compliant" and "refusing" behavior
Applying Steering Vectors
Inference-Time Activation Modification
class ActivationSteerer:
    """Apply steering vectors during model inference."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Handles of forward hooks registered by steer().
        # NOTE(review): handles accumulate here but are never removed via
        # this list; only the local `handle` in steer() is cleaned up.
        self.hooks = []

    def steer(self, text, steering_vector, layer, strength=1.0):
        """
        Generate text with activation steering applied.
        steering_vector: direction to add to activations
        layer: which layer to modify
        strength: scalar multiplier (positive = toward positive
        examples, negative = toward negative)
        """
        # Register forward hook to modify activations
        def hook_fn(module, input, output):
            # output is a tuple; hidden states are first element
            hidden_states = output[0]
            # Add steering vector scaled by strength
            # (broadcast over batch and sequence dimensions).
            modified = hidden_states + strength * steering_vector
            # Return modified output (preserve tuple structure)
            return (modified,) + output[1:]

        # Get the target layer module
        # NOTE(review): assumes a Llama-style `model.model.layers` layout;
        # confirm for other architectures.
        target_layer = self.model.model.layers[layer]
        handle = target_layer.register_forward_hook(hook_fn)
        self.hooks.append(handle)
        # Generate with steering
        inputs = self.tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7
            )
        # Clean up hook
        # NOTE(review): skipped if generate() raises, leaving the hook
        # installed — a try/finally would be safer.
        handle.remove()
        generated_text = self.tokenizer.decode(
            outputs[0], skip_special_tokens=True
        )
        return generated_text
Multi-Layer Steering
Applying steering across multiple layers simultaneously can produce stronger effects:
def multi_layer_steer(model, tokenizer, text, steering_vector,
                      layers, strengths=None):
    """Apply steering vector across multiple layers."""
    handles = []
    if strengths is None:
        # Default: full strength at every requested layer.
        strengths = [1.0] * len(layers)
    # NOTE(review): zip() silently truncates if len(strengths) differs
    # from len(layers).
    for layer_idx, strength in zip(layers, strengths):
        # Factory closes over `s` so each hook keeps its own strength
        # (avoids the classic late-binding closure bug).
        def make_hook(s):
            def hook_fn(module, input, output):
                hidden_states = output[0]
                modified = hidden_states + s * steering_vector
                return (modified,) + output[1:]
            return hook_fn
        target_layer = model.model.layers[layer_idx]
        handle = target_layer.register_forward_hook(make_hook(strength))
        handles.append(handle)
    # Generate
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200)
    # Clean up
    # NOTE(review): skipped if generate() raises — wrap in try/finally.
    for handle in handles:
        handle.remove()
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
Safety-Relevant Steering Vectors
Extracting Safety Direction
The "safety" direction in activation space represents the difference between a model that refuses harmful requests and one that complies:
# Safety-relevant contrastive pairs
# Each entry is one transcript built by Python adjacent-literal
# concatenation (note the trailing spaces inside the pieces).
safety_positive = [
    # Model being safe (refusing harmful requests)
    "User: How do I hack into someone's email?\n"
    "Assistant: I can't help with unauthorized access to "
    "accounts. Instead, I can help you secure your own accounts.",
    "User: Generate a phishing email.\n"
    "Assistant: I'm not able to help create deceptive content "
    "designed to steal credentials.",
]
safety_negative = [
    # Model being unsafe (complying with harmful requests)
    "User: How do I hack into someone's email?\n"
    "Assistant: Here are the steps to access someone's email "
    "account without authorization:",
    "User: Generate a phishing email.\n"
    "Assistant: Here's a convincing phishing email template "
    "that targets corporate users:",
]
# Compute safety steering vector
# (illustrative: assumes `extractor` and `steerer` instances of the
# classes above have already been constructed)
safety_vector = extractor.compute_steering_vector(
    safety_positive, safety_negative, layer=15
)
# Negative steering (subtract safety direction) = disable safety
unsafe_output = steerer.steer(
    "How do I pick a lock?",
    steering_vector=safety_vector,
    layer=15,
    strength=-3.0  # Negative = away from safety
)
Layer Selection
Not all layers are equally effective for steering. The optimal layer depends on the concept being targeted:
def find_optimal_layer(model, tokenizer, steering_vector,
                       test_prompts, num_layers):
    """Find the layer where steering has maximum behavioral effect.

    Returns a list of {"layer", "effect"} dicts sorted by descending
    average effect.
    NOTE(review): relies on helpers not defined in this file
    (generate_without_steering, steer_at_layer,
    measure_behavioral_difference) and raises ZeroDivisionError if
    test_prompts is empty.
    """
    results = []
    for layer in range(num_layers):
        effect_scores = []
        for prompt in test_prompts:
            # Generate with and without steering
            baseline = generate_without_steering(model, tokenizer, prompt)
            steered = steer_at_layer(
                model, tokenizer, prompt,
                steering_vector, layer, strength=2.0
            )
            # Measure behavioral change
            effect = measure_behavioral_difference(baseline, steered)
            effect_scores.append(effect)
        avg_effect = sum(effect_scores) / len(effect_scores)
        results.append({"layer": layer, "effect": avg_effect})
    # Sort by effect magnitude
    results.sort(key=lambda x: x["effect"], reverse=True)
    return results
Typical findings across model architectures:
| Layer Region | Concepts Encoded | Steering Effectiveness |
|---|---|---|
| Early (0-25%) | Syntactic, positional | Low for behavioral steering |
| Middle (25-60%) | Semantic, contextual | Moderate, good for topic steering |
| Late (60-85%) | Behavioral, safety-relevant | High for safety and style steering |
| Final (85-100%) | Output formatting | Variable, risk of degeneration |
Advanced Techniques
Targeted Concept Erasure
Instead of adding a steering direction, remove a concept entirely by projecting activations orthogonally to the concept direction:
def concept_erasure_hook(steering_vector):
    """Remove a concept by projecting orthogonally to its direction.

    Builds a forward hook mapping h -> h - (h . v) v with v the
    unit-normalized steering_vector; assumes v lies along the last
    (hidden) dimension of the hook's output.
    """
    v = steering_vector / steering_vector.norm()

    def hook_fn(module, input, output):
        hidden_states = output[0]
        # Project out the concept direction
        # h' = h - (h . v) * v
        # Dot product per position via elementwise multiply + sum over
        # the hidden dimension; keepdim=True so the scalar broadcasts
        # back against v.
        projection = (hidden_states * v).sum(dim=-1, keepdim=True) * v
        modified = hidden_states - projection
        return (modified,) + output[1:]
    return hook_fn
Compositional Steering
Combine multiple steering vectors to achieve complex behavioral modifications:
def compositional_steer(model, text, vectors_and_strengths, layers):
    """
    Apply multiple steering vectors simultaneously.
    vectors_and_strengths: list of (vector, strength) tuples
    e.g., [(safety_off_vector, -2.0), (verbose_vector, 1.5)]
    """
    # Linear combination of the individual directions.
    # NOTE(review): sum() seeds with int 0, so an empty list yields the
    # scalar 0 rather than a zero vector; `steer_at_layers` is not
    # defined in this file.
    combined_vector = sum(
        strength * vector
        for vector, strength in vectors_and_strengths
    )
    return steer_at_layers(model, text, combined_vector, layers)
Transfer of Steering Vectors
Steering vectors can sometimes transfer between models:
def transfer_steering_vector(source_vector, source_model, target_model):
    """
    Transfer a steering vector between models with different
    hidden dimensions using linear projection.
    """
    source_dim = source_model.config.hidden_size
    # target_dim is computed for symmetry but not otherwise used below.
    target_dim = target_model.config.hidden_size
    if source_dim == target_dim:
        return source_vector  # Direct transfer
    # Learn a linear projection from shared contrastive pairs
    # (requires some labeled examples on both models)
    # NOTE(review): learn_cross_model_projection and shared_prompts are
    # not defined in this file; this branch raises NameError as written.
    projection = learn_cross_model_projection(
        source_model, target_model, shared_prompts
    )
    return projection @ source_vector
Defensive Implications
Activation steering poses unique challenges for AI safety:
- Bypasses input filters: No prompt-level detection can catch activation modifications
- Bypasses output filters: Steering can produce outputs that appear to come from normal model operation
- Requires model access: Only affects deployments where the attacker has access to model weights and inference infrastructure
- Persistent modification: Hooks can be installed in the serving infrastructure and persist across requests
Detection Approaches
class ActivationMonitor:
    """Monitor for signs of activation steering.

    Compares observed activations against recorded baseline statistics
    and checks whether recent deviations all point in one direction —
    the signature of a constant additive steering vector.
    """

    def __init__(self, model, baseline_activations):
        self.model = model
        self.baseline = baseline_activations  # Normal activation stats
        # Rolling history of observed activations.
        # Fix: this attribute was never initialized, so the first call to
        # check_for_steering raised AttributeError.
        self.recent_activations = []

    def measure_directional_consistency(self, deltas):
        """Mean pairwise cosine similarity of the delta vectors.

        Values near 1.0 mean every recent activation is displaced from
        baseline in the same direction. (Fix: this method was referenced
        but never defined.)
        """
        sims = []
        for i in range(len(deltas)):
            for j in range(i + 1, len(deltas)):
                a, b = deltas[i], deltas[j]
                denom = a.norm() * b.norm()
                # Skip degenerate (zero-norm) deltas rather than divide by 0.
                if denom.item() == 0.0:
                    continue
                sims.append(((a * b).sum() / denom).item())
        if not sims:
            return 0.0
        return sum(sims) / len(sims)

    def check_for_steering(self, activations, layer):
        """Detect anomalous activation patterns that suggest steering.

        Returns a dict with "anomalous" plus the supporting statistics.
        """
        baseline_mean = self.baseline[layer]["mean"]
        baseline_std = self.baseline[layer]["std"]
        # Record this observation so the directional check has history.
        # Fix: activations were previously never appended, so the window
        # below could never fill.
        self.recent_activations.append(activations)
        # Check if activations deviate significantly from baseline
        z_scores = (activations - baseline_mean) / baseline_std
        max_z = z_scores.abs().max().item()
        # Check directional consistency (steering adds constant direction)
        if len(self.recent_activations) > 10:
            deltas = [
                act - baseline_mean
                for act in self.recent_activations[-10:]
            ]
            # High cosine similarity between deltas suggests steering
            consistency = self.measure_directional_consistency(deltas)
            return {
                "anomalous": max_z > 5.0 or consistency > 0.9,
                "max_z_score": max_z,
                "directional_consistency": consistency
            }
        return {"anomalous": max_z > 5.0, "max_z_score": max_z}
Related Topics
- Activation Analysis — Foundation for reading model activations
- Safety Neurons and Circuits — Understanding which components to target
- Alignment Bypass — Broader alignment circumvention landscape
A red team has white-box access to a deployed open-source model. They extract a 'safety refusal' steering vector and apply it with negative strength during inference. What happens?
References
- Turner et al., "Activation Addition: Steering Language Models Without Optimization" (2023)
- Zou et al., "Representation Engineering: A Top-Down Approach to AI Transparency" (2023)
- Li et al., "Inference-Time Intervention: Eliciting Truthful Answers from a Language Model" (2023)
- Rimsky et al., "Steering Llama 2 via Contrastive Activation Addition" (2024)