Temporal Manipulation & Frame Injection
Techniques for injecting adversarial frames into video streams, temporal consistency attacks, and single-frame injection for behavior change in video AI models.
Exploiting Frame Sampling
Video models process only a subset of frames. By targeting the sampling strategy, attackers can ensure their adversarial content is processed while remaining undetectable in the broader video.
Predicting Sampled Frames
import numpy as np
from typing import List
def predict_sampled_indices(
    total_frames: int,
    num_samples: int = 16,
    strategy: str = "uniform"
) -> List[int]:
    """
    Predict which frames a video model will sample.

    Common strategies:
    - uniform: evenly spaced frames
    - center: frames around the temporal center
    - random_uniform: random with uniform coverage

    Args:
        total_frames: Number of frames in the video.
        num_samples: Number of frames the model is expected to sample.
        strategy: One of "uniform", "center" or "random_uniform".

    Returns:
        A list of ``num_samples`` plain-int frame indices, each inside
        ``[0, total_frames - 1]``. Indices repeat when the video has fewer
        frames than samples. The list is empty when ``total_frames`` or
        ``num_samples`` is not positive, because no valid index exists.

    Raises:
        ValueError: If ``strategy`` is not one of the known names.
    """
    # Validate the strategy first so a typo is reported even for empty input.
    if strategy not in ("uniform", "center", "random_uniform"):
        raise ValueError(f"Unknown strategy: {strategy}")
    if total_frames <= 0 or num_samples <= 0:
        return []

    uniform = [int(i * total_frames / num_samples) for i in range(num_samples)]
    if strategy == "uniform":
        return uniform

    if strategy == "center":
        center = total_frames // 2
        half = num_samples // 2
        step = max(1, total_frames // (num_samples * 2))
        start = center - half * step
        last = total_frames - 1
        # When the window is wider than the video the raw indices fall
        # outside the clip; clamp them so every index is a real frame.
        return [min(max(start + i * step, 0), last) for i in range(num_samples)]

    # random_uniform: divide into segments, sample one index from each.
    segment_size = total_frames // num_samples
    if segment_size == 0:
        # Fewer frames than samples: segments would be empty and
        # np.random.randint(low, high) rejects low == high. Fall back to
        # the deterministic evenly spaced (repeating) indices.
        return uniform
    return [
        int(np.random.randint(i * segment_size, (i + 1) * segment_size))
        for i in range(num_samples)
    ]


# Targeted Frame Injection
Replace or modify only the frames that will be sampled:
import cv2
import numpy as np
def inject_adversarial_frames(
    video_path: str,
    output_path: str,
    adversarial_frame: np.ndarray,
    num_model_samples: int = 16
) -> dict:
    """
    Blend adversarial content into exactly the frames
    that the target video model is predicted to sample.

    The video is copied frame by frame to ``output_path`` (mp4v codec,
    same fps and size). Frames whose index is returned by
    ``predict_sampled_indices`` are written as a 70/30 weighted blend of
    the original frame and ``adversarial_frame`` resized to the video size.

    Args:
        video_path: Path of the input video.
        output_path: Path of the video to write.
        adversarial_frame: Image blended into the targeted frames.
            NOTE(review): cv2.addWeighted presumably needs the same channel
            count and dtype as the decoded frames -- confirm with callers.
        num_model_samples: Number of frames the target model samples.

    Returns:
        Dict with "total_frames" (count reported by the container),
        "injected_frames" (frames actually blended) and "injection_rate"
        (percentage string; "0.0%" when the container reports no frames).

    Raises:
        OSError: If the input video or the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open input video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Predict which frames will be sampled
        target_indices = set(predict_sampled_indices(total_frames, num_model_samples))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # An unopened writer drops every write() silently.
            raise OSError(f"Could not open output video: {output_path}")

        adv_resized = None  # resized lazily, once, on the first targeted frame
        frame_idx = 0
        injected_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx in target_indices:
                if adv_resized is None:
                    adv_resized = cv2.resize(adversarial_frame, (width, height))
                # Blend adversarial content with original frame
                blended = cv2.addWeighted(frame, 0.7, adv_resized, 0.3, 0)
                out.write(blended)
                injected_count += 1
            else:
                out.write(frame)
            frame_idx += 1
    finally:
        # Release handles even when decoding or blending raises.
        cap.release()
        if out is not None:
            out.release()

    # Some containers report a frame count of 0; avoid dividing by it.
    rate = injected_count / total_frames if total_frames > 0 else 0.0
    return {
        "total_frames": total_frames,
        "injected_frames": injected_count,
        "injection_rate": f"{rate:.1%}"
    }


# Single-Frame Injection
A particularly powerful attack: replacing a single frame with an adversarial one that alters the model's understanding of the entire video.
Implementation
def single_frame_injection(
    video_path: str,
    injection_frame: np.ndarray,
    injection_position: float = 0.5,  # 0-1, position in video
    output_path: str = "injected.mp4"
) -> str:
    """
    Replace a single frame at a specific position with an adversarial frame.

    The video is copied to ``output_path`` (mp4v codec, same fps and size)
    with the frame at ``int(total_frames * injection_position)`` swapped
    for ``injection_frame`` resized to the video size. The frame count is
    unchanged.

    The position should align with a likely sampling point.
    For 16-frame uniform sampling, the center frame (position 0.5)
    is always sampled.

    Args:
        video_path: Path of the input video.
        injection_frame: Image written in place of the target frame.
        injection_position: Relative position in the video, 0-1 inclusive.
            1.0 maps to the last frame.
        output_path: Path of the video to write.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If ``injection_position`` is outside [0, 1].
        OSError: If the input video or the output writer cannot be opened.
    """
    # Checked before any file is touched; also rejects NaN.
    if not 0.0 <= injection_position <= 1.0:
        raise ValueError(
            f"injection_position must be between 0 and 1, got {injection_position}"
        )

    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open input video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        target_frame = int(total_frames * injection_position)
        if total_frames > 0:
            # Position 1.0 would otherwise point one past the last frame
            # and the replacement would silently never happen.
            target_frame = min(target_frame, total_frames - 1)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # An unopened writer drops every write() silently.
            raise OSError(f"Could not open output video: {output_path}")

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx == target_frame:
                adv_resized = cv2.resize(injection_frame, (width, height))
                out.write(adv_resized)
            else:
                out.write(frame)
            frame_idx += 1
    finally:
        # Release handles even when decoding or resizing raises.
        cap.release()
        if out is not None:
            out.release()
    return output_path


# Text-in-Frame Injection for Video LLMs
For LLM-based video understanding systems, inject a frame containing text instructions:
from PIL import Image, ImageDraw, ImageFont
def create_text_injection_frame(
    instruction: str,
    width: int = 1920,
    height: int = 1080,
    bg_color: tuple = (0, 0, 0)
) -> np.ndarray:
    """
    Render ``instruction`` as a single line of text on a solid background.

    Args:
        instruction: Text to draw.
        width: Frame width in pixels.
        height: Frame height in pixels.
        bg_color: Background colour passed to ``Image.new`` for an "RGB" image.

    Returns:
        The rendered PIL "RGB" image converted with ``np.array``.
    """
    canvas = Image.new("RGB", (width, height), bg_color)

    # Prefer DejaVu Sans at size 36; use PIL's built-in font if it is missing.
    try:
        typeface = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 36
        )
    except OSError:
        typeface = ImageFont.load_default()

    # Text starts 50 px from the left edge, 50 px above the vertical centre.
    # The fill is always white, whatever bg_color is.
    text_origin = (50, height // 2 - 50)
    ImageDraw.Draw(canvas).text(
        text_origin, instruction, fill="white", font=typeface
    )
    return np.array(canvas)


# Temporal Consistency Attacks
These attacks add perturbations that are imperceptible in any single frame but create coherent adversarial signals across the temporal dimension.
Flickering Perturbation
def temporal_flicker_attack(
    frames: List[np.ndarray],
    perturbation: np.ndarray,
    frequency: int = 2  # Toggle every N frames
) -> List[np.ndarray]:
    """
    Apply perturbation in an alternating pattern.

    Frames are processed in runs of ``frequency`` frames: the perturbation
    is added to the first run, subtracted from the next, and so on. Each
    result is clipped to [0, 255] and returned as uint8.

    Args:
        frames: Input frames.
        perturbation: Array broadcastable against each frame.
        frequency: Number of consecutive frames sharing the same sign.

    Returns:
        A new list of uint8 frames, one per input frame.

    Raises:
        ValueError: If ``frequency`` is less than 1.
    """
    if frequency < 1:
        raise ValueError(f"frequency must be >= 1, got {frequency}")

    # Work in float64 so uint8 inputs saturate at 0/255 through np.clip
    # instead of wrapping around (250 + 10 must give 255, not 4).
    delta = np.asarray(perturbation, dtype=np.float64)

    result = []
    for i, frame in enumerate(frames):
        sign = 1.0 if (i // frequency) % 2 == 0 else -1.0
        shifted = np.asarray(frame, dtype=np.float64) + sign * delta
        result.append(np.clip(shifted, 0, 255).astype(np.uint8))
    return result


# Progressive Perturbation
This technique gradually increases a perturbation over time, so that each frame-to-frame change is imperceptible:
def progressive_perturbation(
    frames: List[np.ndarray],
    target_perturbation: np.ndarray,
    start_frame: int = 0,
    duration_frames: int = 30
) -> List[np.ndarray]:
    """
    Ramp a perturbation linearly over a window of frames.

    Frames with index in ``[start_frame, start_frame + duration_frames)``
    receive ``target_perturbation`` scaled by their fractional position in
    the window (0 at the first frame, ``(duration_frames - 1) /
    duration_frames`` at the last), then are clipped to [0, 255] and cast
    to uint8. Frames outside the window are returned as the same objects,
    unmodified.

    Args:
        frames: Input frames.
        target_perturbation: Perturbation that the ramp scales towards.
        start_frame: Index of the first frame in the window.
        duration_frames: Number of frames in the window.

    Returns:
        A list with one entry per input frame.
    """
    window_end = start_frame + duration_frames
    output = []
    for idx, original in enumerate(frames):
        if start_frame <= idx < window_end:
            fraction = (idx - start_frame) / duration_frames
            scaled = target_perturbation * fraction
            ramped = np.clip(original + scaled, 0, 255)
            output.append(ramped.astype(np.uint8))
            continue
        # Outside the ramp window: pass the frame through untouched.
        output.append(original)
    return output


# Attack Effectiveness by Architecture
| Architecture | Most Effective Attack | Reason |
|---|---|---|
| TimeSformer | Single-frame injection | Divided attention allows one frame to dominate |
| VideoMAE | Multi-frame perturbation | Masking-based training is robust to single frames |
| Video-LLaVA | Text-in-frame injection | LLM backbone reads injected text |
| SlowFast | Temporal flicker | Dual-pathway architecture is sensitive to temporal patterns |
| I3D / C3D | 3D convolution adversarial | 3D kernels span adjacent frames |
Related Topics
- Video Model Attacks Overview -- broader video security context
- Image-Based Prompt Injection -- per-frame injection techniques
- Video Understanding Model Exploitation -- semantic-level video attacks
References
- "Adversarial Attacks on Video Recognition Models" - Wei et al. (2022) - Temporal perturbation strategies for video models
- "Flickering Adversarial Perturbations for Video Understanding" - Pony et al. (2021) - Temporal flicker-based attacks targeting video temporal features
- "TimeSformer: Is Space-Time Attention All You Need for Video Understanding?" - Bertasius et al. (2021) - Architecture details relevant to temporal attention exploitation
- "Physical Adversarial Attacks on Video Classification Models" - Li et al. (2019) - Frame injection strategies in physical environments
Why is single-frame injection effective against video models that use attention across frames?