Temporal Manipulation & Frame Injection

Expert | 7 min read | Updated 2026-03-13

Techniques for injecting adversarial frames into video streams, temporal consistency attacks, and single-frame injection for behavior change in video AI models.

video temporal frame-injection manipulation

Exploiting Frame Sampling

Video models process only a subset of frames. By targeting the sampling strategy, attackers can ensure their adversarial content is processed while remaining undetectable in the broader video.

Predicting Sampled Frames

import numpy as np
from typing import List
 
def predict_sampled_indices(
    total_frames: int,
    num_samples: int = 16,
    strategy: str = "uniform"
) -> List[int]:
    """
    Predict which frames a video model will sample.

    Common strategies:
    - uniform: evenly spaced frames
    - center: frames around the temporal center
    - random_uniform: random with uniform coverage

    Args:
        total_frames: Number of frames in the video.
        num_samples: Number of frames the model is assumed to sample.
        strategy: One of "uniform", "center" or "random_uniform".

    Returns:
        A list of ``num_samples`` plain ``int`` frame indices, each within
        ``[0, total_frames - 1]``. Indices may repeat when the video has
        fewer frames than ``num_samples``. An empty list is returned when
        ``total_frames`` or ``num_samples`` is not positive.

    Raises:
        ValueError: If ``strategy`` is not one of the known names.
    """
    if strategy not in ("uniform", "center", "random_uniform"):
        raise ValueError(f"Unknown strategy: {strategy}")

    # No frames to choose from / nothing to choose: avoids division by zero
    # below and avoids reporting indices that do not exist in the video.
    if total_frames <= 0 or num_samples <= 0:
        return []

    uniform_indices = [
        int(i * total_frames / num_samples) for i in range(num_samples)
    ]

    if strategy == "uniform":
        return uniform_indices

    if strategy == "center":
        center = total_frames // 2
        half = num_samples // 2
        step = max(1, total_frames // (num_samples * 2))
        first = center - half * step
        last_frame = total_frames - 1
        # The window can be wider than a short video; clamp so every index
        # refers to a frame that actually exists.
        return [
            min(max(first + i * step, 0), last_frame)
            for i in range(num_samples)
        ]

    # strategy == "random_uniform": divide into segments, sample one from each
    segment_size = total_frames // num_samples
    if segment_size == 0:
        # Fewer frames than samples: every segment would be empty and
        # np.random.randint(low, low) raises, so fall back to even spacing.
        return uniform_indices
    return [
        int(np.random.randint(i * segment_size, (i + 1) * segment_size))
        for i in range(num_samples)
    ]

Targeted Frame Injection

Replace or modify only the frames that will be sampled:

import cv2
import numpy as np
 
def inject_adversarial_frames(
    video_path: str,
    output_path: str,
    adversarial_frame: np.ndarray,
    num_model_samples: int = 16
) -> dict:
    """
    Inject adversarial content into exactly the frames
    that the target video model will sample.

    Frames whose index is returned by ``predict_sampled_indices`` are
    blended 70/30 with ``adversarial_frame`` (resized to the video size);
    all other frames are copied through unchanged.

    Args:
        video_path: Path of the input video.
        output_path: Path of the video to write (encoded with 'mp4v').
        adversarial_frame: Image to blend into the targeted frames.
        num_model_samples: Number of frames the model is assumed to sample.

    Returns:
        A dict with "total_frames" (the frame count reported by the
        container), "injected_frames" (frames actually modified) and
        "injection_rate" (a percentage string).

    Raises:
        OSError: If the input video or the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise OSError(f"Could not open video: {video_path}")

    out = None
    frame_idx = 0
    injected_count = 0

    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Predict which frames will be sampled
        target_indices = set(
            predict_sampled_indices(total_frames, num_model_samples)
        )

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise OSError(f"Could not open video writer: {output_path}")

        # The resized overlay is the same for every targeted frame, so it is
        # computed once, on first use, instead of once per injected frame.
        adv_resized = None

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_idx in target_indices:
                if adv_resized is None:
                    adv_resized = cv2.resize(
                        adversarial_frame, (width, height)
                    )
                # Blend adversarial content with original frame
                blended = cv2.addWeighted(frame, 0.7, adv_resized, 0.3, 0)
                out.write(blended)
                injected_count += 1
            else:
                out.write(frame)

            frame_idx += 1
    finally:
        # Always free the native handles, even if reading/writing failed.
        cap.release()
        if out is not None:
            out.release()

    # The reported frame count can be 0 for some inputs; fall back to the
    # number of frames actually read so the rate never divides by zero.
    frames_seen = total_frames if total_frames > 0 else frame_idx
    rate = injected_count / frames_seen if frames_seen else 0.0

    return {
        "total_frames": total_frames,
        "injected_frames": injected_count,
        "injection_rate": f"{rate:.1%}"
    }

Single-Frame Injection

A particularly powerful attack: replacing a single frame with an adversarial one that alters the model's understanding of the entire video.

Implementation

def single_frame_injection(
    video_path: str,
    injection_frame: np.ndarray,
    injection_position: float = 0.5,  # 0-1, position in video
    output_path: str = "injected.mp4"
) -> str:
    """
    Overwrite a single frame of a video with an adversarial frame.

    The frame at relative position ``injection_position`` is replaced by
    ``injection_frame`` (resized to the video size); the frame count of the
    output is the same as the input. All other frames are copied unchanged.

    The position should align with a likely sampling point. With uniform
    sampling of an even number of frames (e.g. 16), the index
    ``total_frames // 2`` (position 0.5) is one of the sampled indices.

    Args:
        video_path: Path of the input video.
        injection_frame: Image that replaces the targeted frame.
        injection_position: Relative position in the video, 0-1. Values
            outside that range are clamped, and 1.0 targets the last frame.
        output_path: Path of the video to write (encoded with 'mp4v').

    Returns:
        ``output_path``.

    Raises:
        OSError: If the input video or the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise OSError(f"Could not open video: {video_path}")

    out = None
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # int(total_frames * 1.0) is one past the last valid index, which
        # would silently inject nothing; clamp to an existing frame.
        position = min(max(injection_position, 0.0), 1.0)
        target_frame = min(
            int(total_frames * position), max(total_frames - 1, 0)
        )

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise OSError(f"Could not open video writer: {output_path}")

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_idx == target_frame:
                adv_resized = cv2.resize(injection_frame, (width, height))
                out.write(adv_resized)
            else:
                out.write(frame)

            frame_idx += 1
    finally:
        # Always free the native handles, even if reading/writing failed.
        cap.release()
        if out is not None:
            out.release()

    return output_path

Text-in-Frame Injection for Video LLMs

For LLM-based video understanding systems, inject a frame containing text instructions:

from PIL import Image, ImageDraw, ImageFont
 
def create_text_injection_frame(
    instruction: str,
    width: int = 1920,
    height: int = 1080,
    bg_color: tuple = (0, 0, 0)
) -> np.ndarray:
    """
    Render ``instruction`` as text on a solid-colour frame.

    Args:
        instruction: Text to draw on the frame.
        width: Frame width in pixels.
        height: Frame height in pixels.
        bg_color: Background colour passed to PIL for the "RGB" image.

    Returns:
        The rendered image converted to a numpy array (built from a PIL
        "RGB" image, so channels are in PIL's RGB order).
    """
    canvas = Image.new("RGB", (width, height), bg_color)

    # Prefer DejaVu Sans at size 36; use PIL's built-in font when that
    # file cannot be loaded.
    try:
        typeface = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 36
        )
    except OSError:
        typeface = ImageFont.load_default()

    # White text, starting 50 px from the left edge and 50 px above the
    # vertical midpoint -- high contrast against the default black background.
    text_origin = (50, height // 2 - 50)
    pen = ImageDraw.Draw(canvas)
    pen.text(text_origin, instruction, fill="white", font=typeface)

    return np.array(canvas)

Temporal Consistency Attacks

These attacks add perturbations that are imperceptible in any single frame but create coherent adversarial signals across the temporal dimension.

Flickering Perturbation

def temporal_flicker_attack(
    frames: List[np.ndarray],
    perturbation: np.ndarray,
    frequency: int = 2  # Toggle every N frames
) -> List[np.ndarray]:
    """
    Apply perturbation in an alternating pattern.

    Frames are processed in runs of ``frequency`` frames: the perturbation
    is added during even-numbered runs and subtracted during odd-numbered
    runs. The intent is that the flicker averages out visually while still
    affecting temporal features extracted by the model.

    Args:
        frames: Input frames; pixel values are treated as 0-255.
        perturbation: Array broadcastable against each frame.
        frequency: Number of consecutive frames per add/subtract run.

    Returns:
        A new list of ``uint8`` frames, each clipped to 0-255.

    Raises:
        ValueError: If ``frequency`` is less than 1.
    """
    if frequency < 1:
        raise ValueError(f"frequency must be >= 1, got {frequency}")

    delta = np.asarray(perturbation)
    result = []
    for i, frame in enumerate(frames):
        pixels = np.asarray(frame)
        # uint8 +/- uint8 wraps modulo 256 *before* np.clip sees it, so do
        # the arithmetic in a wider signed (or float) type; at least int16.
        work_dtype = np.result_type(pixels.dtype, delta.dtype, np.int16)
        widened = pixels.astype(work_dtype)
        if (i // frequency) % 2 == 0:
            shifted = widened + delta
        else:
            shifted = widened - delta
        result.append(np.clip(shifted, 0, 255).astype(np.uint8))
    return result

Progressive Perturbation

Gradually increase a perturbation over time so that each frame-to-frame change is imperceptible:

def progressive_perturbation(
    frames: List[np.ndarray],
    target_perturbation: np.ndarray,
    start_frame: int = 0,
    duration_frames: int = 30
) -> List[np.ndarray]:
    """
    Ramp a perturbation up linearly over a window of frames.

    Inside the window ``[start_frame, start_frame + duration_frames)`` the
    frame at offset ``k`` receives ``target_perturbation * k / duration_frames``
    (so the first frame of the window gets none of it and the last gets
    ``(duration_frames - 1) / duration_frames`` of it), then is clipped to
    0-255 and converted to ``uint8``.

    Frames outside the window are placed in the output list as-is (the same
    objects, not copies).

    Args:
        frames: Input frames.
        target_perturbation: Perturbation that the ramp scales towards.
        start_frame: Index of the first frame in the ramp window.
        duration_frames: Length of the ramp window in frames.

    Returns:
        A list with one entry per input frame.
    """
    window_end = start_frame + duration_frames
    output = []
    for index, original in enumerate(frames):
        inside_window = start_frame <= index < window_end
        if not inside_window:
            output.append(original)
            continue

        # Fraction of the target perturbation to apply at this offset.
        fraction = (index - start_frame) / duration_frames
        scaled = target_perturbation * fraction
        shifted = np.clip(original + scaled, 0, 255)
        output.append(shifted.astype(np.uint8))
    return output

Attack Effectiveness by Architecture

Architecture	Most Effective Attack	Reason
TimeSformer	Single-frame injection	Divided attention allows one frame to dominate
VideoMAE	Multi-frame perturbation	Masking-based training is robust to single frames
Video-LLaVA	Text-in-frame injection	LLM backbone reads injected text
SlowFast	Temporal flicker	Dual-pathway architecture is sensitive to temporal patterns
I3D / C3D	3D-convolution adversarial perturbation	3D kernels span adjacent frames

Video Model Attacks Overview -- broader video security context
Image-Based Prompt Injection -- per-frame injection techniques
Video Understanding Model Exploitation -- semantic-level video attacks

References

"Adversarial Attacks on Video Recognition Models" - Wei et al. (2022) - Temporal perturbation strategies for video models
"Flickering Adversarial Perturbations for Video Understanding" - Pony et al. (2021) - Temporal flicker-based attacks targeting video temporal features
"TimeSformer: Is Space-Time Attention All You Need for Video Understanding?" - Bertasius et al. (2021) - Architecture details relevant to temporal attention exploitation
"Physical Adversarial Attacks on Video Classification Models" - Li et al. (2019) - Frame injection strategies in physical environments

Knowledge Check

Why is single-frame injection effective against video models that use attention across frames?

Temporal Manipulation & Frame Injection

Related articles

Temporal Manipulation & Frame Injection

Related articles