時序操弄與影格注入
Expert · 4 min read · Updated 2026-03-13
將對抗影格注入影片串流的技術、時序一致性攻擊,以及透過單一影格注入改變影片 AI 模型行為。
利用影格取樣
影片模型僅處理影格子集。透過鎖定取樣策略,攻擊者可確保其對抗內容被處理到,同時於整段影片中難以察覺。
預測被取樣的影格
import numpy as np
from typing import List
def predict_sampled_indices(
    total_frames: int,
    num_samples: int = 16,
    strategy: str = "uniform"
) -> List[int]:
    """Predict which frame indices a video model will sample.

    Common strategies:
    - uniform: evenly spaced frames
    - center: frames concentrated around the temporal midpoint
    - random_uniform: one random frame per equal-width segment

    Args:
        total_frames: number of frames in the video.
        num_samples: how many frames the model samples.
        strategy: name of the sampling strategy.

    Returns:
        A list of ``num_samples`` frame indices, each clamped to
        ``[0, total_frames - 1]``; an empty list when ``total_frames <= 0``.
        (The original "center" formula could produce negative or
        past-the-end indices for videos shorter than the sampling window.)

    Raises:
        ValueError: if ``strategy`` is not recognized.
    """
    if total_frames <= 0:
        # Degenerate input: nothing to sample.
        return []
    if strategy == "uniform":
        indices = [int(i * total_frames / num_samples) for i in range(num_samples)]
    elif strategy == "center":
        center = total_frames // 2
        half = num_samples // 2
        step = max(1, total_frames // (num_samples * 2))
        indices = [center - half * step + i * step for i in range(num_samples)]
    elif strategy == "random_uniform":
        # One random frame per equal-width segment.  The floor of 1 keeps
        # np.random.randint from being called with an empty range when
        # total_frames < num_samples (which raised ValueError before).
        segment_size = max(1, total_frames // num_samples)
        indices = [
            int(np.random.randint(i * segment_size, (i + 1) * segment_size))
            for i in range(num_samples)
        ]
    else:
        raise ValueError(f"Unknown strategy: {strategy}")
    # Clamp so short videos never yield negative or out-of-range indices.
    return [min(max(idx, 0), total_frames - 1) for idx in indices]
針對性影格注入
只替換或修改將被取樣的影格:
import cv2
import numpy as np
def inject_adversarial_frames(
    video_path: str,
    output_path: str,
    adversarial_frame: np.ndarray,
    num_model_samples: int = 16
) -> dict:
    """Inject adversarial content only into the frames the target
    video model is predicted to sample.

    Args:
        video_path: source video file.
        output_path: where the modified video is written.
        adversarial_frame: image blended into the targeted frames
            (resized to the video dimensions).
        num_model_samples: how many frames the target model samples.

    Returns:
        dict with ``total_frames``, ``injected_frames`` and
        ``injection_rate`` (formatted percentage string).
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    injected_count = 0
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Predict which frame indices the model will sample.
        target_indices = set(predict_sampled_indices(total_frames, num_model_samples))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        # Resize once: the target size is loop-invariant (previously
        # resized again for every injected frame).
        adv_resized = cv2.resize(adversarial_frame, (width, height))
        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx in target_indices:
                # Blend adversarial content with the original frame.
                blended = cv2.addWeighted(frame, 0.7, adv_resized, 0.3, 0)
                out.write(blended)
                injected_count += 1
            else:
                out.write(frame)
            frame_idx += 1
    finally:
        # Release handles even if a read/write fails mid-stream.
        cap.release()
        if out is not None:
            out.release()
    # Guard against division by zero when the source is empty/unreadable.
    rate = injected_count / total_frames if total_frames else 0.0
    return {
        "total_frames": total_frames,
        "injected_frames": injected_count,
        "injection_rate": f"{rate:.1%}"
    }
單一影格注入
一項特別強力的攻擊:插入單一對抗影格即可改變模型對整段影片的理解。
實作
def single_frame_injection(
    video_path: str,
    injection_frame: np.ndarray,
    injection_position: float = 0.5,  # 0-1, position within the video
    output_path: str = "injected.mp4"
) -> str:
    """Replace a single frame at a given relative position.

    The position should line up with a likely sampling point.  For
    16-frame uniform sampling, the center frame (position 0.5) is
    always sampled.

    Args:
        video_path: source video file.
        injection_frame: replacement frame (resized to the video size).
        injection_position: relative position in [0, 1] of the frame to replace.
        output_path: where the modified video is written.

    Returns:
        The output video path.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Clamp into [0, total_frames - 1]: position 1.0 previously mapped
        # to index == total_frames, which is never reached, so nothing was
        # injected at all.
        target_frame = min(max(int(total_frames * injection_position), 0),
                           max(total_frames - 1, 0))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx == target_frame:
                # Replacement must match the stream dimensions.
                out.write(cv2.resize(injection_frame, (width, height)))
            else:
                out.write(frame)
            frame_idx += 1
    finally:
        # Release handles even if a read/write fails mid-stream.
        cap.release()
        if out is not None:
            out.release()
    return output_path
對影片 LLM 的「影格內文字」注入
對以 LLM 為基礎的影片理解系統,注入一張含文字指令的影格:
from PIL import Image, ImageDraw, ImageFont
def create_text_injection_frame(
    instruction: str,
    width: int = 1920,
    height: int = 1080,
    bg_color: tuple = (0, 0, 0)
) -> np.ndarray:
    """Render an instruction string onto a blank frame, returned as an array."""
    canvas = Image.new("RGB", (width, height), bg_color)
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 36
        )
    except OSError:
        # Fall back to PIL's built-in bitmap font when DejaVu is absent.
        font = ImageFont.load_default()
    # White text on a black background: high contrast for OCR.
    drawer = ImageDraw.Draw(canvas)
    drawer.text((50, height // 2 - 50), instruction, fill="white", font=font)
    return np.array(canvas)
時序一致性攻擊
這類攻擊加入的擾動在任何單一影格中都不可察覺,但會於時間維度上形成一致的對抗訊號。
閃爍擾動
def temporal_flicker_attack(
    frames: List[np.ndarray],
    perturbation: np.ndarray,
    frequency: int = 2  # toggle every N frames
) -> List[np.ndarray]:
    """Apply a perturbation in an alternating (flicker) pattern.

    Visually the perturbation averages out over time (high-frequency
    flicker is hard to perceive) but it affects the temporal features
    the model extracts.

    Bug fixed: video frames are typically uint8, and ``frame +
    perturbation`` on two uint8 arrays wraps modulo 256 *before*
    ``np.clip`` runs (250 + 10 -> 4, 3 - 10 -> 249), producing garbage
    pixels instead of saturating.  Arithmetic is now done in float64
    and only then clipped back into [0, 255].

    Args:
        frames: input frames.
        perturbation: additive perturbation, broadcastable to a frame.
        frequency: number of consecutive frames per polarity.

    Returns:
        New list of uint8 frames with alternating +/- perturbation.
    """
    result = []
    for i, frame in enumerate(frames):
        # Promote so the sum/difference cannot wrap before clipping.
        base = frame.astype(np.float64)
        if (i // frequency) % 2 == 0:
            perturbed = np.clip(base + perturbation, 0, 255).astype(np.uint8)
        else:
            perturbed = np.clip(base - perturbation, 0, 255).astype(np.uint8)
        result.append(perturbed)
    return result
漸進擾動
隨時間逐漸增加擾動,使每影格對影格的變動皆不可察覺:
def progressive_perturbation(
    frames: List[np.ndarray],
    target_perturbation: np.ndarray,
    start_frame: int = 0,
    duration_frames: int = 30
) -> List[np.ndarray]:
    """Ramp a perturbation in gradually over a window of frames.

    Each frame differs only minimally from its predecessor, keeping the
    frame-to-frame change imperceptible.  Frames outside the
    [start_frame, start_frame + duration_frames) window pass through
    unchanged.
    """
    end_frame = start_frame + duration_frames
    out: List[np.ndarray] = []
    for idx, frame in enumerate(frames):
        if start_frame <= idx < end_frame:
            # Linear ramp: 0 at the window start, approaching 1 at its end.
            fraction = (idx - start_frame) / duration_frames
            shifted = frame + target_perturbation * fraction
            out.append(np.clip(shifted, 0, 255).astype(np.uint8))
        else:
            out.append(frame)
    return out
依架構的攻擊有效性
| 架構 | 最有效攻擊 | 原因 |
|---|---|---|
| TimeSformer | 單一影格注入 | 分離式注意力讓一個影格可主導 |
| VideoMAE | 多影格擾動 | 以 masking 為基礎的訓練對單影格較穩健 |
| Video-LLaVA | 影格內文字注入 | LLM 後端會讀取注入文字 |
| SlowFast | 時序閃爍 | 雙通路架構對時序模式敏感 |
| I3D / C3D | 3D 卷積對抗 | 3D kernel 涵蓋相鄰影格 |
相關主題
參考資料
- "Adversarial Attacks on Video Recognition Models" - Wei et al.(2022)- 影片模型的時序擾動策略
- "Flickering Adversarial Perturbations for Video Understanding" - Pony et al.(2021)- 針對影片時序特徵的閃爍式攻擊
- "TimeSformer: Is Space-Time Attention All You Need for Video Understanding?" - Bertasius et al.(2021)- 與時序注意力利用相關的架構細節
- "Physical Adversarial Attacks on Video Classification Models" - Li et al.(2019)- 實體環境中的影格注入策略
Knowledge Check
為什麼單一影格注入對使用跨影格注意力的影片模型有效?