實作:打造音訊對抗範例
Advanced · 5 min read · Updated 2026-03-13
使用 Python 音訊處理建立對抗音訊範例之實作,針對 Whisper 轉譯注入文字。
實作設置
先備條件
pip install torch torchaudio openai-whisper numpy scipy soundfile matplotlib
環境檢查
import torch
import whisper
import numpy as np
import soundfile as sf
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# Load the Whisper model
model = whisper.load_model("base")
print("Whisper model loaded successfully")
練習 1:基線轉譯
首先,以乾淨音訊建立基線行為:
import numpy as np
import soundfile as sf
import whisper
def generate_test_audio(
    text_content: str = "hello world",
    duration: float = 3.0,
    sample_rate: int = 16000
) -> np.ndarray:
    """Synthesize a short test clip: a 440 Hz tone plus faint Gaussian noise.

    Note: ``text_content`` is accepted for API symmetry but is not used;
    the output is a pure tone (which Whisper may transcribe as
    silence/noise), not synthesized speech.
    """
    n_samples = int(duration * sample_rate)
    timeline = np.linspace(0.0, duration, n_samples, endpoint=False)
    tone = 0.3 * np.sin(2.0 * np.pi * 440.0 * timeline)
    # A little noise for realism
    noise = 0.01 * np.random.randn(n_samples)
    return (tone + noise).astype(np.float32)
def transcribe_audio(audio: np.ndarray, model) -> str:
    """Run Whisper on *audio* and return the stripped transcript text.

    fp16 is disabled so results are reproducible (fp32 inference).
    """
    output = model.transcribe(audio, language="en", fp16=False)
    return output["text"].strip()
# Baseline test
model = whisper.load_model("base")
# Test with generated audio
test_audio = generate_test_audio()
baseline_transcription = transcribe_audio(test_audio, model)
print(f"Baseline transcription: '{baseline_transcription}'")
# Save for reference
sf.write("baseline_audio.wav", test_audio, 16000)
練習 2:簡單梯度攻擊
實作基礎以梯度為本之攻擊使 Whisper 轉譯目標片語:
import torch
import whisper
import numpy as np
def simple_whisper_attack(
    model,
    source_audio: np.ndarray,
    target_text: str,
    epsilon: float = 0.02,
    num_steps: int = 200,
    learning_rate: float = 0.001,
    sample_rate: int = 16000
) -> tuple[np.ndarray, list[float]]:
    """
    Simple gradient-based targeted attack against Whisper.

    Optimizes an additive perturbation ``delta``, projected onto the
    L-infinity ball of radius ``epsilon`` after every step, so that the
    decoder assigns high probability to ``target_text``.

    Args:
        model: A loaded ``whisper`` model (e.g. ``whisper.load_model("base")``).
        source_audio: Mono float waveform; Whisper expects 16 kHz audio.
        target_text: Phrase the transcription should be forced toward.
        epsilon: Maximum absolute per-sample perturbation.
        num_steps: Number of Adam optimization steps.
        learning_rate: Adam step size.
        sample_rate: Sample rate of ``source_audio`` (informational; the
            mel pipeline assumes 16 kHz).

    Returns:
        adversarial_audio: The perturbed audio as a numpy array.
        loss_history: Loss value at each optimization step.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Freeze model weights: only the perturbation is optimized, and this
    # avoids needlessly accumulating gradients in the model parameters.
    for param in model.parameters():
        param.requires_grad_(False)
    # Prepare the audio tensor
    audio_tensor = torch.from_numpy(source_audio).float().to(device)
    # Initialize the perturbation
    delta = torch.zeros_like(audio_tensor, requires_grad=True)
    # Build the target token sequence. The decoder is trained on sequences
    # that begin with the SOT (start-of-transcript) special tokens and end
    # with EOT, so include them for a well-formed teacher-forcing target.
    tokenizer = whisper.tokenizer.get_tokenizer(
        model.is_multilingual, language="en", task="transcribe"
    )
    target_tokens = (
        list(tokenizer.sot_sequence_including_notimestamps)
        + tokenizer.encode(target_text)
        + [tokenizer.eot]
    )
    target_ids = torch.tensor([target_tokens], device=device)
    optimizer = torch.optim.Adam([delta], lr=learning_rate)
    loss_history = []
    for step in range(num_steps):
        optimizer.zero_grad()
        # Apply the perturbation
        adv_audio = audio_tensor + delta
        # Whisper's encoder expects a fixed 30 s context (3000 mel frames);
        # pad (or trim) the waveform before the mel spectrogram, otherwise
        # the encoder's positional-embedding shape check fails for short
        # clips. pad_or_trim uses F.pad, so gradients still flow into delta.
        padded = whisper.pad_or_trim(adv_audio)
        mel = whisper.log_mel_spectrogram(padded).to(device)
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)
        # Encode the audio
        audio_features = model.encoder(mel)
        # Teacher-forced decoding: cross-entropy between the decoder's
        # next-token logits and the target sequence shifted by one.
        logits = model.decoder(target_ids[:, :-1], audio_features)
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            target_ids[:, 1:].reshape(-1)
        )
        loss.backward()
        loss_history.append(loss.item())
        optimizer.step()
        # Project the perturbation back onto the epsilon ball
        with torch.no_grad():
            delta.data = torch.clamp(delta.data, -epsilon, epsilon)
            # Ensure the perturbed audio stays in the valid [-1, 1] range
            delta.data = torch.clamp(
                audio_tensor + delta.data, -1.0, 1.0
            ) - audio_tensor
        if step % 50 == 0:
            print(f"Step {step}/{num_steps}, Loss: {loss.item():.4f}")
    adv_audio = (audio_tensor + delta).detach().cpu().numpy()
    return adv_audio, loss_history
# Run the attack
model = whisper.load_model("base")
source = np.random.randn(16000 * 3).astype(np.float32) * 0.01 # 3 seconds of noise
target = "the password is twelve thirty four"
adv_audio, losses = simple_whisper_attack(model, source, target)
# Verify
result = model.transcribe(adv_audio, language="en", fp16=False)
print(f"Target: '{target}'")
print(f"Actual transcription: '{result['text'].strip()}'")
練習 3:感知品質評估
量測對抗音訊是否可被人類偵測:
import numpy as np
from scipy.signal import stft
def compute_snr(original: np.ndarray, adversarial: np.ndarray) -> float:
    """Signal-to-noise ratio of the perturbation, in dB.

    Treats ``original`` as the signal and ``adversarial - original`` as the
    noise; returns +inf when the two waveforms are (nearly) identical.
    """
    p_signal = np.mean(np.square(original))
    p_noise = np.mean(np.square(adversarial - original))
    if p_noise < 1e-10:
        return float('inf')
    return 10 * np.log10(p_signal / p_noise)
def compute_spectral_distortion(
    original: np.ndarray,
    adversarial: np.ndarray,
    sr: int = 16000
) -> float:
    """Root-mean-square log-spectral distortion (dB) between the two signals."""
    # Power spectrograms, floored at 1e-10 so the log is always defined.
    powers = [
        np.abs(stft(signal, fs=sr, nperseg=512)[2]) ** 2 + 1e-10
        for signal in (original, adversarial)
    ]
    log_diff = 10 * np.log10(powers[0]) - 10 * np.log10(powers[1])
    return np.sqrt(np.mean(log_diff ** 2))
def evaluate_attack(
original: np.ndarray,
adversarial: np.ndarray,
model,
target_text: str
) -> dict:
"""Full evaluation of an adversarial-audio attack: transcription success plus perceptual-quality metrics."""
# Transcription success
result = model.transcribe(adversarial, language="en", fp16=False)
transcription = result["text"].strip().lower()
target_lower = target_text.lower()
# Exact match
exact_match = transcription == target_lower
# Partial match (what fraction of target words appear)
target_words = set(target_lower.split())
trans_words = set(transcription.split())
word_overlap = len(target_words & trans_words) / len(target_words) if target_words else 0
# Perceptual metrics
snr = compute_snr(original, adversarial)
lsd = compute_spectral_distortion(original, adversarial)
max_perturbation = np.max(np.abs(adversarial - original))
return {
"target_text": target_text,
"transcription": transcription,
"exact_match": exact_match,
"word_overlap": f"{word_overlap:.1%}",
"snr_db": f"{snr:.1f}",
"log_spectral_distortion": f"{lsd:.2f}",
"max_perturbation": f"{max_perturbation:.4f}",
"imperceptible": snr > 30 and max_perturbation < 0.02
}
練習 4:穩健度測試
測試對抗音訊於現實世界變換下倖存之程度:
MP3 壓縮
def test_mp3_robustness(adv_audio, sr=16000):
    """Test whether adversarial audio survives MP3 compression.

    Round-trips the audio through WAV -> MP3 (128 kbps) -> WAV and returns
    the decompressed waveform for re-transcription by the caller.

    Requires the ``ffmpeg`` binary on PATH. Removed the unused ``import io``
    from the original version.
    """
    import subprocess
    import soundfile as sf
    # Save as WAV, convert to MP3, load back
    sf.write("/tmp/adv_test.wav", adv_audio, sr)
    subprocess.run([
        "ffmpeg", "-y", "-i", "/tmp/adv_test.wav",
        "-b:a", "128k", "/tmp/adv_test.mp3"
    ], capture_output=True)
    subprocess.run([
        "ffmpeg", "-y", "-i", "/tmp/adv_test.mp3", "/tmp/adv_test_back.wav"
    ], capture_output=True)
    compressed, _ = sf.read("/tmp/adv_test_back.wav")
    return compressed
加性雜訊
def test_noise_robustness(adv_audio, noise_levels=None):
    """Test adversarial audio under additive Gaussian noise.

    Args:
        adv_audio: Adversarial waveform (float samples in [-1, 1]).
        noise_levels: Iterable of noise standard deviations to test;
            defaults to (0.001, 0.005, 0.01). The original signature used a
            mutable list default, which is a Python anti-pattern.

    Returns:
        dict mapping each noise level to the resulting transcription.

    NOTE(review): relies on a module-level ``model`` (the loaded Whisper
    model) being in scope.
    """
    if noise_levels is None:
        noise_levels = (0.001, 0.005, 0.01)
    results = {}
    for noise_level in noise_levels:
        noisy = adv_audio + noise_level * np.random.randn(len(adv_audio))
        noisy = np.clip(noisy, -1, 1).astype(np.float32)
        result = model.transcribe(noisy, language="en", fp16=False)
        results[noise_level] = result["text"].strip()
    return results
重新取樣
def test_resample_robustness(adv_audio, original_sr=16000, target_srs=None):
    """Test whether adversarial audio survives sample-rate conversion.

    Resamples to each target rate and back, then re-transcribes.

    Args:
        adv_audio: Adversarial waveform at ``original_sr`` Hz.
        original_sr: Original sample rate.
        target_srs: Iterable of intermediate sample rates; defaults to
            (8000, 22050, 44100). The original signature used a mutable
            list default, which is a Python anti-pattern.

    Returns:
        dict mapping each target rate to the resulting transcription.

    NOTE(review): relies on a module-level ``model`` (the loaded Whisper
    model) being in scope.
    """
    from scipy.signal import resample
    if target_srs is None:
        target_srs = (8000, 22050, 44100)
    results = {}
    for target_sr in target_srs:
        ratio = target_sr / original_sr
        resampled = resample(adv_audio, int(len(adv_audio) * ratio))
        back = resample(resampled, len(adv_audio)).astype(np.float32)
        result = model.transcribe(back, language="en", fp16=False)
        results[target_sr] = result["text"].strip()
    return results
編譯結果
建立跨所有變換穩健度之摘要表。
預期結果
| 條件 | 預期成功率 | 備註 |
|---|---|---|
| 乾淨(無變換) | 70-95% | 足夠最佳化步驟下 |
| MP3 128kbps | 20-50% | 壓縮破壞細微擾動 |
| 加性雜訊(SNR 30dB) | 40-70% | 輕微雜訊、中等退化 |
| 加性雜訊(SNR 20dB) | 10-30% | 顯著退化 |
| 重新取樣 16k->8k->16k | 15-35% | 失去高頻擾動內容 |
相關主題
- 對抗音訊範例 -- 本實作之理論基礎
- 語音辨識攻擊 -- ASR 攻擊策略
- 實作:打造以圖像為本之注入 -- 視覺領域之並行實作
參考資料
- "Audio Adversarial Examples: Targeted Attacks on Speech-to-Text" - Carlini & Wagner(2018)- 於本實作應用之核心攻擊方法論
- "Robust Speech Recognition via Large-Scale Weak Supervision" - Radford et al.(2022)- 於實作練習中針對之 Whisper 模型架構
- "Imperceptible, Robust, and Targeted Adversarial Examples for Automatic Speech Recognition" - Qin et al.(2019)- 於品質評估使用之心理聲學評估指標
- "Robust Audio Adversarial Example for a Physical Attack" - Yakura & Sakuma(2019)- 空中穩健度測試方法論
Knowledge Check
為何 MP3 壓縮對對抗音訊攻擊為顯著挑戰?