Lab: Crafting Audio Adversarial Examples
Hands-on lab creating adversarial audio examples using Python audio processing, targeting Whisper transcription with injected text.
Lab Setup
Prerequisites
pip install torch torchaudio openai-whisper numpy scipy soundfile matplotlib
Environment Check
# Environment check: confirm the lab dependencies import cleanly and report
# whether CUDA acceleration is available before running the exercises.
import torch
import whisper
import numpy as np
import soundfile as sf
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# Load Whisper model (downloads the "base" weights on first use)
model = whisper.load_model("base")
print("Whisper model loaded successfully")
Exercise 1: Baseline Transcription
First, establish baseline behavior with clean audio:
import numpy as np
import soundfile as sf
import whisper
def generate_test_audio(
    text_content: str = "hello world",
    duration: float = 3.0,
    sample_rate: int = 16000,
    tone_hz: float = 440.0
) -> np.ndarray:
    """Generate simple synthetic test audio (a sine tone plus light noise).

    Args:
        text_content: Nominal label for the clip. NOTE: currently unused --
            no speech synthesis is performed; kept for interface stability
            and a possible future TTS-based generator.
        duration: Clip length in seconds.
        sample_rate: Samples per second (Whisper expects 16 kHz).
        tone_hz: Frequency of the generated tone in Hz (default A4, 440 Hz).

    Returns:
        float32 waveform of shape (int(duration * sample_rate),). Whisper
        will likely transcribe the tone as silence/noise.
    """
    t = np.linspace(0, duration, int(duration * sample_rate), endpoint=False)
    # Pure tone that Whisper will likely transcribe as silence/noise
    audio = 0.3 * np.sin(2 * np.pi * tone_hz * t)
    # Small unseeded Gaussian noise for realism (each call differs slightly)
    audio += 0.01 * np.random.randn(len(audio))
    return audio.astype(np.float32)
def transcribe_audio(audio: np.ndarray, model) -> str:
    """Run Whisper on *audio* and return the stripped transcription text.

    Decoding is pinned to English with fp16 disabled so results are
    reproducible across CPU/GPU runs.
    """
    decode_options = {
        "language": "en",
        "fp16": False,  # fp32 for reproducibility
    }
    result = model.transcribe(audio, **decode_options)
    return result["text"].strip()
# Baseline test: establish what Whisper outputs on clean, non-speech audio
model = whisper.load_model("base")
# Test with generated audio (sine tone; expected to transcribe as little/nothing)
test_audio = generate_test_audio()
baseline_transcription = transcribe_audio(test_audio, model)
print(f"Baseline transcription: '{baseline_transcription}'")
# Save for reference so later adversarial outputs can be compared against it
sf.write("baseline_audio.wav", test_audio, 16000)
Exercise 2: Simple Gradient Attack
Implement a basic gradient-based attack to make Whisper transcribe a target phrase:
import torch
import whisper
import numpy as np
def simple_whisper_attack(
    model,
    source_audio: np.ndarray,
    target_text: str,
    epsilon: float = 0.02,
    num_steps: int = 200,
    learning_rate: float = 0.001,
    sample_rate: int = 16000
) -> tuple[np.ndarray, list[float]]:
    """
    Simple gradient attack on Whisper: optimize an additive perturbation
    delta (L-infinity bounded by *epsilon*) so the decoder assigns high
    probability to *target_text* under teacher forcing.

    Args:
        model: A loaded openai-whisper model (e.g. whisper.load_model("base")).
        source_audio: 1-D float32 waveform at *sample_rate* Hz.
        target_text: Phrase the attack tries to force Whisper to transcribe.
        epsilon: L-infinity bound on the perturbation amplitude.
        num_steps: Number of Adam optimization steps.
        learning_rate: Adam step size.
        sample_rate: Sample rate of *source_audio* (Whisper expects 16 kHz).

    Returns:
        adversarial_audio: The perturbed audio (same length as the input)
        loss_history: Cross-entropy loss at each optimization step
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Prepare audio tensor; only delta receives gradients
    audio_tensor = torch.from_numpy(source_audio).float().to(device)
    audio_tensor.requires_grad_(False)
    # Initialize perturbation at zero
    delta = torch.zeros_like(audio_tensor, requires_grad=True)
    # Build the teacher-forcing token sequence. FIX: the decoder must be fed
    # the <|startoftranscript|>/language/task prefix and terminated with
    # <|endoftext|>; without them the cross-entropy target does not match
    # what Whisper actually decodes at inference time.
    tokenizer = whisper.tokenizer.get_tokenizer(
        model.is_multilingual, language="en", task="transcribe"
    )
    target_tokens = (
        list(tokenizer.sot_sequence)
        + tokenizer.encode(" " + target_text.strip())
        + [tokenizer.eot]
    )
    target_ids = torch.tensor([target_tokens], device=device)
    optimizer = torch.optim.Adam([delta], lr=learning_rate)
    loss_history = []
    for step in range(num_steps):
        optimizer.zero_grad()
        # Apply perturbation
        adv_audio = audio_tensor + delta
        # FIX: Whisper's encoder expects a fixed 30 s context; pad (or trim)
        # before the mel transform so the positional embeddings line up.
        # pad_or_trim uses F.pad on tensors, so gradients still flow to delta.
        padded = whisper.pad_or_trim(adv_audio)
        # Compute mel spectrogram (differentiable torch.stft pipeline)
        mel = whisper.log_mel_spectrogram(padded).to(device)
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)
        # Encode audio
        audio_features = model.encoder(mel)
        # Teacher forcing: feed tokens [:-1], predict tokens [1:]
        logits = model.decoder(target_ids[:, :-1], audio_features)
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            target_ids[:, 1:].reshape(-1)
        )
        loss.backward()
        loss_history.append(loss.item())
        optimizer.step()
        # PGD-style projection: back into the epsilon ball, then keep the
        # perturbed waveform a valid signal in [-1, 1]
        with torch.no_grad():
            delta.data = torch.clamp(delta.data, -epsilon, epsilon)
            delta.data = torch.clamp(
                audio_tensor + delta.data, -1.0, 1.0
            ) - audio_tensor
        if step % 50 == 0:
            print(f"Step {step}/{num_steps}, Loss: {loss.item():.4f}")
    adv_audio = (audio_tensor + delta).detach().cpu().numpy()
    return adv_audio, loss_history
# Run the attack: craft a 3 s low-amplitude noise clip and try to force the
# target phrase onto it
model = whisper.load_model("base")
source = np.random.randn(16000 * 3).astype(np.float32) * 0.01 # 3 sec noise
target = "the password is twelve thirty four"
adv_audio, losses = simple_whisper_attack(model, source, target)
# Verify: transcribe the adversarial audio and compare against the target
result = model.transcribe(adv_audio, language="en", fp16=False)
print(f"Target: '{target}'")
print(f"Actual transcription: '{result['text'].strip()}'")
Exercise 3: Perceptual Quality Evaluation
Measure whether the adversarial audio is detectable by humans:
import numpy as np
from scipy.signal import stft
def compute_snr(original: np.ndarray, adversarial: np.ndarray) -> float:
    """Compute Signal-to-Noise Ratio in dB.

    Treats *original* as the signal and (adversarial - original) as noise;
    returns +inf for a numerically negligible perturbation.
    """
    p_signal = np.mean(original ** 2)
    p_noise = np.mean((adversarial - original) ** 2)
    # Guard against log(0) when the perturbation is effectively zero
    if p_noise < 1e-10:
        return float('inf')
    return 10 * np.log10(p_signal / p_noise)
def compute_spectral_distortion(
    original: np.ndarray,
    adversarial: np.ndarray,
    sr: int = 16000
) -> float:
    """Compute log-spectral distortion (LSD, in dB) between two signals.

    Both signals are transformed with a 512-sample STFT; the RMS of the
    per-bin difference of their power spectra (in dB) is returned.
    """
    db_spectra = []
    for waveform in (original, adversarial):
        _, _, coeffs = stft(waveform, fs=sr, nperseg=512)
        # Small epsilon keeps log10 finite on (near-)silent bins
        db_spectra.append(10 * np.log10(np.abs(coeffs) ** 2 + 1e-10))
    orig_db, adv_db = db_spectra
    return np.sqrt(np.mean((orig_db - adv_db) ** 2))
def evaluate_attack(
original: np.ndarray,
adversarial: np.ndarray,
model,
target_text: str
) -> dict:
"""Comprehensive evaluation of an adversarial audio attack."""
# Transcription success
result = model.transcribe(adversarial, language="en", fp16=False)
transcription = result["text"].strip().lower()
target_lower = target_text.lower()
# Exact match
exact_match = transcription == target_lower
# Partial match (what fraction of target words appear)
target_words = set(target_lower.split())
trans_words = set(transcription.split())
word_overlap = len(target_words & trans_words) / len(target_words) if target_words else 0
# Perceptual metrics
snr = compute_snr(original, adversarial)
lsd = compute_spectral_distortion(original, adversarial)
max_perturbation = np.max(np.abs(adversarial - original))
return {
"target_text": target_text,
"transcription": transcription,
"exact_match": exact_match,
"word_overlap": f"{word_overlap:.1%}",
"snr_db": f"{snr:.1f}",
"log_spectral_distortion": f"{lsd:.2f}",
"max_perturbation": f"{max_perturbation:.4f}",
"imperceptible": snr > 30 and max_perturbation < 0.02
}Exercise 4: Robustness Testing
Test how well adversarial audio survives real-world transformations:
MP3 Compression
def test_mp3_robustness(adv_audio, sr=16000): """Test if adversarial audio survives MP3 compression.""" import io import soundfile as sf # Save as WAV, convert to MP3, load back sf.write("/tmp/adv_test.wav", adv_audio, sr) # Requires ffmpeg import subprocess subprocess.run([ "ffmpeg", "-y", "-i", "/tmp/adv_test.wav", "-b:a", "128k", "/tmp/adv_test.mp3" ], capture_output=True) subprocess.run([ "ffmpeg", "-y", "-i", "/tmp/adv_test.mp3", "/tmp/adv_test_back.wav" ], capture_output=True) compressed, _ = sf.read("/tmp/adv_test_back.wav") return compressedAdditive Noise
def test_noise_robustness(adv_audio, noise_levels=(0.001, 0.005, 0.01)):
    """Test adversarial audio under additive Gaussian noise.

    Args:
        adv_audio: 1-D float32 adversarial waveform.
        noise_levels: Noise standard deviations to try. (Default changed
            from a list to a tuple to avoid the mutable-default pitfall;
            it is only iterated, so callers are unaffected.)

    Returns:
        Mapping of noise level -> Whisper transcription of the noisy audio.

    NOTE(review): relies on the module-level `model` loaded earlier in the
    lab script, and on module-level `np`.
    """
    results = {}
    for noise_level in noise_levels:
        # Add white noise, then clip back into the valid signal range
        noisy = adv_audio + noise_level * np.random.randn(len(adv_audio))
        noisy = np.clip(noisy, -1, 1).astype(np.float32)
        result = model.transcribe(noisy, language="en", fp16=False)
        results[noise_level] = result["text"].strip()
    return results

Resampling
def test_resample_robustness(adv_audio, original_sr=16000, target_srs=[8000, 22050, 44100]): """Test if adversarial audio survives sample rate conversion.""" from scipy.signal import resample results = {} for target_sr in target_srs: ratio = target_sr / original_sr resampled = resample(adv_audio, int(len(adv_audio) * ratio)) back = resample(resampled, len(adv_audio)).astype(np.float32) result = model.transcribe(back, language="en", fp16=False) results[target_sr] = result["text"].strip() return resultsCompile Results
Create a summary table of robustness across all transformations.
Expected Results
| Condition | Expected Success Rate | Notes |
|---|---|---|
| Clean (no transformation) | 70-95% | With sufficient optimization steps |
| MP3 128kbps | 20-50% | Compression destroys fine perturbations |
| Additive noise (SNR 30dB) | 40-70% | Mild noise, moderate degradation |
| Additive noise (SNR 20dB) | 10-30% | Significant degradation |
| Resample 16k->8k->16k | 15-35% | Loses high-frequency perturbation content |
Related Topics
- Adversarial Audio Examples -- theoretical foundations for this lab
- Speech Recognition Attacks -- ASR attack strategies
- Lab: Crafting Image-Based Injections -- parallel lab in the visual domain
References
- "Audio Adversarial Examples: Targeted Attacks on Speech-to-Text" - Carlini & Wagner (2018) - Core attack methodology applied in this lab
- "Robust Speech Recognition via Large-Scale Weak Supervision" - Radford et al. (2022) - Whisper model architecture targeted in lab exercises
- "Imperceptible, Robust, and Targeted Adversarial Examples for Automatic Speech Recognition" - Qin et al. (2019) - Psychoacoustic evaluation metrics used in quality assessment
- "Robust Audio Adversarial Example for a Physical Attack" - Yakura & Sakuma (2019) - Over-the-air robustness testing methodology
Why is MP3 compression a significant challenge for adversarial audio attacks?