Lab: Crafting Audio Adversarial Examples
Hands-on lab creating adversarial audio examples using Python audio processing, targeting Whisper transcription with injected text.
Lab Setup
Prerequisites
pip install torch torchaudio openai-whisper numpy scipy soundfile matplotlib
Environment Check
# Environment check: confirm the lab dependencies import cleanly and report
# whether CUDA acceleration is available before running the exercises.
import torch
import whisper
import numpy as np
import soundfile as sf
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# Load Whisper model (downloads the "base" weights on first use)
model = whisper.load_model("base")
print("Whisper model loaded successfully")
Exercise 1: Baseline Transcription
First, establish baseline behavior with clean audio:
import numpy as np
import soundfile as sf
import whisper
def generate_test_audio(
    text_content: str = "hello world",
    duration: float = 3.0,
    sample_rate: int = 16000,
    tone_hz: float = 440.0
) -> np.ndarray:
    """Generate simple synthetic test audio (a sine tone plus light noise).

    Args:
        text_content: Nominal label for the clip. NOTE: currently unused --
            no speech synthesis is performed; kept for interface stability
            and a possible future TTS-based generator.
        duration: Clip length in seconds.
        sample_rate: Samples per second (Whisper expects 16 kHz).
        tone_hz: Frequency of the generated tone in Hz (default A4, 440 Hz).

    Returns:
        float32 waveform of shape (int(duration * sample_rate),). Whisper
        will likely transcribe the tone as silence/noise.
    """
    t = np.linspace(0, duration, int(duration * sample_rate), endpoint=False)
    # Pure tone that Whisper will likely transcribe as silence/noise
    audio = 0.3 * np.sin(2 * np.pi * tone_hz * t)
    # Small unseeded Gaussian noise for realism (each call differs slightly)
    audio += 0.01 * np.random.randn(len(audio))
    return audio.astype(np.float32)
def transcribe_audio(audio: np.ndarray, model) -> str:
    """Run Whisper on *audio* and return the stripped transcription text.

    Decoding is pinned to English with fp16 disabled so results are
    reproducible across CPU/GPU runs.
    """
    decode_options = {
        "language": "en",
        "fp16": False,  # fp32 for reproducibility
    }
    result = model.transcribe(audio, **decode_options)
    return result["text"].strip()
# Baseline test: establish what Whisper outputs on clean, non-speech audio
model = whisper.load_model("base")
# Test with generated audio (sine tone; expected to transcribe as little/nothing)
test_audio = generate_test_audio()
baseline_transcription = transcribe_audio(test_audio, model)
print(f"Baseline transcription: '{baseline_transcription}'")
# Save for reference so later adversarial outputs can be compared against it
sf.write("baseline_audio.wav", test_audio, 16000)
Exercise 2: Simple Gradient Attack
Implement a basic gradient-based attack to make Whisper transcribe a target phrase:
import torch
import whisper
import numpy as np
def simple_whisper_attack(
    model,
    source_audio: np.ndarray,
    target_text: str,
    epsilon: float = 0.02,
    num_steps: int = 200,
    learning_rate: float = 0.001,
    sample_rate: int = 16000
) -> tuple[np.ndarray, list[float]]:
    """
    Simple gradient attack on Whisper: optimize an additive perturbation
    delta (L-infinity bounded by *epsilon*) so the decoder assigns high
    probability to *target_text* under teacher forcing.

    Args:
        model: A loaded openai-whisper model (e.g. whisper.load_model("base")).
        source_audio: 1-D float32 waveform at *sample_rate* Hz.
        target_text: Phrase the attack tries to force Whisper to transcribe.
        epsilon: L-infinity bound on the perturbation amplitude.
        num_steps: Number of Adam optimization steps.
        learning_rate: Adam step size.
        sample_rate: Sample rate of *source_audio* (Whisper expects 16 kHz).

    Returns:
        adversarial_audio: The perturbed audio (same length as the input)
        loss_history: Cross-entropy loss at each optimization step
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Prepare audio tensor; only delta receives gradients
    audio_tensor = torch.from_numpy(source_audio).float().to(device)
    audio_tensor.requires_grad_(False)
    # Initialize perturbation at zero
    delta = torch.zeros_like(audio_tensor, requires_grad=True)
    # Build the teacher-forcing token sequence. FIX: the decoder must be fed
    # the <|startoftranscript|>/language/task prefix and terminated with
    # <|endoftext|>; without them the cross-entropy target does not match
    # what Whisper actually decodes at inference time.
    tokenizer = whisper.tokenizer.get_tokenizer(
        model.is_multilingual, language="en", task="transcribe"
    )
    target_tokens = (
        list(tokenizer.sot_sequence)
        + tokenizer.encode(" " + target_text.strip())
        + [tokenizer.eot]
    )
    target_ids = torch.tensor([target_tokens], device=device)
    optimizer = torch.optim.Adam([delta], lr=learning_rate)
    loss_history = []
    for step in range(num_steps):
        optimizer.zero_grad()
        # Apply perturbation
        adv_audio = audio_tensor + delta
        # FIX: Whisper's encoder expects a fixed 30 s context; pad (or trim)
        # before the mel transform so the positional embeddings line up.
        # pad_or_trim uses F.pad on tensors, so gradients still flow to delta.
        padded = whisper.pad_or_trim(adv_audio)
        # Compute mel spectrogram (differentiable torch.stft pipeline)
        mel = whisper.log_mel_spectrogram(padded).to(device)
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)
        # Encode audio
        audio_features = model.encoder(mel)
        # Teacher forcing: feed tokens [:-1], predict tokens [1:]
        logits = model.decoder(target_ids[:, :-1], audio_features)
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            target_ids[:, 1:].reshape(-1)
        )
        loss.backward()
        loss_history.append(loss.item())
        optimizer.step()
        # PGD-style projection: back into the epsilon ball, then keep the
        # perturbed waveform a valid signal in [-1, 1]
        with torch.no_grad():
            delta.data = torch.clamp(delta.data, -epsilon, epsilon)
            delta.data = torch.clamp(
                audio_tensor + delta.data, -1.0, 1.0
            ) - audio_tensor
        if step % 50 == 0:
            print(f"Step {step}/{num_steps}, Loss: {loss.item():.4f}")
    adv_audio = (audio_tensor + delta).detach().cpu().numpy()
    return adv_audio, loss_history
# Run the attack: craft a 3 s low-amplitude noise clip and try to force the
# target phrase onto it
model = whisper.load_model("base")
source = np.random.randn(16000 * 3).astype(np.float32) * 0.01 # 3 sec noise
target = "the password is twelve thirty four"
adv_audio, losses = simple_whisper_attack(model, source, target)
# Verify: transcribe the adversarial audio and compare against the target
result = model.transcribe(adv_audio, language="en", fp16=False)
print(f"Target: '{target}'")
print(f"Actual transcription: '{result['text'].strip()}'")
Exercise 3: Perceptual Quality Evaluation
Measure whether the adversarial audio is detectable by humans:
import numpy as np
from scipy.signal import stft
def compute_snr(original: np.ndarray, adversarial: np.ndarray) -> float:
    """Compute Signal-to-Noise Ratio in dB.

    Treats *original* as the signal and (adversarial - original) as noise;
    returns +inf for a numerically negligible perturbation.
    """
    p_signal = np.mean(original ** 2)
    p_noise = np.mean((adversarial - original) ** 2)
    # Guard against log(0) when the perturbation is effectively zero
    if p_noise < 1e-10:
        return float('inf')
    return 10 * np.log10(p_signal / p_noise)
def compute_spectral_distortion(
    original: np.ndarray,
    adversarial: np.ndarray,
    sr: int = 16000
) -> float:
    """Compute log-spectral distortion (LSD, in dB) between two signals.

    Both signals are transformed with a 512-sample STFT; the RMS of the
    per-bin difference of their power spectra (in dB) is returned.
    """
    db_spectra = []
    for waveform in (original, adversarial):
        _, _, coeffs = stft(waveform, fs=sr, nperseg=512)
        # Small epsilon keeps log10 finite on (near-)silent bins
        db_spectra.append(10 * np.log10(np.abs(coeffs) ** 2 + 1e-10))
    orig_db, adv_db = db_spectra
    return np.sqrt(np.mean((orig_db - adv_db) ** 2))
def evaluate_attack(
original: np.ndarray,
adversarial: np.ndarray,
model,
target_text: str
) -> dict:
"""Comprehensive evaluation of an adversarial audio attack."""
# Transcription success
result = model.transcribe(adversarial, language="en", fp16=False)
transcription = result["text"].strip().lower()
target_lower = target_text.lower()
# Exact match
exact_match = transcription == target_lower
# Partial match (what fraction of target words appear)
target_words = set(target_lower.split())
trans_words = set(transcription.split())
word_overlap = len(target_words & trans_words) / len(target_words) if target_words else 0
# Perceptual metrics
snr = compute_snr(original, adversarial)
lsd = compute_spectral_distortion(original, adversarial)
max_perturbation = np.max(np.abs(adversarial - original))
return {
"target_text": target_text,
"transcription": transcription,
"exact_match": exact_match,
"word_overlap": f"{word_overlap:.1%}",
"snr_db": f"{snr:.1f}",
"log_spectral_distortion": f"{lsd:.2f}",
"max_perturbation": f"{max_perturbation:.4f}",
"imperceptible": snr > 30 and max_perturbation < 0.02
}Exercise 4: Robustness Testing
Test how well adversarial audio survives real-world transformations:
MP3 Compression
def test_mp3_robustness(adv_audio, sr=16000): """Test if adversarial audio survives MP3 compression.""" import io import soundfile as sf # Save as WAV, convert to MP3, load back sf.write("/tmp/adv_test.wav", adv_audio, sr) # Requires ffmpeg import subprocess subprocess.run([ "ffmpeg", "-y", "-i", "/tmp/adv_test.wav", "-b:a", "128k", "/tmp/adv_test.mp3" ], capture_output=True) subprocess.run([ "ffmpeg", "-y", "-i", "/tmp/adv_test.mp3", "/tmp/adv_test_back.wav" ], capture_output=True) compressed, _ = sf.read("/tmp/adv_test_back.wav") return compressedAdditive Noise
def test_noise_robustness(adv_audio, noise_levels=(0.001, 0.005, 0.01)):
    """Test adversarial audio under additive Gaussian noise.

    Args:
        adv_audio: 1-D float32 adversarial waveform.
        noise_levels: Noise standard deviations to try. (Default changed
            from a list to a tuple to avoid the mutable-default pitfall;
            it is only iterated, so callers are unaffected.)

    Returns:
        Mapping of noise level -> Whisper transcription of the noisy audio.

    NOTE(review): relies on the module-level `model` loaded earlier in the
    lab script, and on module-level `np`.
    """
    results = {}
    for noise_level in noise_levels:
        # Add white noise, then clip back into the valid signal range
        noisy = adv_audio + noise_level * np.random.randn(len(adv_audio))
        noisy = np.clip(noisy, -1, 1).astype(np.float32)
        result = model.transcribe(noisy, language="en", fp16=False)
        results[noise_level] = result["text"].strip()
    return results

Resampling
def test_resample_robustness(adv_audio, original_sr=16000, target_srs=[8000, 22050, 44100]): """Test if adversarial audio survives sample rate conversion.""" from scipy.signal import resample results = {} for target_sr in target_srs: ratio = target_sr / original_sr resampled = resample(adv_audio, int(len(adv_audio) * ratio)) back = resample(resampled, len(adv_audio)).astype(np.float32) result = model.transcribe(back, language="en", fp16=False) results[target_sr] = result["text"].strip() return resultsCompile Results
Create a summary table of robustness across all transformations.
Expected Results
| Condition | Expected Success Rate | Notes |
|---|---|---|
| Clean (no transformation) | 70-95% | With sufficient optimization steps |
| MP3 128kbps | 20-50% | Compression destroys fine perturbations |
| Additive noise (SNR 30dB) | 40-70% | Mild noise, moderate degradation |
| Additive noise (SNR 20dB) | 10-30% | Significant degradation |
| Resample 16k->8k->16k | 15-35% | Loses high-frequency perturbation content |
Related Topics
- Adversarial Audio Examples -- theoretical foundations for this lab
- Speech Recognition Attacks -- ASR attack strategies
- Lab: Crafting Image-Based Injections -- parallel lab in the visual domain
References
- "Audio Adversarial Examples: Targeted Attacks on Speech-to-Text" - Carlini & Wagner (2018) - Core attack methodology applied in this lab
- "Robust Speech Recognition via Large-Scale Weak Supervision" - Radford et al. (2022) - Whisper model architecture targeted in lab exercises
- "Imperceptible, Robust, and Targeted Adversarial Examples for Automatic Speech Recognition" - Qin et al. (2019) - Psychoacoustic evaluation metrics used in quality assessment
- "Robust Audio Adversarial Example for a Physical Attack" - Yakura & Sakuma (2019) - Over-the-air robustness testing methodology
Why is MP3 compression a significant challenge for adversarial audio attacks?