Lab: GPU Side-Channel Attacks
Demonstrate information leakage through GPU memory residuals and timing side channels, showing how shared GPU infrastructure can expose sensitive model data.
Prerequisites
- NVIDIA GPU with CUDA support
- Understanding of GPU memory architecture and CUDA programming basics
- Python with torch, pynvml, and numpy
- Familiarity with side-channel attack concepts
pip install torch pynvml numpy matplotlib
Background
Side-channel attacks on GPUs exploit the shared nature of GPU hardware in multi-tenant environments. When multiple users or processes share a GPU, memory residuals from one process can leak into another, and timing variations during inference can reveal information about inputs or model architecture.
Lab Exercises
Detect GPU Memory Residuals
Demonstrate that GPU memory retains data after deallocation.
#!/usr/bin/env python3
"""Demonstrate GPU memory residual data leakage."""

import torch
import numpy as np


def demonstrate_memory_residual():
    """Show that GPU memory retains data after deallocation.

    Writes a known pattern to GPU memory, frees it, then reads back an
    uninitialized allocation of the same size to look for residual data.
    """
    if not torch.cuda.is_available():
        print("CUDA not available. This lab requires an NVIDIA GPU.")
        return

    device = torch.device("cuda:0")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # BUG FIX: the device-properties attribute is `total_memory`;
    # `total_mem` does not exist and raised AttributeError.
    print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Phase 1: Write a known pattern to GPU memory
    print("\n=== Phase 1: Write sensitive data ===")
    secret_data = torch.full((1024, 1024), 42.0, device=device)
    secret_ptr = secret_data.data_ptr()
    print(f" Written secret value (42.0) at address {hex(secret_ptr)}")
    print(f" Sample values: {secret_data[0, :5].tolist()}")

    # Phase 2: Delete the tensor (but do NOT clear memory)
    del secret_data
    # Note: torch.cuda.empty_cache() releases memory to the CUDA allocator
    # but does NOT zero it
    torch.cuda.empty_cache()
    print("\n=== Phase 2: Tensor deleted, cache emptied ===")

    # Phase 3: Allocate a new tensor of the same size (likely same address)
    print("\n=== Phase 3: Allocate new tensor ===")
    # Use an uninitialized allocation to read residual memory
    new_tensor = torch.empty((1024, 1024), device=device)
    new_ptr = new_tensor.data_ptr()
    print(f" New allocation at address {hex(new_ptr)}")

    # Check whether residual data is present
    residual_values = new_tensor[0, :10].tolist()
    print(f" Residual values: {residual_values}")

    # Count how many cells still hold the secret value (42.0)
    matches = (new_tensor == 42.0).sum().item()
    total = new_tensor.numel()
    print(f" Cells matching secret value: {matches}/{total} "
          f"({matches/total*100:.2f}%)")
    if matches > 0:
        print("\n ** FINDING: Residual data from previous allocation detected **")
        print(" In a shared GPU environment, this could leak data between tenants.")


def demonstrate_model_weight_residual():
    """Show that model weights persist in GPU memory after model deletion."""
    if not torch.cuda.is_available():
        return

    device = torch.device("cuda:0")

    # Load a small model and fill its weights with a recognizable constant
    print("\n=== Model Weight Residual Test ===")
    model = torch.nn.Linear(256, 256).to(device)
    with torch.no_grad():
        model.weight.fill_(3.14)
    weight_sample = model.weight[0, :5].tolist()
    print(f" Model weight sample: {weight_sample}")

    # Delete the model without zeroing its memory
    del model
    torch.cuda.empty_cache()
    print(" Model deleted, cache emptied")

    # Try to recover weights from residual memory
    probe = torch.empty((256, 256), device=device)
    recovered = (probe == 3.14).sum().item()
    print(f" Residual weight matches: {recovered}/{probe.numel()}")
    if recovered > 0:
        print(" ** Model weight residuals detected in GPU memory **")


if __name__ == "__main__":
    demonstrate_memory_residual()
    demonstrate_model_weight_residual()

# Usage: python memory_residual.py
#
# (Next exercise) Measure Inference Timing Side Channels
Demonstrate that inference timing reveals information about input characteristics.
#!/usr/bin/env python3
"""Measure inference timing side channels."""

import torch
import time
import numpy as np


def _time_inference(model, x, iters=100):
    """Return per-call latencies in milliseconds over `iters` runs.

    Synchronizes before and after each forward pass so the measurement
    covers the full GPU execution, not just the kernel launch.
    """
    timings = []
    for _ in range(iters):
        torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            model(x)
        torch.cuda.synchronize()
        timings.append((time.perf_counter() - start) * 1000)
    return timings


def measure_inference_timing():
    """Show how inference timing varies with input characteristics."""
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return

    device = torch.device("cuda:0")
    model = torch.nn.Sequential(
        torch.nn.Linear(512, 1024),
        torch.nn.ReLU(),
        torch.nn.Linear(1024, 512),
        torch.nn.ReLU(),
        torch.nn.Linear(512, 10),
    ).to(device).eval()

    print("=== Inference Timing Analysis ===\n")

    # Test 1: batch size (leaks through compute time proportional to batch)
    print("--- Batch Size Timing ---")
    for batch_size in [1, 4, 16, 64, 256]:
        x = torch.randn(batch_size, 512, device=device)
        # Warmup so one-time kernel/shape setup doesn't skew the timings
        with torch.no_grad():
            model(x)
        torch.cuda.synchronize()
        timings = _time_inference(model, x)
        avg = np.mean(timings)
        std = np.std(timings)
        print(f" batch_size={batch_size:>3}: {avg:.3f}ms +/- {std:.3f}ms")

    # Test 2: input magnitude (measures whether value range affects timing)
    print("\n--- Input Magnitude Timing ---")
    for magnitude in [0.001, 0.1, 1.0, 10.0, 1000.0]:
        x = torch.randn(32, 512, device=device) * magnitude
        timings = _time_inference(model, x)
        avg = np.mean(timings)
        print(f" magnitude={magnitude:>8.3f}: {avg:.3f}ms")

    # Test 3: input sparsity (relevant to models with conditional computation)
    print("\n--- Input Sparsity Timing ---")
    for sparsity in [0.0, 0.5, 0.9, 0.99, 1.0]:
        x = torch.randn(32, 512, device=device)
        mask = torch.rand(32, 512, device=device) > sparsity
        x = x * mask.float()
        timings = _time_inference(model, x)
        avg = np.mean(timings)
        print(f" sparsity={sparsity:.2f}: {avg:.3f}ms")


if __name__ == "__main__":
    measure_inference_timing()

# Usage: python timing_side_channel.py
#
# Observe how inference timing varies with input characteristics, which could
# reveal information about what a model is processing.
Monitor GPU Utilization Patterns
Use NVML to monitor GPU utilization patterns that leak information about workload characteristics.
#!/usr/bin/env python3
"""Monitor GPU utilization patterns for information leakage."""

import time

import pynvml


def monitor_gpu(duration_seconds: int = 10, interval_ms: int = 100):
    """Sample GPU utilization, memory, power, and temperature via NVML.

    Polls device 0 at `interval_ms` for `duration_seconds`, then prints a
    summary and a count of utilization bursts (low->high transitions),
    which reveal when and how often inference runs on a shared GPU.
    Returns the list of collected sample dicts.
    """
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    name = pynvml.nvmlDeviceGetName(handle)
    print(f"Monitoring GPU: {name}")
    print(f"Duration: {duration_seconds}s, Interval: {interval_ms}ms\n")

    samples = []
    start = time.time()
    while time.time() - start < duration_seconds:
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            power = pynvml.nvmlDeviceGetPowerUsage(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            sample = {
                "timestamp": time.time() - start,
                "gpu_util": util.gpu,
                "mem_util": util.memory,
                "mem_used_mb": mem_info.used / 1e6,
                "power_w": power / 1000,  # NVML reports milliwatts
                "temp_c": temp,
            }
            samples.append(sample)
        except pynvml.NVMLError:
            # Best-effort monitoring: skip transient NVML read failures
            pass
        time.sleep(interval_ms / 1000)

    pynvml.nvmlShutdown()

    # Analyze patterns
    print("=== GPU Utilization Analysis ===")
    if samples:
        gpu_utils = [s["gpu_util"] for s in samples]
        mem_utils = [s["mem_used_mb"] for s in samples]
        powers = [s["power_w"] for s in samples]
        print(f" GPU Util: min={min(gpu_utils)}% max={max(gpu_utils)}% "
              f"avg={sum(gpu_utils)/len(gpu_utils):.1f}%")
        print(f" Memory: min={min(mem_utils):.0f}MB max={max(mem_utils):.0f}MB")
        print(f" Power: min={min(powers):.0f}W max={max(powers):.0f}W")

        # Detect inference bursts: a jump from idle (<20%) to busy (>50%)
        bursts = 0
        for i in range(1, len(gpu_utils)):
            if gpu_utils[i] > 50 and gpu_utils[i-1] < 20:
                bursts += 1
        print(f"\n Detected inference bursts: {bursts}")
        print(" (Burst patterns reveal when and how often inference occurs)")
    return samples


if __name__ == "__main__":
    print("Start an inference workload in another terminal to observe patterns.\n")
    monitor_gpu(duration_seconds=15)

# Usage: python gpu_monitor.py
#
# (Next exercise) Implement GPU Memory Sanitization
Build defenses that prevent memory residual leakage.
#!/usr/bin/env python3
"""GPU memory sanitization to prevent residual data leakage."""

import torch
import time


def sanitize_gpu_memory():
    """Fill all allocatable GPU memory with zeros, then release it.

    Overwriting free memory prevents a later allocation (potentially by
    another tenant) from reading residual data out of uninitialized buffers.
    """
    if not torch.cuda.is_available():
        return

    device = torch.device("cuda:0")
    # BUG FIX: removed the dead `mem = ...total_mem` lookup. The attribute
    # is `total_memory` (`total_mem` raised AttributeError on any CUDA
    # machine), and the value was never used.

    # Allocate and zero as much memory as possible
    start = time.time()
    chunk_size = 256 * 1024 * 1024  # 256 MB chunks
    chunks = []
    try:
        # torch.zeros writes zeros into every byte we manage to allocate
        while True:
            chunk = torch.zeros(chunk_size // 4, device=device,
                                dtype=torch.float32)
            chunks.append(chunk)
    except RuntimeError:
        pass  # Out of memory -- we've filled and zeroed all available memory

    # Release the zeroed chunks back to the allocator
    del chunks
    torch.cuda.empty_cache()
    elapsed = time.time() - start
    print(f"GPU memory sanitized in {elapsed:.2f}s")


def secure_model_unload(model):
    """Securely unload a model by zeroing weights before deletion.

    The in-place zeroing is the actual mitigation; `del model` only drops
    this function's local reference (the caller may still hold one).
    """
    for param in model.parameters():
        param.data.zero_()
    for buffer in model.buffers():
        buffer.zero_()
    del model
    torch.cuda.empty_cache()
    print("Model securely unloaded (weights zeroed)")


def verify_sanitization():
    """Verify that sanitization prevents memory residual recovery."""
    if not torch.cuda.is_available():
        return

    device = torch.device("cuda:0")

    # Write a secret, sanitize, then probe uninitialized memory for it
    secret = torch.full((1024, 1024), 99.0, device=device)
    del secret
    sanitize_gpu_memory()

    probe = torch.empty((1024, 1024), device=device)
    matches = (probe == 99.0).sum().item()
    print(f"Post-sanitization residual matches: {matches}/{probe.numel()}")
    if matches == 0:
        print("Sanitization effective: no residual data found")
    else:
        print("WARNING: Residual data still present after sanitization")


if __name__ == "__main__":
    print("=== GPU Memory Sanitization ===\n")
    sanitize_gpu_memory()
    verify_sanitization()

# Usage: python gpu_sanitize.py
Troubleshooting
| Issue | Solution |
|---|---|
| No CUDA available | Install NVIDIA drivers and CUDA toolkit or use Google Colab |
| pynvml import errors | Install with pip install pynvml (also called nvidia-ml-py3) |
| Memory residuals not detected | Results depend on GPU allocator behavior; try different allocation sizes |
| Timing measurements too noisy | Increase warmup iterations and measurement count |
Related Topics
- Inference Server Exploitation - Infrastructure-level attacks on model serving
- Container Breakout - Escaping container isolation to access GPU hardware
- Training Data Extraction - Model-level data extraction complementing hardware-level
- Model Serving Attacks - Broader serving framework attack surface
References
- "GPU Memory Exploitation: Practical Side-Channel Attacks on GPU Memory" - Naghibijouybari et al. (2018) - GPU side-channel attack methodology
- "Rendered Insecure: GPU Side Channel Attacks are Practical" - Naghibijouybari et al. (2018) - Demonstrated practical GPU side channels
- "Grand Pwning Unit: Accelerating Microarchitectural Attacks with the GPU" - Frigo et al. (2018) - GPU-based attacks on CPU memory
- "CUDA Leaks: A Detailed Hack for CUDA and a (Partial) Fix" - Pietro et al. (2016) - GPU memory isolation vulnerabilities in CUDA
Why does GPU memory create a higher risk of data residual leakage compared to CPU memory?