Simple Hardware Benchmark in PyTorch (generated by Claude)

#!/usr/bin/env python3
"""
PyTorch GPU Benchmark Script

Benchmarks CUDA, MPS (Apple Silicon), or CPU and outputs a score from 1000-5000.
"""
import gc
import math
import sys
import time
from dataclasses import dataclass
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class BenchmarkResult:
    """Stores results from a single benchmark."""
    name: str
    ops_per_second: float
    time_taken: float
    iterations: int


class GPUBenchmark:
    """Comprehensive GPU benchmark suite."""

    # Reference values (recalibrated for a ~3000 score on an RTX 4070 / 3080 Ti)
    REFERENCE_SCORES = {
        'matmul': 40e12,      # 40 TFLOPS for matrix multiplication
        'conv2d': 30e12,      # 30 TFLOPS for convolution
        'transformer': 5e12,  # 5 TFLOPS for transformer operations
        'memory': 700e9,      # 700 GB/s for memory bandwidth
        'mixed': 100e12,      # 100 TFLOPS for mixed precision (FP16)
    }
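
    # Note (illustrative): these references act as the 3000-point baseline in
    # calculate_score() below; a device that exactly matches a reference value
    # earns 3000 on that component, and each doubling adds 750 points.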

    def __init__(self):
        self.device = self._get_best_device()
        self.device_name = self._get_device_name()
        self.results: list[BenchmarkResult] = []

    def _get_best_device(self) -> torch.device:
        """Get the best available device: CUDA > MPS > CPU."""
        if torch.cuda.is_available():
            return torch.device('cuda')
        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            return torch.device('mps')
        else:
            return torch.device('cpu')

    def _get_device_name(self) -> str:
        """Get a human-readable device name."""
        if self.device.type == 'cuda':
            return f"CUDA - {torch.cuda.get_device_name(0)}"
        elif self.device.type == 'mps':
            return "Apple Silicon (MPS)"
        else:
            return "CPU"

    def _sync_device(self):
        """Synchronize the device for accurate timing."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        elif self.device.type == 'mps':
            torch.mps.synchronize()
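
    # Note (illustrative): CUDA and MPS kernels launch asynchronously, so without
    # an explicit synchronize() the timing loops below would mostly measure kernel
    # launch overhead instead of actual execution time.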

    def _cleanup(self):
        """Clean up memory."""
        gc.collect()
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        elif self.device.type == 'mps':
            torch.mps.empty_cache()

    def _warmup(self, func, warmup_iters: int = 5):
        """Warm up the GPU before benchmarking."""
        for _ in range(warmup_iters):
            func()
        self._sync_device()
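
    # Note (illustrative): the first few calls on a fresh device can include
    # one-time costs (kernel/JIT compilation, allocator warm-up), so each timed
    # loop starts only after these throwaway iterations.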

    def benchmark_matmul(self, size: int = 8192, iterations: int = 100) -> BenchmarkResult:
        """Benchmark matrix multiplication (core GPU operation)."""
        print(" Running matrix multiplication benchmark...")
        # Adjust size based on device capabilities
        if self.device.type == 'cpu':
            size = 2048
            iterations = 20
        elif self.device.type == 'mps':
            size = 4096
            iterations = 50
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)

        def matmul_op():
            torch.mm(a, b)

        self._warmup(matmul_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            matmul_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Calculate FLOPs (2 * N^3 per matrix multiplication)
        flops = 2 * (size ** 3) * iterations
        ops_per_second = flops / elapsed
        del a, b
        self._cleanup()
        return BenchmarkResult("Matrix Multiplication", ops_per_second, elapsed, iterations)
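
    # Worked example (illustrative): at the CUDA defaults (size=8192,
    # iterations=100), each torch.mm costs 2 * 8192**3 ≈ 1.1e12 FLOPs, so the
    # timed loop executes ≈ 1.1e14 FLOPs in total; at the 40 TFLOPS reference
    # that is ≈ 2.75 s.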

    def benchmark_conv2d(self, batch_size: int = 64, iterations: int = 100) -> BenchmarkResult:
        """Benchmark 2D convolution (essential for CNNs)."""
        print(" Running convolution benchmark...")
        if self.device.type == 'cpu':
            batch_size = 8
            iterations = 20
        elif self.device.type == 'mps':
            batch_size = 32
            iterations = 50
        # Simulate a typical CNN layer with larger feature maps
        input_tensor = torch.randn(batch_size, 128, 256, 256, device=self.device, dtype=torch.float32)
        conv = nn.Conv2d(128, 256, kernel_size=3, padding=1).to(self.device)

        def conv_op():
            conv(input_tensor)

        self._warmup(conv_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            conv_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate FLOPs for conv2d
        output_size = 256 * 256
        flops_per_output = 2 * 128 * 3 * 3  # 2 * in_channels * kernel_size^2
        total_flops = batch_size * 256 * output_size * flops_per_output * iterations
        ops_per_second = total_flops / elapsed
        del input_tensor, conv
        self._cleanup()
        return BenchmarkResult("2D Convolution", ops_per_second, elapsed, iterations)
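
    # Worked example (illustrative): with batch_size=64 the layer above produces
    # 64 * 256 * 256 * 256 ≈ 1.07e9 output values, each costing
    # 2 * 128 * 3 * 3 = 2304 FLOPs, i.e. ≈ 2.5e12 FLOPs per forward pass.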

    def benchmark_transformer(self, batch_size: int = 32, seq_len: int = 1024,
                              iterations: int = 50) -> BenchmarkResult:
        """Benchmark transformer-like operations (attention mechanism)."""
        print(" Running transformer attention benchmark...")
        if self.device.type == 'cpu':
            batch_size = 4
            seq_len = 256
            iterations = 10
        elif self.device.type == 'mps':
            batch_size = 16
            seq_len = 512
            iterations = 30
        # Query, Key, Value projections (16 heads, 64 dims per head)
        num_heads = 16
        head_dim = 64
        q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)
        k = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)
        v = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)

        def attention_op():
            # Scaled dot-product attention
            scores = torch.matmul(q, k.transpose(-2, -1)) / (head_dim ** 0.5)
            attn = F.softmax(scores, dim=-1)
            torch.matmul(attn, v)

        self._warmup(attention_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            attention_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate operations (QK^T, softmax, attn*V)
        qk_ops = batch_size * num_heads * seq_len * seq_len * head_dim * 2
        attn_v_ops = batch_size * num_heads * seq_len * seq_len * head_dim * 2
        softmax_ops = batch_size * num_heads * seq_len * seq_len * 5  # approx for exp/sum
        ops = (qk_ops + attn_v_ops + softmax_ops) * iterations
        ops_per_second = ops / elapsed
        del q, k, v
        self._cleanup()
        return BenchmarkResult("Transformer Attention", ops_per_second, elapsed, iterations)
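
    # Worked example (illustrative): at the CUDA defaults (batch 32, 16 heads,
    # seq_len 1024, head_dim 64), QK^T and attn@V each cost
    # 32 * 16 * 1024**2 * 64 * 2 ≈ 6.9e10 FLOPs and softmax adds ≈ 2.7e9,
    # for roughly 1.4e11 operations per iteration.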

    def benchmark_memory_bandwidth(self, size_mb: int = 512, iterations: int = 100) -> BenchmarkResult:
        """Benchmark memory bandwidth (copy operations)."""
        print(" Running memory bandwidth benchmark...")
        if self.device.type == 'cpu':
            size_mb = 64
            iterations = 30
        elif self.device.type == 'mps':
            size_mb = 256
            iterations = 50
        # float32 = 4 bytes
        num_elements = (size_mb * 1024 * 1024) // 4
        src = torch.randn(num_elements, device=self.device, dtype=torch.float32)
        dst = torch.empty(num_elements, device=self.device, dtype=torch.float32)

        def copy_op():
            dst.copy_(src)

        self._warmup(copy_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            copy_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Calculate bandwidth (read + write)
        bytes_transferred = 2 * size_mb * 1024 * 1024 * iterations
        bytes_per_second = bytes_transferred / elapsed
        del src, dst
        self._cleanup()
        return BenchmarkResult("Memory Bandwidth", bytes_per_second, elapsed, iterations)
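
    # Worked example (illustrative): a 512 MB copy reads the source and writes
    # the destination, so each iteration moves 1 GB; 100 iterations transfer
    # ≈ 107 GB, or ≈ 0.15 s at the 700 GB/s reference bandwidth.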

    def benchmark_mixed_precision(self, size: int = 8192, iterations: int = 100) -> BenchmarkResult:
        """Benchmark mixed precision operations (FP16/FP32)."""
        print(" Running mixed precision benchmark...")
        if self.device.type == 'cpu':
            size = 2048
            iterations = 20
        elif self.device.type == 'mps':
            size = 4096
            iterations = 50
        # Use FP16 for better tensor core utilization on CUDA
        dtype = torch.float16 if self.device.type in ['cuda', 'mps'] else torch.float32
        a = torch.randn(size, size, device=self.device, dtype=dtype)
        b = torch.randn(size, size, device=self.device, dtype=dtype)
        c = torch.randn(size, size, device=self.device, dtype=dtype)

        def mixed_op():
            # Chain of operations to stress tensor cores
            d = torch.mm(a, b)
            e = torch.mm(d, c)
            f = torch.relu(e)
            torch.sigmoid(f)

        self._warmup(mixed_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            mixed_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate FLOPs (2 matmuls + relu + sigmoid)
        flops = (2 * 2 * size**3 + 3 * size**2) * iterations
        ops_per_second = flops / elapsed
        del a, b, c
        self._cleanup()
        return BenchmarkResult("Mixed Precision Ops", ops_per_second, elapsed, iterations)
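
    # Note (illustrative): the FLOP count above is dominated by the two matmuls
    # (2 * 2 * size**3 ≈ 2.2e12 at size=8192); the elementwise relu/sigmoid
    # terms (3 * size**2 ≈ 2e8) are negligible by comparison.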

    def calculate_score(self) -> Tuple[float, dict]:
        """Calculate the final score between 1000-5000 using logarithmic scaling."""
        scores = {}
        weights = {
            'Matrix Multiplication': 0.30,
            '2D Convolution': 0.25,
            'Transformer Attention': 0.20,
            'Memory Bandwidth': 0.15,
            'Mixed Precision Ops': 0.10,
        }
        references = {
            'Matrix Multiplication': self.REFERENCE_SCORES['matmul'],
            '2D Convolution': self.REFERENCE_SCORES['conv2d'],
            'Transformer Attention': self.REFERENCE_SCORES['transformer'],
            'Memory Bandwidth': self.REFERENCE_SCORES['memory'],
            'Mixed Precision Ops': self.REFERENCE_SCORES['mixed'],
        }
        weighted_score = 0
        for result in self.results:
            ref = references.get(result.name, 1e10)
            # Relative performance vs. the reference device
            relative_perf = result.ops_per_second / ref
            # Logarithmic scaling for a better distribution:
            # log2(1) = 0 (at reference), log2(2) = 1 (2x reference), log2(4) = 2 (4x reference).
            # Scaled so reference = 3000, 2x = 3750, 4x = 4500, 8x = 5250
            # (the weighted total is clamped to 5000 below).
            log_perf = math.log2(max(relative_perf, 0.01))
            component_score = 3000 + (log_perf * 750)
            scores[result.name] = component_score
            weight = weights.get(result.name, 0.1)
            weighted_score += component_score * weight
        # Clamp to the 1000-5000 range
        final_score = max(1000, min(5000, weighted_score))
        return final_score, scores
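
    # Worked example (illustrative): a device matching every reference exactly
    # scores 3000 overall (the weights sum to 1.0); one running at 2x every
    # reference scores 3000 + log2(2) * 750 = 3750, and one at half the
    # references scores 2250.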

    def run_all_benchmarks(self) -> float:
        """Run all benchmarks and return the final score."""
        print("=" * 60)
        print("PyTorch GPU Benchmark Suite")
        print("=" * 60)
        print(f"\nDevice: {self.device_name}")
        print(f"PyTorch Version: {torch.__version__}")
        if self.device.type == 'cuda':
            print(f"CUDA Version: {torch.version.cuda}")
            props = torch.cuda.get_device_properties(0)
            print(f"GPU Memory: {props.total_memory / 1024**3:.1f} GB")
            print(f"Compute Capability: {props.major}.{props.minor}")
        print("\n" + "-" * 60)
        print("Running Benchmarks...")
        print("-" * 60 + "\n")
        try:
            self.results.append(self.benchmark_matmul())
            self.results.append(self.benchmark_conv2d())
            self.results.append(self.benchmark_transformer())
            self.results.append(self.benchmark_memory_bandwidth())
            self.results.append(self.benchmark_mixed_precision())
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print("\nWarning: Out of memory error. Retrying with smaller sizes...")
                self._cleanup()
                # Retry with smaller sizes
                self.results = []
                self.results.append(self.benchmark_matmul(size=4096, iterations=50))
                self.results.append(self.benchmark_conv2d(batch_size=16, iterations=50))
                self.results.append(self.benchmark_transformer(batch_size=8, seq_len=512, iterations=30))
                self.results.append(self.benchmark_memory_bandwidth(size_mb=256, iterations=50))
                self.results.append(self.benchmark_mixed_precision(size=4096, iterations=50))
            else:
                raise
        # Display results
        print("\n" + "-" * 60)
        print("Benchmark Results")
        print("-" * 60)
        for result in self.results:
            print(f"\n{result.name}:")
            if "Memory" in result.name:
                print(f" Throughput: {result.ops_per_second / 1e9:.2f} GB/s")
            else:
                print(f" Throughput: {result.ops_per_second / 1e12:.2f} TFLOPS")
            print(f" Time: {result.time_taken:.3f}s ({result.iterations} iterations)")
        # Calculate the final score
        final_score, component_scores = self.calculate_score()
        print("\n" + "=" * 60)
        print("FINAL RESULTS")
        print("=" * 60)
        print("\nComponent Scores:")
        for name, score in component_scores.items():
            # Normalize the bar to the 1000-5000 range
            bar_length = int((score - 1000) / 80)
            bar = "█" * max(0, min(bar_length, 50))
            print(f" {name:25s}: {score:7.1f} {bar}")
        print("\n" + "=" * 60)
        # Visual score representation
        score_normalized = (final_score - 1000) / 4000  # 0 to 1
        bar_width = 40
        filled = int(score_normalized * bar_width)
        bar = "█" * filled + "░" * (bar_width - filled)
        print(f"\n FINAL SCORE: {final_score:.0f}")
        print(f"\n [1000 |{bar}| 5000]")
        # Score interpretation
        if final_score >= 4500:
            rating = "EXCEPTIONAL - Top-tier GPU performance"
        elif final_score >= 3500:
            rating = "EXCELLENT - High-end GPU performance"
        elif final_score >= 2500:
            rating = "GOOD - Mid-range GPU performance"
        elif final_score >= 1500:
            rating = "FAIR - Entry-level GPU / Apple Silicon"
        else:
            rating = "BASIC - CPU or integrated graphics"
        print(f"\n Rating: {rating}")
        print("\n" + "=" * 60)
        return final_score


def main():
    """Main entry point."""
    try:
        benchmark = GPUBenchmark()
        score = benchmark.run_all_benchmarks()
        print("\nBenchmark completed successfully!")
        print(f"Your score: {score:.0f}/5000")
        return 0
    except KeyboardInterrupt:
        print("\n\nBenchmark cancelled by user.")
        return 1
    except Exception as e:
        print(f"\nError during benchmark: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
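
A minimal usage sketch (assuming the file is saved as gpu_benchmark.py on the import path; the filename is arbitrary):

import gpu_benchmark

bench = gpu_benchmark.GPUBenchmark()
score = bench.run_all_benchmarks()  # prints the full report and returns the score
print(f"Score: {score:.0f}/5000")

Running the file directly (python gpu_benchmark.py) goes through main() instead and exits with status 0 on success.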