@qingy1337
Created January 27, 2026 03:43
Simple Hardware Benchmark in PyTorch (generated by Claude)
#!/usr/bin/env python3
"""
PyTorch GPU Benchmark Script
Benchmarks CUDA, MPS (Apple Silicon), or CPU and outputs a score from 1000-5000.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import gc
import sys
from dataclasses import dataclass
from typing import Tuple


@dataclass
class BenchmarkResult:
    """Stores results from a single benchmark."""
    name: str
    ops_per_second: float
    time_taken: float
    iterations: int


class GPUBenchmark:
    """Comprehensive GPU benchmark suite."""

    # Reference values (recalibrated for ~3000 score on RTX 4070/3080 Ti)
    REFERENCE_SCORES = {
        'matmul': 40e12,       # 40 TFLOPS for matrix multiplication
        'conv2d': 30e12,       # 30 TFLOPS for convolution
        'transformer': 5e12,   # 5 TFLOPS for transformer operations
        'memory': 700e9,       # 700 GB/s for memory bandwidth
        'mixed': 100e12,       # 100 TFLOPS for mixed precision (FP16)
    }
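    # Note (illustrative): the weights in calculate_score sum to 1.0
    # (0.30 + 0.25 + 0.20 + 0.15 + 0.10), so a device hitting exactly these
    # reference numbers on every test lands at an overall score of 3000.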

    def __init__(self):
        self.device = self._get_best_device()
        self.device_name = self._get_device_name()
        self.results: list[BenchmarkResult] = []

    def _get_best_device(self) -> torch.device:
        """Get the best available device: CUDA > MPS > CPU."""
        if torch.cuda.is_available():
            return torch.device('cuda')
        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            return torch.device('mps')
        else:
            return torch.device('cpu')

    def _get_device_name(self) -> str:
        """Get a human-readable device name."""
        if self.device.type == 'cuda':
            return f"CUDA - {torch.cuda.get_device_name(0)}"
        elif self.device.type == 'mps':
            return "Apple Silicon (MPS)"
        else:
            return "CPU"

    def _sync_device(self):
        """Synchronize device for accurate timing."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        elif self.device.type == 'mps':
            torch.mps.synchronize()

    def _cleanup(self):
        """Clean up memory."""
        gc.collect()
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        elif self.device.type == 'mps':
            torch.mps.empty_cache()

    def _warmup(self, func, warmup_iters: int = 5):
        """Warm up the GPU before benchmarking."""
        for _ in range(warmup_iters):
            func()
        self._sync_device()

    def benchmark_matmul(self, size: int = 8192, iterations: int = 100) -> BenchmarkResult:
        """Benchmark matrix multiplication (core GPU operation)."""
        print(" Running matrix multiplication benchmark...")
        # Adjust size based on device capabilities
        if self.device.type == 'cpu':
            size = 2048
            iterations = 20
        elif self.device.type == 'mps':
            size = 4096
            iterations = 50
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)

        def matmul_op():
            torch.mm(a, b)

        self._warmup(matmul_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            matmul_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Calculate FLOPS (2 * N^3 for matrix multiplication)
        flops = 2 * (size ** 3) * iterations
        ops_per_second = flops / elapsed
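        # Worked example (illustrative, default CUDA settings): one
        # 8192x8192 matmul costs 2 * 8192**3 ≈ 1.1e12 FLOPs, so 100
        # iterations finishing in ~2.75 s corresponds to the 40 TFLOPS
        # reference above.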
        del a, b
        self._cleanup()
        return BenchmarkResult("Matrix Multiplication", ops_per_second, elapsed, iterations)

    def benchmark_conv2d(self, batch_size: int = 64, iterations: int = 100) -> BenchmarkResult:
        """Benchmark 2D convolution (essential for CNNs)."""
        print(" Running convolution benchmark...")
        if self.device.type == 'cpu':
            batch_size = 8
            iterations = 20
        elif self.device.type == 'mps':
            batch_size = 32
            iterations = 50
        # Simulate a typical CNN layer with larger feature maps
        input_tensor = torch.randn(batch_size, 128, 256, 256, device=self.device, dtype=torch.float32)
        conv = nn.Conv2d(128, 256, kernel_size=3, padding=1).to(self.device)

        def conv_op():
            conv(input_tensor)

        self._warmup(conv_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            conv_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate FLOPS for conv2d
        output_size = 256 * 256
        flops_per_output = 2 * 128 * 3 * 3  # 2 * in_channels * kernel_size^2
        total_flops = batch_size * 256 * output_size * flops_per_output * iterations
        ops_per_second = total_flops / elapsed
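        # Worked example (illustrative, default CUDA sizes; the bias add is
        # ignored): each forward pass costs about
        # 64 * 256 * 256**2 * 2304 ≈ 2.5e12 FLOPs, so ~82 ms per pass would
        # match the 30 TFLOPS conv reference.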
        del input_tensor, conv
        self._cleanup()
        return BenchmarkResult("2D Convolution", ops_per_second, elapsed, iterations)

    def benchmark_transformer(self, batch_size: int = 32, seq_len: int = 1024,
                              hidden_dim: int = 1024, iterations: int = 50) -> BenchmarkResult:
        """Benchmark transformer-like operations (attention mechanism)."""
        print(" Running transformer attention benchmark...")
        if self.device.type == 'cpu':
            batch_size = 4
            seq_len = 256
            iterations = 10
        elif self.device.type == 'mps':
            batch_size = 16
            seq_len = 512
            iterations = 30
        # Query, Key, Value tensors (16 heads, 64 dims per head by default)
        num_heads = 16
        head_dim = hidden_dim // num_heads
        q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)
        k = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)
        v = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)

        def attention_op():
            # Scaled dot-product attention
            scores = torch.matmul(q, k.transpose(-2, -1)) / (head_dim ** 0.5)
            attn = F.softmax(scores, dim=-1)
            torch.matmul(attn, v)

        self._warmup(attention_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            attention_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate operations (QK^T, softmax, attn*V)
        qk_ops = batch_size * num_heads * seq_len * seq_len * head_dim * 2
        attn_v_ops = batch_size * num_heads * seq_len * seq_len * head_dim * 2
        softmax_ops = batch_size * num_heads * seq_len * seq_len * 5  # approx for exp/sum
        ops = (qk_ops + attn_v_ops + softmax_ops) * iterations
        ops_per_second = ops / elapsed
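        # Worked example (illustrative, default CUDA sizes): the two matmuls
        # dominate at ~2 * 6.9e10 ≈ 1.4e11 FLOPs per iteration. Note this is
        # naive attention: the full seq_len x seq_len score matrix is
        # materialized, and each of the scores/attn tensors is ~2.1 GB in
        # FP32 here, which is why run_all_benchmarks retries with smaller
        # sizes on OOM.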
        del q, k, v
        self._cleanup()
        return BenchmarkResult("Transformer Attention", ops_per_second, elapsed, iterations)

    def benchmark_memory_bandwidth(self, size_mb: int = 512, iterations: int = 100) -> BenchmarkResult:
        """Benchmark memory bandwidth (copy operations)."""
        print(" Running memory bandwidth benchmark...")
        if self.device.type == 'cpu':
            size_mb = 64
            iterations = 30
        elif self.device.type == 'mps':
            size_mb = 256
            iterations = 50
        # float32 = 4 bytes
        num_elements = (size_mb * 1024 * 1024) // 4
        src = torch.randn(num_elements, device=self.device, dtype=torch.float32)
        dst = torch.empty(num_elements, device=self.device, dtype=torch.float32)

        def copy_op():
            dst.copy_(src)

        self._warmup(copy_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            copy_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Calculate bandwidth (read + write)
        bytes_transferred = 2 * size_mb * 1024 * 1024 * iterations
        bytes_per_second = bytes_transferred / elapsed
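        # Worked example (illustrative, default CUDA settings): each copy
        # reads and writes 512 MiB, i.e. 1 GiB of traffic per iteration, so
        # 100 iterations in ~0.15 s would match the 700 GB/s reference.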
        del src, dst
        self._cleanup()
        return BenchmarkResult("Memory Bandwidth", bytes_per_second, elapsed, iterations)

    def benchmark_mixed_precision(self, size: int = 8192, iterations: int = 100) -> BenchmarkResult:
        """Benchmark mixed precision operations (FP16/FP32)."""
        print(" Running mixed precision benchmark...")
        if self.device.type == 'cpu':
            size = 2048
            iterations = 20
        elif self.device.type == 'mps':
            size = 4096
            iterations = 50
        # Use FP16 on CUDA (tensor cores) and MPS; fall back to FP32 on CPU
        dtype = torch.float16 if self.device.type in ['cuda', 'mps'] else torch.float32
        a = torch.randn(size, size, device=self.device, dtype=dtype)
        b = torch.randn(size, size, device=self.device, dtype=dtype)
        c = torch.randn(size, size, device=self.device, dtype=dtype)

        def mixed_op():
            # Chain of operations to stress tensor cores
            d = torch.mm(a, b)
            e = torch.mm(d, c)
            f = torch.relu(e)
            torch.sigmoid(f)

        self._warmup(mixed_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            mixed_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate ops (2 matmuls + relu + sigmoid)
        flops = (2 * 2 * size**3 + 3 * size**2) * iterations
        ops_per_second = flops / elapsed
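        # Worked example (illustrative, default CUDA settings): the two
        # matmuls dominate at 2 * 2 * 8192**3 ≈ 2.2e12 FLOPs per iteration
        # (the elementwise relu/sigmoid term, 3 * 8192**2 ≈ 2e8, is
        # negligible), so ~22 ms per iteration would match the 100 TFLOPS
        # FP16 reference.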
        del a, b, c
        self._cleanup()
        return BenchmarkResult("Mixed Precision Ops", ops_per_second, elapsed, iterations)

    def calculate_score(self) -> Tuple[float, dict]:
        """Calculate final score between 1000-5000 using logarithmic scaling."""
        scores = {}
        weights = {
            'Matrix Multiplication': 0.30,
            '2D Convolution': 0.25,
            'Transformer Attention': 0.20,
            'Memory Bandwidth': 0.15,
            'Mixed Precision Ops': 0.10,
        }
        references = {
            'Matrix Multiplication': self.REFERENCE_SCORES['matmul'],
            '2D Convolution': self.REFERENCE_SCORES['conv2d'],
            'Transformer Attention': self.REFERENCE_SCORES['transformer'],
            'Memory Bandwidth': self.REFERENCE_SCORES['memory'],
            'Mixed Precision Ops': self.REFERENCE_SCORES['mixed'],
        }
        weighted_score = 0
        for result in self.results:
            ref = references.get(result.name, 1e10)
            # Calculate relative performance
            relative_perf = result.ops_per_second / ref
            # Use logarithmic scaling for better distribution:
            # log2(1) = 0 (at reference), log2(2) = 1 (2x reference), log2(4) = 2 (4x reference)
            if relative_perf > 0:
                # Scale so that reference = 3000, 2x = 3750, 4x = 4500, 8x = 5000+
                log_perf = torch.log2(torch.tensor(max(relative_perf, 0.01))).item()
                component_score = 3000 + (log_perf * 750)
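                # Worked example (illustrative): a GPU sustaining 20 TFLOPS
                # on matmul (half the 40 TFLOPS reference) gets a component
                # score of 3000 + log2(0.5) * 750 = 2250. Very slow devices
                # can go negative here; the final weighted score is clamped
                # to 1000 below.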
            else:
                component_score = 1000
            scores[result.name] = component_score
            weight = weights.get(result.name, 0.1)
            weighted_score += component_score * weight
        # Clamp to 1000-5000 range
        final_score = max(1000, min(5000, weighted_score))
        return final_score, scores

    def run_all_benchmarks(self) -> float:
        """Run all benchmarks and return final score."""
        print("=" * 60)
        print("PyTorch GPU Benchmark Suite")
        print("=" * 60)
        print("\nDevice: {0}".format(self.device_name))
        print("PyTorch Version: {0}".format(torch.__version__))
        if self.device.type == 'cuda':
            print("CUDA Version: {0}".format(torch.version.cuda))
            props = torch.cuda.get_device_properties(0)
            print("GPU Memory: {0:.1f} GB".format(props.total_memory / 1024**3))
            print("Compute Capability: {0}.{1}".format(props.major, props.minor))
        print("\n" + "-" * 60)
        print("Running Benchmarks...")
        print("-" * 60 + "\n")
        try:
            self.results.append(self.benchmark_matmul())
            self.results.append(self.benchmark_conv2d())
            self.results.append(self.benchmark_transformer())
            self.results.append(self.benchmark_memory_bandwidth())
            self.results.append(self.benchmark_mixed_precision())
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print("\nWarning: Out of memory error. Retrying with smaller sizes...")
                self._cleanup()
                # Retry with smaller sizes
                self.results = []
                self.results.append(self.benchmark_matmul(size=4096, iterations=50))
                self.results.append(self.benchmark_conv2d(batch_size=16, iterations=50))
                self.results.append(self.benchmark_transformer(batch_size=8, seq_len=512, iterations=30))
                self.results.append(self.benchmark_memory_bandwidth(size_mb=256, iterations=50))
                self.results.append(self.benchmark_mixed_precision(size=4096, iterations=50))
            else:
                raise
        # Display results
        print("\n" + "-" * 60)
        print("Benchmark Results")
        print("-" * 60)
        for result in self.results:
            if "Memory" in result.name:
                throughput = result.ops_per_second / 1e9
                print("\n{0}:".format(result.name))
                print(" Throughput: {0:.2f} GB/s".format(throughput))
            else:
                tflops = result.ops_per_second / 1e12
                print("\n{0}:".format(result.name))
                print(" Throughput: {0:.2f} TFLOPS".format(tflops))
            print(" Time: {0:.3f}s ({1} iterations)".format(result.time_taken, result.iterations))
        # Calculate final score
        final_score, component_scores = self.calculate_score()
        print("\n" + "=" * 60)
        print("FINAL RESULTS")
        print("=" * 60)
        print("\nComponent Scores:")
        for name, score in component_scores.items():
            # Normalize bar to 1000-5000 range
            bar_length = int((score - 1000) / 80)
            bar = "█" * max(0, min(bar_length, 50))
            print(" {0:25s}: {1:7.1f} {2}".format(name, score, bar))
        print("\n" + "=" * 60)
        # Visual score representation
        score_normalized = (final_score - 1000) / 4000  # 0 to 1
        bar_width = 40
        filled = int(score_normalized * bar_width)
        bar = "█" * filled + "░" * (bar_width - filled)
        print("\n FINAL SCORE: {0:.0f}".format(final_score))
        print("\n [1000 |{0}| 5000]".format(bar))
        # Score interpretation
        if final_score >= 4500:
            rating = "EXCEPTIONAL - Top-tier GPU performance"
        elif final_score >= 3500:
            rating = "EXCELLENT - High-end GPU performance"
        elif final_score >= 2500:
            rating = "GOOD - Mid-range GPU performance"
        elif final_score >= 1500:
            rating = "FAIR - Entry-level GPU / Apple Silicon"
        else:
            rating = "BASIC - CPU or integrated graphics"
        print("\n Rating: {0}".format(rating))
        print("\n" + "=" * 60)
        return final_score


def main():
    """Main entry point."""
    try:
        benchmark = GPUBenchmark()
        score = benchmark.run_all_benchmarks()
        print("\nBenchmark completed successfully!")
        print("Your score: {0:.0f}/5000".format(score))
        return 0
    except KeyboardInterrupt:
        print("\n\nBenchmark cancelled by user.")
        return 1
    except Exception as e:
        print("\nError during benchmark: {0}".format(e))
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
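
Usage note: the script's only dependency is PyTorch. Saved locally (the gist does not give a filename; gpu_benchmark.py is just a placeholder), it runs as `python gpu_benchmark.py`, auto-selects CUDA, MPS, or CPU, and prints a score in the 1000-5000 range, exiting with status 0 on success.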