@qingy1337
Created January 27, 2026 03:43
Simple Hardware Benchmark in PyTorch (generated by Claude)
#!/usr/bin/env python3
"""
PyTorch GPU Benchmark Script
Benchmarks CUDA, MPS (Apple Silicon), or CPU and outputs a score from 1000-5000.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import gc
import sys
from dataclasses import dataclass
from typing import Tuple


@dataclass
class BenchmarkResult:
    """Stores results from a single benchmark."""
    name: str
    ops_per_second: float
    time_taken: float
    iterations: int


class GPUBenchmark:
    """Comprehensive GPU benchmark suite."""

    # Reference values (recalibrated for ~3000 score on RTX 4070/3080 Ti)
    REFERENCE_SCORES = {
        'matmul': 40e12,       # 40 TFLOPS for matrix multiplication
        'conv2d': 30e12,       # 30 TFLOPS for convolution
        'transformer': 5e12,   # 5 TFLOPS for transformer operations
        'memory': 700e9,       # 700 GB/s for memory bandwidth
        'mixed': 100e12,       # 100 TFLOPS for mixed precision (FP16)
    }
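    # Note (illustrative): the weights in calculate_score sum to 1.0
    # (0.30 + 0.25 + 0.20 + 0.15 + 0.10), so a device hitting exactly these
    # reference numbers on every test lands at an overall score of 3000.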

    def __init__(self):
        self.device = self._get_best_device()
        self.device_name = self._get_device_name()
        self.results: list[BenchmarkResult] = []

    def _get_best_device(self) -> torch.device:
        """Get the best available device: CUDA > MPS > CPU."""
        if torch.cuda.is_available():
            return torch.device('cuda')
        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            return torch.device('mps')
        else:
            return torch.device('cpu')

    def _get_device_name(self) -> str:
        """Get a human-readable device name."""
        if self.device.type == 'cuda':
            return f"CUDA - {torch.cuda.get_device_name(0)}"
        elif self.device.type == 'mps':
            return "Apple Silicon (MPS)"
        else:
            return "CPU"

    def _sync_device(self):
        """Synchronize device for accurate timing."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        elif self.device.type == 'mps':
            torch.mps.synchronize()

    def _cleanup(self):
        """Clean up memory."""
        gc.collect()
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        elif self.device.type == 'mps':
            torch.mps.empty_cache()

    def _warmup(self, func, warmup_iters: int = 5):
        """Warm up the GPU before benchmarking."""
        for _ in range(warmup_iters):
            func()
        self._sync_device()

    def benchmark_matmul(self, size: int = 8192, iterations: int = 100) -> BenchmarkResult:
        """Benchmark matrix multiplication (core GPU operation)."""
        print(" Running matrix multiplication benchmark...")
        # Adjust size based on device capabilities
        if self.device.type == 'cpu':
            size = 2048
            iterations = 20
        elif self.device.type == 'mps':
            size = 4096
            iterations = 50
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)

        def matmul_op():
            torch.mm(a, b)

        self._warmup(matmul_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            matmul_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Calculate FLOPS (2 * N^3 for matrix multiplication)
        flops = 2 * (size ** 3) * iterations
        ops_per_second = flops / elapsed
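        # Worked example (illustrative, default CUDA settings): one
        # 8192x8192 matmul costs 2 * 8192**3 ≈ 1.1e12 FLOPs, so 100
        # iterations finishing in ~2.75 s corresponds to the 40 TFLOPS
        # reference above.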
        del a, b
        self._cleanup()
        return BenchmarkResult("Matrix Multiplication", ops_per_second, elapsed, iterations)

    def benchmark_conv2d(self, batch_size: int = 64, iterations: int = 100) -> BenchmarkResult:
        """Benchmark 2D convolution (essential for CNNs)."""
        print(" Running convolution benchmark...")
        if self.device.type == 'cpu':
            batch_size = 8
            iterations = 20
        elif self.device.type == 'mps':
            batch_size = 32
            iterations = 50
        # Simulate a typical CNN layer with larger feature maps
        input_tensor = torch.randn(batch_size, 128, 256, 256, device=self.device, dtype=torch.float32)
        conv = nn.Conv2d(128, 256, kernel_size=3, padding=1).to(self.device)

        def conv_op():
            conv(input_tensor)

        self._warmup(conv_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            conv_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate FLOPS for conv2d
        output_size = 256 * 256
        flops_per_output = 2 * 128 * 3 * 3  # 2 * in_channels * kernel_size^2
        total_flops = batch_size * 256 * output_size * flops_per_output * iterations
        ops_per_second = total_flops / elapsed
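        # Worked example (illustrative, default CUDA sizes; the bias add is
        # ignored): each forward pass costs about
        # 64 * 256 * 256**2 * 2304 ≈ 2.5e12 FLOPs, so ~82 ms per pass would
        # match the 30 TFLOPS conv reference.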
        del input_tensor, conv
        self._cleanup()
        return BenchmarkResult("2D Convolution", ops_per_second, elapsed, iterations)

    def benchmark_transformer(self, batch_size: int = 32, seq_len: int = 1024,
                              hidden_dim: int = 1024, iterations: int = 50) -> BenchmarkResult:
        """Benchmark transformer-like operations (attention mechanism)."""
        print(" Running transformer attention benchmark...")
        if self.device.type == 'cpu':
            batch_size = 4
            seq_len = 256
            iterations = 10
        elif self.device.type == 'mps':
            batch_size = 16
            seq_len = 512
            iterations = 30
        # Query, Key, Value tensors (16 heads, 64 dims per head by default)
        num_heads = 16
        head_dim = hidden_dim // num_heads
        q = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)
        k = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)
        v = torch.randn(batch_size, num_heads, seq_len, head_dim, device=self.device, dtype=torch.float32)

        def attention_op():
            # Scaled dot-product attention
            scores = torch.matmul(q, k.transpose(-2, -1)) / (head_dim ** 0.5)
            attn = F.softmax(scores, dim=-1)
            torch.matmul(attn, v)

        self._warmup(attention_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            attention_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate operations (QK^T, softmax, attn*V)
        qk_ops = batch_size * num_heads * seq_len * seq_len * head_dim * 2
        attn_v_ops = batch_size * num_heads * seq_len * seq_len * head_dim * 2
        softmax_ops = batch_size * num_heads * seq_len * seq_len * 5  # approx for exp/sum
        ops = (qk_ops + attn_v_ops + softmax_ops) * iterations
        ops_per_second = ops / elapsed
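        # Worked example (illustrative, default CUDA sizes): the two matmuls
        # dominate at ~2 * 6.9e10 ≈ 1.4e11 FLOPs per iteration. Note this is
        # naive attention: the full seq_len x seq_len score matrix is
        # materialized, and each of the scores/attn tensors is ~2.1 GB in
        # FP32 here, which is why run_all_benchmarks retries with smaller
        # sizes on OOM.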
        del q, k, v
        self._cleanup()
        return BenchmarkResult("Transformer Attention", ops_per_second, elapsed, iterations)

    def benchmark_memory_bandwidth(self, size_mb: int = 512, iterations: int = 100) -> BenchmarkResult:
        """Benchmark memory bandwidth (copy operations)."""
        print(" Running memory bandwidth benchmark...")
        if self.device.type == 'cpu':
            size_mb = 64
            iterations = 30
        elif self.device.type == 'mps':
            size_mb = 256
            iterations = 50
        # float32 = 4 bytes
        num_elements = (size_mb * 1024 * 1024) // 4
        src = torch.randn(num_elements, device=self.device, dtype=torch.float32)
        dst = torch.empty(num_elements, device=self.device, dtype=torch.float32)

        def copy_op():
            dst.copy_(src)

        self._warmup(copy_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            copy_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Calculate bandwidth (read + write)
        bytes_transferred = 2 * size_mb * 1024 * 1024 * iterations
        bytes_per_second = bytes_transferred / elapsed
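        # Worked example (illustrative, default CUDA settings): each copy
        # reads and writes 512 MiB, i.e. 1 GiB of traffic per iteration, so
        # 100 iterations in ~0.15 s would match the 700 GB/s reference.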
        del src, dst
        self._cleanup()
        return BenchmarkResult("Memory Bandwidth", bytes_per_second, elapsed, iterations)

    def benchmark_mixed_precision(self, size: int = 8192, iterations: int = 100) -> BenchmarkResult:
        """Benchmark mixed precision operations (FP16/FP32)."""
        print(" Running mixed precision benchmark...")
        if self.device.type == 'cpu':
            size = 2048
            iterations = 20
        elif self.device.type == 'mps':
            size = 4096
            iterations = 50
        # Use FP16 on CUDA (tensor cores) and MPS; fall back to FP32 on CPU
        dtype = torch.float16 if self.device.type in ['cuda', 'mps'] else torch.float32
        a = torch.randn(size, size, device=self.device, dtype=dtype)
        b = torch.randn(size, size, device=self.device, dtype=dtype)
        c = torch.randn(size, size, device=self.device, dtype=dtype)

        def mixed_op():
            # Chain of operations to stress tensor cores
            d = torch.mm(a, b)
            e = torch.mm(d, c)
            f = torch.relu(e)
            torch.sigmoid(f)

        self._warmup(mixed_op)
        self._sync_device()
        start = time.perf_counter()
        for _ in range(iterations):
            mixed_op()
        self._sync_device()
        elapsed = time.perf_counter() - start
        # Approximate ops (2 matmuls + relu + sigmoid)
        flops = (2 * 2 * size**3 + 3 * size**2) * iterations
        ops_per_second = flops / elapsed
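        # Worked example (illustrative, default CUDA settings): the two
        # matmuls dominate at 2 * 2 * 8192**3 ≈ 2.2e12 FLOPs per iteration
        # (the elementwise relu/sigmoid term, 3 * 8192**2 ≈ 2e8, is
        # negligible), so ~22 ms per iteration would match the 100 TFLOPS
        # FP16 reference.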
        del a, b, c
        self._cleanup()
        return BenchmarkResult("Mixed Precision Ops", ops_per_second, elapsed, iterations)

    def calculate_score(self) -> Tuple[float, dict]:
        """Calculate final score between 1000-5000 using logarithmic scaling."""
        scores = {}
        weights = {
            'Matrix Multiplication': 0.30,
            '2D Convolution': 0.25,
            'Transformer Attention': 0.20,
            'Memory Bandwidth': 0.15,
            'Mixed Precision Ops': 0.10,
        }
        references = {
            'Matrix Multiplication': self.REFERENCE_SCORES['matmul'],
            '2D Convolution': self.REFERENCE_SCORES['conv2d'],
            'Transformer Attention': self.REFERENCE_SCORES['transformer'],
            'Memory Bandwidth': self.REFERENCE_SCORES['memory'],
            'Mixed Precision Ops': self.REFERENCE_SCORES['mixed'],
        }
        weighted_score = 0
        for result in self.results:
            ref = references.get(result.name, 1e10)
            # Calculate relative performance
            relative_perf = result.ops_per_second / ref
            # Use logarithmic scaling for better distribution:
            # log2(1) = 0 (at reference), log2(2) = 1 (2x reference), log2(4) = 2 (4x reference)
            if relative_perf > 0:
                # Scale so that reference = 3000, 2x = 3750, 4x = 4500, 8x = 5000+
                log_perf = torch.log2(torch.tensor(max(relative_perf, 0.01))).item()
                component_score = 3000 + (log_perf * 750)
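                # Worked example (illustrative): a GPU sustaining 20 TFLOPS
                # on matmul (half the 40 TFLOPS reference) gets a component
                # score of 3000 + log2(0.5) * 750 = 2250. Very slow devices
                # can go negative here; the final weighted score is clamped
                # to 1000 below.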
            else:
                component_score = 1000
            scores[result.name] = component_score
            weight = weights.get(result.name, 0.1)
            weighted_score += component_score * weight
        # Clamp to 1000-5000 range
        final_score = max(1000, min(5000, weighted_score))
        return final_score, scores

    def run_all_benchmarks(self) -> float:
        """Run all benchmarks and return final score."""
        print("=" * 60)
        print("PyTorch GPU Benchmark Suite")
        print("=" * 60)
        print("\nDevice: {0}".format(self.device_name))
        print("PyTorch Version: {0}".format(torch.__version__))
        if self.device.type == 'cuda':
            print("CUDA Version: {0}".format(torch.version.cuda))
            props = torch.cuda.get_device_properties(0)
            print("GPU Memory: {0:.1f} GB".format(props.total_memory / 1024**3))
            print("Compute Capability: {0}.{1}".format(props.major, props.minor))
        print("\n" + "-" * 60)
        print("Running Benchmarks...")
        print("-" * 60 + "\n")
        try:
            self.results.append(self.benchmark_matmul())
            self.results.append(self.benchmark_conv2d())
            self.results.append(self.benchmark_transformer())
            self.results.append(self.benchmark_memory_bandwidth())
            self.results.append(self.benchmark_mixed_precision())
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print("\nWarning: Out of memory error. Retrying with smaller sizes...")
                self._cleanup()
                # Retry with smaller sizes
                self.results = []
                self.results.append(self.benchmark_matmul(size=4096, iterations=50))
                self.results.append(self.benchmark_conv2d(batch_size=16, iterations=50))
                self.results.append(self.benchmark_transformer(batch_size=8, seq_len=512, iterations=30))
                self.results.append(self.benchmark_memory_bandwidth(size_mb=256, iterations=50))
                self.results.append(self.benchmark_mixed_precision(size=4096, iterations=50))
            else:
                raise
        # Display results
        print("\n" + "-" * 60)
        print("Benchmark Results")
        print("-" * 60)
        for result in self.results:
            if "Memory" in result.name:
                throughput = result.ops_per_second / 1e9
                print("\n{0}:".format(result.name))
                print(" Throughput: {0:.2f} GB/s".format(throughput))
            else:
                tflops = result.ops_per_second / 1e12
                print("\n{0}:".format(result.name))
                print(" Throughput: {0:.2f} TFLOPS".format(tflops))
            print(" Time: {0:.3f}s ({1} iterations)".format(result.time_taken, result.iterations))
        # Calculate final score
        final_score, component_scores = self.calculate_score()
        print("\n" + "=" * 60)
        print("FINAL RESULTS")
        print("=" * 60)
        print("\nComponent Scores:")
        for name, score in component_scores.items():
            # Normalize bar to 1000-5000 range
            bar_length = int((score - 1000) / 80)
            bar = "█" * max(0, min(bar_length, 50))
            print(" {0:25s}: {1:7.1f} {2}".format(name, score, bar))
        print("\n" + "=" * 60)
        # Visual score representation
        score_normalized = (final_score - 1000) / 4000  # 0 to 1
        bar_width = 40
        filled = int(score_normalized * bar_width)
        bar = "█" * filled + "░" * (bar_width - filled)
        print("\n FINAL SCORE: {0:.0f}".format(final_score))
        print("\n [1000 |{0}| 5000]".format(bar))
        # Score interpretation
        if final_score >= 4500:
            rating = "EXCEPTIONAL - Top-tier GPU performance"
        elif final_score >= 3500:
            rating = "EXCELLENT - High-end GPU performance"
        elif final_score >= 2500:
            rating = "GOOD - Mid-range GPU performance"
        elif final_score >= 1500:
            rating = "FAIR - Entry-level GPU / Apple Silicon"
        else:
            rating = "BASIC - CPU or integrated graphics"
        print("\n Rating: {0}".format(rating))
        print("\n" + "=" * 60)
        return final_score


def main():
    """Main entry point."""
    try:
        benchmark = GPUBenchmark()
        score = benchmark.run_all_benchmarks()
        print("\nBenchmark completed successfully!")
        print("Your score: {0:.0f}/5000".format(score))
        return 0
    except KeyboardInterrupt:
        print("\n\nBenchmark cancelled by user.")
        return 1
    except Exception as e:
        print("\nError during benchmark: {0}".format(e))
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())
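
Usage note: the script's only dependency is PyTorch. Saved locally (the gist does not give a filename; gpu_benchmark.py is just a placeholder), it runs as `python gpu_benchmark.py`, auto-selects CUDA, MPS, or CPU, and prints a score in the 1000-5000 range, exiting with status 0 on success.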