Created
August 16, 2025 18:56
-
-
Save redwrasse/5085d7fb6bec3a9d0d6ee4b5219587c6 to your computer and use it in GitHub Desktop.
Claude Code generated test of Pytorch CTC Loss forward pass comparisons - R^n vs log-simplex input space.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Claude-code generated script that compares the forward pass loss generated by CUDA native, CUDNN, and CPU native CTC loss implementations | |
| # for both R^n random inputs and log-simplex random inputs | |
| import torch | |
| import torch.nn as nn | |
| import torch.profiler | |
| import numpy as np | |
| from typing import Tuple, List, Optional | |
| import warnings | |
def create_ctc_inputs_random(
    batch_size: int,
    num_classes: int,
    input_length: int,
    target_length: int,
    device: str = 'cuda',
    dtype: torch.dtype = torch.float32
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Build a CTC batch whose activations are unnormalized Gaussian noise.

    Returns a 4-tuple ``(activations, targets, input_lengths, target_lengths)``:

    * ``activations`` -- ``torch.randn`` tensor of shape
      ``(input_length, batch_size, num_classes)``; it is NOT passed through
      ``log_softmax``, so rows are arbitrary points in R^n.
    * ``targets`` -- 1-D int32 tensor holding ``batch_size`` label sequences
      of ``target_length`` entries each, laid out back to back.
    * ``input_lengths`` -- int32 tensor of ``batch_size`` copies of
      ``input_length``.
    * ``target_lengths`` -- int32 tensor of ``batch_size`` copies of
      ``target_length``.
    """
    # (T, N, C) activations sampled straight from a standard normal.
    activations = torch.randn(
        input_length, batch_size, num_classes, device=device, dtype=dtype
    )

    # One label sequence per batch element; labels start at 1 so that
    # index 0 (the blank) never appears in a target.
    per_sample_labels = [
        torch.randint(1, num_classes, (target_length,), device=device, dtype=torch.int32)
        for _ in range(batch_size)
    ]
    # Flatten into the concatenated 1-D layout.
    flat_targets = torch.cat(per_sample_labels).to(device)

    # Every sequence shares the same target length.
    target_lengths = torch.tensor(
        [target_length] * batch_size, device=device, dtype=torch.int32
    )
    # Every sequence uses the full T time steps.
    input_lengths = torch.full(
        (batch_size,), input_length, device=device, dtype=torch.int32
    )

    return activations, flat_targets, input_lengths, target_lengths
def create_ctc_inputs_log_simplex(
    batch_size: int,
    num_classes: int,
    input_length: int,
    target_length: int,
    device: str = 'cuda',
    dtype: torch.dtype = torch.float32
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Build a CTC batch whose activations are logs of points on the simplex.

    Returns a 4-tuple ``(log_probs, targets, input_lengths, target_lengths)``:

    * ``log_probs`` -- shape ``(input_length, batch_size, num_classes)``;
      each class vector is the elementwise log of a sample from a
      Dirichlet distribution with all concentrations equal to 1.
    * ``targets`` -- 1-D int32 tensor holding ``batch_size`` label sequences
      of ``target_length`` entries each, laid out back to back.
    * ``input_lengths`` -- int32 tensor of ``batch_size`` copies of
      ``input_length``.
    * ``target_lengths`` -- int32 tensor of ``batch_size`` copies of
      ``target_length``.
    """
    # Dirichlet(1, ..., 1) draws probability vectors; sampling is done with
    # default-device tensors and moved to `device` after taking the log.
    concentration = torch.ones(num_classes)
    dirichlet = torch.distributions.Dirichlet(concentration)
    probabilities = dirichlet.sample((input_length, batch_size))
    log_probs = probabilities.log().to(device=device, dtype=dtype)

    # One label sequence per batch element; labels start at 1 so that
    # index 0 (the blank) never appears in a target.
    per_sample_labels = [
        torch.randint(1, num_classes, (target_length,), device=device, dtype=torch.int32)
        for _ in range(batch_size)
    ]
    # Flatten into the concatenated 1-D layout.
    flat_targets = torch.cat(per_sample_labels).to(device)

    # Every sequence shares the same target length.
    target_lengths = torch.tensor(
        [target_length] * batch_size, device=device, dtype=torch.int32
    )
    # Every sequence uses the full T time steps.
    input_lengths = torch.full(
        (batch_size,), input_length, device=device, dtype=torch.int32
    )

    return log_probs, flat_targets, input_lengths, target_lengths
def _profile_mentions_cudnn(prof) -> bool:
    """Return True if any event recorded by *prof* has 'cudnn' in its string form."""
    return any('cudnn' in str(event).lower() for event in prof.events())


def _profiled_ctc_forward(blank, reduction, log_probs, targets, input_lengths, target_lengths):
    """Run one CTCLoss forward pass under the CUDA profiler; return (loss, cudnn_seen)."""
    criterion = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True
    ) as prof:
        loss = criterion(log_probs, targets, input_lengths, target_lengths)
    return loss, _profile_mentions_cudnn(prof)


def compare_ctc_forward_losses(
    batch_size: int = 4,
    num_classes: int = 10,
    input_length: int = 50,
    target_length: int = 30,
    blank: int = 0,
    reduction: str = 'mean',
    input_type: str = 'random'
) -> dict:
    """Compare CTC forward pass losses across CUDNN, CUDA native, and CPU implementations.

    One batch is generated on the 'cuda' device and the same data is fed to
    ``nn.CTCLoss`` three times: on CUDA with default cuDNN flags, on CUDA with
    cuDNN disabled, and on CPU copies of the tensors.

    Args:
        batch_size: Number of sequences in the batch.
        num_classes: Size of the class dimension, including the blank.
        input_length: Number of time steps per sequence.
        target_length: Number of labels per target sequence.
        blank: Blank index passed to ``nn.CTCLoss``. The generators draw
            labels from ``[1, num_classes)``, so only 0 is excluded from the
            targets regardless of this value.
        reduction: Reduction mode passed to ``nn.CTCLoss``.
        input_type: ``'random'`` for unnormalized Gaussian activations or
            ``'log_simplex'`` for logs of Dirichlet samples.

    Returns:
        A dict with the three losses (``cudnn_loss``, ``cuda_native_loss``,
        ``cpu_loss``), whether the profiler saw a cuDNN event in each CUDA run
        (``cudnn_kernel_used``, ``cuda_native_kernel_used``), the pairwise
        absolute differences (``diff_*``) and the differences relative to the
        cuDNN-path loss (``relative_diff_*``).

    Raises:
        ValueError: If ``target_length`` exceeds 256, if ``input_type`` is not
            one of the two supported values, or if ``target_length`` exceeds
            ``input_length``.
    """
    # Ensure target_length satisfies CUDNN constraint (≤ 256)
    if target_length > 256:
        raise ValueError(f"target_length {target_length} exceeds CUDNN limit of 256")
    # Reject typos explicitly instead of silently running the 'random' case.
    if input_type not in ('random', 'log_simplex'):
        raise ValueError(
            f"input_type must be 'random' or 'log_simplex', got {input_type!r}"
        )
    # A target longer than the input has no valid CTC alignment; because the
    # losses below use zero_infinity=True, every backend would then report 0.0
    # and the comparison would be trivially (and misleadingly) exact.
    if target_length > input_length:
        raise ValueError(
            f"target_length {target_length} exceeds input_length {input_length}; "
            "no valid CTC alignment exists"
        )

    print(f"Testing with batch_size={batch_size}, num_classes={num_classes}, "
          f"input_length={input_length}, target_length={target_length}, input_type={input_type}")

    # Create inputs based on type
    if input_type == 'log_simplex':
        make_inputs = create_ctc_inputs_log_simplex
    else:  # 'random'
        make_inputs = create_ctc_inputs_random
    log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda = make_inputs(
        batch_size, num_classes, input_length, target_length, device='cuda', dtype=torch.float32
    )

    # CPU copies of the very same data, so all three runs see identical values.
    log_probs_cpu = log_probs_cuda.detach().cpu()
    targets_cpu = targets_cuda.cpu()
    input_lengths_cpu = input_lengths_cuda.cpu()
    target_lengths_cpu = target_lengths_cuda.cpu()

    results = {}

    # Test CUDNN implementation (default backend flags; the profiler tells us
    # whether a cuDNN kernel was actually selected).
    print("Testing CUDNN implementation...")
    loss_cudnn, cudnn_kernel_used = _profiled_ctc_forward(
        blank, reduction, log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda
    )
    results['cudnn_kernel_used'] = cudnn_kernel_used
    results['cudnn_loss'] = loss_cudnn.item()
    print(f"CUDNN kernel used: {cudnn_kernel_used}")
    print(f"CUDNN loss: {loss_cudnn.item()}")

    # Test CUDA native implementation (force no CUDNN)
    print("Testing CUDA native implementation...")
    log_probs_cuda_native = log_probs_cuda.detach()
    with torch.backends.cudnn.flags(enabled=False):
        loss_cuda_native, cuda_native_kernel_used = _profiled_ctc_forward(
            blank, reduction, log_probs_cuda_native, targets_cuda,
            input_lengths_cuda, target_lengths_cuda
        )
    # Despite the key name, this flag records whether a cuDNN event showed up
    # in the run that was supposed to avoid cuDNN (expected: False).
    results['cuda_native_kernel_used'] = cuda_native_kernel_used
    results['cuda_native_loss'] = loss_cuda_native.item()
    print(f"CUDA native (no CUDNN) kernel used CUDNN: {cuda_native_kernel_used}")
    print(f"CUDA native loss: {loss_cuda_native.item()}")

    # Test CPU implementation
    print("Testing CPU implementation...")
    ctc_loss_cpu = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
    loss_cpu = ctc_loss_cpu(log_probs_cpu, targets_cpu, input_lengths_cpu, target_lengths_cpu)
    results['cpu_loss'] = loss_cpu.item()
    print(f"CPU loss: {loss_cpu.item()}")

    # Compare losses
    cudnn_loss = results['cudnn_loss']
    cuda_native_loss = results['cuda_native_loss']
    cpu_loss = results['cpu_loss']

    # Absolute differences
    cudnn_vs_cuda_native_diff = abs(cudnn_loss - cuda_native_loss)
    cudnn_vs_cpu_diff = abs(cudnn_loss - cpu_loss)
    cuda_native_vs_cpu_diff = abs(cuda_native_loss - cpu_loss)
    results['diff_cudnn_vs_cuda_native'] = cudnn_vs_cuda_native_diff
    results['diff_cudnn_vs_cpu'] = cudnn_vs_cpu_diff
    results['diff_cuda_native_vs_cpu'] = cuda_native_vs_cpu_diff

    # Relative differences (use CUDNN loss as reference); guard the division
    # when the reference loss is numerically zero.
    if abs(cudnn_loss) > 1e-10:
        results['relative_diff_cudnn_vs_cuda_native'] = cudnn_vs_cuda_native_diff / abs(cudnn_loss)
        results['relative_diff_cudnn_vs_cpu'] = cudnn_vs_cpu_diff / abs(cudnn_loss)
    else:
        results['relative_diff_cudnn_vs_cuda_native'] = float('inf') if cudnn_vs_cuda_native_diff > 0 else 0
        results['relative_diff_cudnn_vs_cpu'] = float('inf') if cudnn_vs_cpu_diff > 0 else 0

    print(f"\nLoss Comparison Results:")
    print(f"Diff CUDNN vs CUDA native: {results['diff_cudnn_vs_cuda_native']:.2e}")
    print(f"Diff CUDNN vs CPU: {results['diff_cudnn_vs_cpu']:.2e}")
    print(f"Diff CUDA native vs CPU: {results['diff_cuda_native_vs_cpu']:.2e}")
    print(f"Relative diff CUDNN vs CUDA native: {results['relative_diff_cudnn_vs_cuda_native']:.2e}")
    print(f"Relative diff CUDNN vs CPU: {results['relative_diff_cudnn_vs_cpu']:.2e}")

    return results
def run_comprehensive_tests():
    """Run tests with various parameter configurations and input types.

    Every configuration is run once per input type through
    ``compare_ctc_forward_losses``. A configuration that raises is reported
    and skipped. Afterwards a per-input-type summary is printed, based only on
    the runs in which the profiler actually saw a cuDNN kernel.

    Returns None. Prints a notice and returns immediately when CUDA is not
    available.
    """
    if not torch.cuda.is_available():
        print("CUDA not available, skipping tests")
        return

    print("=== Comprehensive CUDNN CTC Forward Loss Tests ===\n")

    test_configs = [
        # (batch_size, num_classes, input_length, target_length)
        (2, 5, 10, 5),        # Small test
        (4, 10, 50, 30),      # Medium test
        (8, 20, 100, 60),     # Larger test
        (1, 10, 200, 150),    # Long target length test
        # CTC needs input_length >= target_length (plus one extra frame per
        # repeated label), so the input must be comfortably longer than the
        # 256-label target; otherwise no alignment exists and the loss is
        # reported as 0.0 on every backend.
        (3, 15, 512, 256),    # Maximum target length for CUDNN
    ]
    input_types = ['random', 'log_simplex']

    all_results = []
    for input_type in input_types:
        print(f"\n=== Testing with {input_type.upper()} inputs ===")
        for i, (batch_size, num_classes, input_length, target_length) in enumerate(test_configs):
            print(f"\n--- Test {i+1}/{len(test_configs)} ({input_type}) ---")
            try:
                results = compare_ctc_forward_losses(
                    batch_size, num_classes, input_length, target_length, input_type=input_type
                )
            except Exception as e:
                # Top-level boundary: one failing configuration must not abort
                # the remaining runs.
                print(f"Test failed: {e}")
                continue
            results['config'] = (batch_size, num_classes, input_length, target_length)
            results['input_type'] = input_type
            all_results.append(results)
            print("-" * 50)

    # Summary analysis
    print("\n=== SUMMARY ANALYSIS ===")
    if not all_results:
        print("No successful tests to analyze")
        return

    # Thresholds on the absolute loss difference used to label each run.
    large_diff_threshold = 1e-3
    precision_threshold = 1e-5

    for input_type in input_types:
        results = [r for r in all_results if r['input_type'] == input_type]
        if not results:
            continue
        print(f"\n--- {input_type.upper()} Input Analysis ---")

        # Only runs where a cuDNN kernel was observed say anything about
        # cuDNN; in the others the "cuDNN" loss came from another kernel and
        # would inflate the count of exact matches.
        cudnn_results = [r for r in results if r['cudnn_kernel_used']]
        print(f"CUDNN kernel used in {len(cudnn_results)}/{len(results)} tests")
        if not cudnn_results:
            print("WARNING: CUDNN kernel was never used! Check implementation constraints.")
            continue

        # Analyze loss differences
        cudnn_cpu_diffs = [r['diff_cudnn_vs_cpu'] for r in cudnn_results]
        cudnn_cuda_diffs = [r['diff_cudnn_vs_cuda_native'] for r in cudnn_results]
        print(f"CUDNN vs CPU loss differences: {min(cudnn_cpu_diffs):.2e} to {max(cudnn_cpu_diffs):.2e}")
        print(f"CUDNN vs CUDA native loss differences: {min(cudnn_cuda_diffs):.2e} to {max(cudnn_cuda_diffs):.2e}")

        # Determine if differences are likely precision vs mathematical errors
        large_diffs = [d for d in cudnn_cpu_diffs if d > large_diff_threshold]
        precision_diffs = [d for d in cudnn_cpu_diffs if d <= precision_threshold]
        print(f"Differences > {large_diff_threshold} (likely mathematical errors): {len(large_diffs)}")
        print(f"Differences ≤ {precision_threshold} (likely precision issues): {len(precision_diffs)}")

        # NOTE(review): nn.CTCLoss documents its input as log-probabilities
        # (e.g. from log_softmax). The 'random' inputs are unnormalized, so a
        # large gap there may reflect different handling of out-of-contract
        # input rather than a wrong kernel -- confirm before relying on the
        # first conclusion below.
        if large_diffs:
            print("CONCLUSION: CUDNN implementation likely has mathematical errors")
        elif len(precision_diffs) == len(cudnn_cpu_diffs):
            print("CONCLUSION: Differences likely due to precision issues only")
        else:
            print("CONCLUSION: Mixed results - some precision, some potentially mathematical issues")
# Script entry point: run the full comparison suite only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    run_comprehensive_tests()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| === Comprehensive CUDNN CTC Forward Loss Tests === | |
| === Testing with RANDOM inputs === | |
| --- Test 1/5 (random) --- | |
| Testing with batch_size=2, num_classes=5, input_length=10, target_length=5, input_type=random | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 2.2869324684143066 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: -1.884143590927124 | |
| Testing CPU implementation... | |
| CPU loss: -1.884143590927124 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 4.17e+00 | |
| Diff CUDNN vs CPU: 4.17e+00 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 1.82e+00 | |
| Relative diff CUDNN vs CPU: 1.82e+00 | |
| -------------------------------------------------- | |
| --- Test 2/5 (random) --- | |
| Testing with batch_size=4, num_classes=10, input_length=50, target_length=30, input_type=random | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 2.8773937225341797 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: -1.6434334516525269 | |
| Testing CPU implementation... | |
| CPU loss: -1.6434335708618164 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 4.52e+00 | |
| Diff CUDNN vs CPU: 4.52e+00 | |
| Diff CUDA native vs CPU: 1.19e-07 | |
| Relative diff CUDNN vs CUDA native: 1.57e+00 | |
| Relative diff CUDNN vs CPU: 1.57e+00 | |
| -------------------------------------------------- | |
| --- Test 3/5 (random) --- | |
| Testing with batch_size=8, num_classes=20, input_length=100, target_length=60, input_type=random | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 3.8359289169311523 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: -1.9019420146942139 | |
| Testing CPU implementation... | |
| CPU loss: -1.9019420146942139 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 5.74e+00 | |
| Diff CUDNN vs CPU: 5.74e+00 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 1.50e+00 | |
| Relative diff CUDNN vs CPU: 1.50e+00 | |
| -------------------------------------------------- | |
| --- Test 4/5 (random) --- | |
| Testing with batch_size=1, num_classes=10, input_length=200, target_length=150, input_type=random | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 2.4502780437469482 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: -1.1409305334091187 | |
| Testing CPU implementation... | |
| CPU loss: -1.1409305334091187 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 3.59e+00 | |
| Diff CUDNN vs CPU: 3.59e+00 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 1.47e+00 | |
| Relative diff CUDNN vs CPU: 1.47e+00 | |
| -------------------------------------------------- | |
| --- Test 5/5 (random) --- | |
| Testing with batch_size=3, num_classes=15, input_length=80, target_length=256, input_type=random | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: False | |
| CUDNN loss: 0.0 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: 0.0 | |
| Testing CPU implementation... | |
| CPU loss: 0.0 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 0.00e+00 | |
| Diff CUDNN vs CPU: 0.00e+00 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 0.00e+00 | |
| Relative diff CUDNN vs CPU: 0.00e+00 | |
| -------------------------------------------------- | |
| === Testing with LOG_SIMPLEX inputs === | |
| --- Test 1/5 (log_simplex) --- | |
| Testing with batch_size=2, num_classes=5, input_length=10, target_length=5, input_type=log_simplex | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 1.802871584892273 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: 1.8028714656829834 | |
| Testing CPU implementation... | |
| CPU loss: 1.8028714656829834 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 1.19e-07 | |
| Diff CUDNN vs CPU: 1.19e-07 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 6.61e-08 | |
| Relative diff CUDNN vs CPU: 6.61e-08 | |
| -------------------------------------------------- | |
| --- Test 2/5 (log_simplex) --- | |
| Testing with batch_size=4, num_classes=10, input_length=50, target_length=30, input_type=log_simplex | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 2.71065616607666 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: 2.71065616607666 | |
| Testing CPU implementation... | |
| CPU loss: 2.7106564044952393 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 0.00e+00 | |
| Diff CUDNN vs CPU: 2.38e-07 | |
| Diff CUDA native vs CPU: 2.38e-07 | |
| Relative diff CUDNN vs CUDA native: 0.00e+00 | |
| Relative diff CUDNN vs CPU: 8.80e-08 | |
| -------------------------------------------------- | |
| --- Test 3/5 (log_simplex) --- | |
| Testing with batch_size=8, num_classes=20, input_length=100, target_length=60, input_type=log_simplex | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 3.7439799308776855 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: 3.7439794540405273 | |
| Testing CPU implementation... | |
| CPU loss: 3.7439799308776855 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 4.77e-07 | |
| Diff CUDNN vs CPU: 0.00e+00 | |
| Diff CUDA native vs CPU: 4.77e-07 | |
| Relative diff CUDNN vs CUDA native: 1.27e-07 | |
| Relative diff CUDNN vs CPU: 0.00e+00 | |
| -------------------------------------------------- | |
| --- Test 4/5 (log_simplex) --- | |
| Testing with batch_size=1, num_classes=10, input_length=200, target_length=150, input_type=log_simplex | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: True | |
| CUDNN loss: 2.4955265522003174 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: 2.4955267906188965 | |
| Testing CPU implementation... | |
| CPU loss: 2.4955267906188965 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 2.38e-07 | |
| Diff CUDNN vs CPU: 2.38e-07 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 9.55e-08 | |
| Relative diff CUDNN vs CPU: 9.55e-08 | |
| -------------------------------------------------- | |
| --- Test 5/5 (log_simplex) --- | |
| Testing with batch_size=3, num_classes=15, input_length=80, target_length=256, input_type=log_simplex | |
| Testing CUDNN implementation... | |
| CUDNN kernel used: False | |
| CUDNN loss: 0.0 | |
| Testing CUDA native implementation... | |
| CUDA native (no CUDNN) kernel used CUDNN: False | |
| CUDA native loss: 0.0 | |
| Testing CPU implementation... | |
| CPU loss: 0.0 | |
| Loss Comparison Results: | |
| Diff CUDNN vs CUDA native: 0.00e+00 | |
| Diff CUDNN vs CPU: 0.00e+00 | |
| Diff CUDA native vs CPU: 0.00e+00 | |
| Relative diff CUDNN vs CUDA native: 0.00e+00 | |
| Relative diff CUDNN vs CPU: 0.00e+00 | |
| -------------------------------------------------- | |
| === SUMMARY ANALYSIS === | |
| --- RANDOM Input Analysis --- | |
| CUDNN kernel used in 4/5 tests | |
| CUDNN vs CPU loss differences: 0.00e+00 to 5.74e+00 | |
| CUDNN vs CUDA native loss differences: 0.00e+00 to 5.74e+00 | |
| Differences > 0.001 (likely mathematical errors): 4 | |
| Differences ≤ 1e-05 (likely precision issues): 1 | |
| CONCLUSION: CUDNN implementation likely has mathematical errors | |
| --- LOG_SIMPLEX Input Analysis --- | |
| CUDNN kernel used in 4/5 tests | |
| CUDNN vs CPU loss differences: 0.00e+00 to 2.38e-07 | |
| CUDNN vs CUDA native loss differences: 0.00e+00 to 4.77e-07 | |
| Differences > 0.001 (likely mathematical errors): 0 | |
| Differences ≤ 1e-05 (likely precision issues): 5 | |
| CONCLUSION: Differences likely due to precision issues only |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment