@redwrasse
Created August 16, 2025 18:56
Claude Code-generated test of PyTorch CTC loss forward-pass comparisons - R^n vs. log-simplex input space.
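The distinction under test is whether each time step forms a valid log-probability distribution over the classes: exponentiating a log-simplex row sums to 1, while a raw R^n row generally does not. A minimal illustration of that difference (separate from the generated script below):

import torch

# Raw R^n row: generally not a normalized log-probability distribution.
raw = torch.randn(5)
print(raw.exp().sum())            # arbitrary positive value, almost surely != 1

# Log-simplex row: log of a point on the probability simplex (sampled via Dirichlet).
simplex = torch.distributions.Dirichlet(torch.ones(5)).sample()
print(simplex.log().exp().sum())  # ~1.0 up to floating-point error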
# Claude Code-generated script that compares the forward-pass loss produced by the CUDNN, CUDA native, and CPU native CTC loss implementations
# for both random R^n inputs and random log-simplex inputs
import torch
import torch.nn as nn
import torch.profiler
import numpy as np
from typing import Tuple, List, Optional
import warnings

def create_ctc_inputs_random(
    batch_size: int,
    num_classes: int,
    input_length: int,
    target_length: int,
    device: str = 'cuda',
    dtype: torch.dtype = torch.float32
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create CTC inputs with random R^n inputs"""
    # Raw inputs (T, N, C) - random from R^n
    raw_inputs = torch.randn(input_length, batch_size, num_classes,
                             device=device, dtype=dtype)
    # Targets in concatenated format for CUDNN
    targets = []
    target_lengths_list = []
    for _ in range(batch_size):
        # Generate random target sequence (excluding blank=0)
        target_seq = torch.randint(1, num_classes, (target_length,), device=device, dtype=torch.int32)
        targets.append(target_seq)
        target_lengths_list.append(target_length)
    # Concatenate targets for CUDNN format
    targets = torch.cat(targets).to(device)
    # Input lengths - all must be T for CUDNN (input_length for each batch)
    input_lengths = torch.full((batch_size,), input_length, device=device, dtype=torch.int32)
    target_lengths = torch.tensor(target_lengths_list, device=device, dtype=torch.int32)
    return raw_inputs, targets, input_lengths, target_lengths

def create_ctc_inputs_log_simplex(
    batch_size: int,
    num_classes: int,
    input_length: int,
    target_length: int,
    device: str = 'cuda',
    dtype: torch.dtype = torch.float32
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create CTC inputs with log-simplex distributed inputs"""
    # Generate random simplex points then take log
    # Sample from Dirichlet distribution (uniform over simplex)
    alpha = torch.ones(num_classes)
    simplex_samples = torch.distributions.Dirichlet(alpha).sample((input_length, batch_size))
    # Take log to get log-simplex inputs
    raw_inputs = torch.log(simplex_samples).to(device=device, dtype=dtype)
    # Targets in concatenated format for CUDNN
    targets = []
    target_lengths_list = []
    for _ in range(batch_size):
        # Generate random target sequence (excluding blank=0)
        target_seq = torch.randint(1, num_classes, (target_length,), device=device, dtype=torch.int32)
        targets.append(target_seq)
        target_lengths_list.append(target_length)
    # Concatenate targets for CUDNN format
    targets = torch.cat(targets).to(device)
    # Input lengths - all must be T for CUDNN (input_length for each batch)
    input_lengths = torch.full((batch_size,), input_length, device=device, dtype=torch.int32)
    target_lengths = torch.tensor(target_lengths_list, device=device, dtype=torch.int32)
    return raw_inputs, targets, input_lengths, target_lengths

def compare_ctc_forward_losses(
    batch_size: int = 4,
    num_classes: int = 10,
    input_length: int = 50,
    target_length: int = 30,
    blank: int = 0,
    reduction: str = 'mean',
    input_type: str = 'random'
) -> dict:
    """Compare CTC forward pass losses across CUDNN, CUDA native, and CPU implementations"""
    results = {}
    # Ensure target_length satisfies CUDNN constraint (≤ 256)
    if target_length > 256:
        raise ValueError(f"target_length {target_length} exceeds CUDNN limit of 256")
    print(f"Testing with batch_size={batch_size}, num_classes={num_classes}, "
          f"input_length={input_length}, target_length={target_length}, input_type={input_type}")

    # Create inputs based on type
    if input_type == 'log_simplex':
        log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda = create_ctc_inputs_log_simplex(
            batch_size, num_classes, input_length, target_length, device='cuda', dtype=torch.float32
        )
    else:  # 'random'
        log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda = create_ctc_inputs_random(
            batch_size, num_classes, input_length, target_length, device='cuda', dtype=torch.float32
        )
    # Create inputs for CPU (copy to CPU)
    log_probs_cpu = log_probs_cuda.detach().cpu()
    targets_cpu = targets_cuda.cpu()
    input_lengths_cpu = input_lengths_cuda.cpu()
    target_lengths_cpu = target_lengths_cuda.cpu()

    # Test CUDNN implementation
    print("Testing CUDNN implementation...")
    ctc_loss_cudnn = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True
    ) as prof_cudnn:
        loss_cudnn = ctc_loss_cudnn(log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda)
    # Check if CUDNN kernel was used
    cudnn_kernel_used = any('cudnn' in str(event).lower() for event in prof_cudnn.events())
    results['cudnn_kernel_used'] = cudnn_kernel_used
    results['cudnn_loss'] = loss_cudnn.item()
    print(f"CUDNN kernel used: {cudnn_kernel_used}")
    print(f"CUDNN loss: {loss_cudnn.item()}")

    # Test CUDA native implementation (force no CUDNN)
    print("Testing CUDA native implementation...")
    log_probs_cuda_native = log_probs_cuda.detach()
    # Disable CUDNN for this test
    with torch.backends.cudnn.flags(enabled=False):
        ctc_loss_cuda_native = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
        with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CUDA],
            record_shapes=True
        ) as prof_cuda_native:
            loss_cuda_native = ctc_loss_cuda_native(log_probs_cuda_native, targets_cuda, input_lengths_cuda, target_lengths_cuda)
    # Verify no CUDNN kernel was used
    cuda_native_kernel_used = any('cudnn' in str(event).lower() for event in prof_cuda_native.events())
    results['cuda_native_kernel_used'] = cuda_native_kernel_used
    results['cuda_native_loss'] = loss_cuda_native.item()
    print(f"CUDA native (no CUDNN) kernel used CUDNN: {cuda_native_kernel_used}")
    print(f"CUDA native loss: {loss_cuda_native.item()}")

    # Test CPU implementation
    print("Testing CPU implementation...")
    ctc_loss_cpu = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
    loss_cpu = ctc_loss_cpu(log_probs_cpu, targets_cpu, input_lengths_cpu, target_lengths_cpu)
    results['cpu_loss'] = loss_cpu.item()
    print(f"CPU loss: {loss_cpu.item()}")

    # Compare losses
    cudnn_loss = results['cudnn_loss']
    cuda_native_loss = results['cuda_native_loss']
    cpu_loss = results['cpu_loss']
    # Compute differences
    cudnn_vs_cuda_native_diff = abs(cudnn_loss - cuda_native_loss)
    cudnn_vs_cpu_diff = abs(cudnn_loss - cpu_loss)
    cuda_native_vs_cpu_diff = abs(cuda_native_loss - cpu_loss)
    results['diff_cudnn_vs_cuda_native'] = cudnn_vs_cuda_native_diff
    results['diff_cudnn_vs_cpu'] = cudnn_vs_cpu_diff
    results['diff_cuda_native_vs_cpu'] = cuda_native_vs_cpu_diff
    # Relative differences (use CUDNN loss as reference)
    if abs(cudnn_loss) > 1e-10:
        results['relative_diff_cudnn_vs_cuda_native'] = cudnn_vs_cuda_native_diff / abs(cudnn_loss)
        results['relative_diff_cudnn_vs_cpu'] = cudnn_vs_cpu_diff / abs(cudnn_loss)
    else:
        results['relative_diff_cudnn_vs_cuda_native'] = float('inf') if cudnn_vs_cuda_native_diff > 0 else 0
        results['relative_diff_cudnn_vs_cpu'] = float('inf') if cudnn_vs_cpu_diff > 0 else 0

    print("\nLoss Comparison Results:")
    print(f"Diff CUDNN vs CUDA native: {results['diff_cudnn_vs_cuda_native']:.2e}")
    print(f"Diff CUDNN vs CPU: {results['diff_cudnn_vs_cpu']:.2e}")
    print(f"Diff CUDA native vs CPU: {results['diff_cuda_native_vs_cpu']:.2e}")
    print(f"Relative diff CUDNN vs CUDA native: {results['relative_diff_cudnn_vs_cuda_native']:.2e}")
    print(f"Relative diff CUDNN vs CPU: {results['relative_diff_cudnn_vs_cpu']:.2e}")
    return results

def run_comprehensive_tests():
    """Run tests with various parameter configurations and input types"""
    if not torch.cuda.is_available():
        print("CUDA not available, skipping tests")
        return
    print("=== Comprehensive CUDNN CTC Forward Loss Tests ===\n")
    test_configs = [
        # (batch_size, num_classes, input_length, target_length)
        (2, 5, 10, 5),       # Small test
        (4, 10, 50, 30),     # Medium test
        (8, 20, 100, 60),    # Larger test
        (1, 10, 200, 150),   # Long target length test
        (3, 15, 80, 256),    # Maximum target length for CUDNN
    ]
    input_types = ['random', 'log_simplex']
    all_results = []
    for input_type in input_types:
        print(f"\n=== Testing with {input_type.upper()} inputs ===")
        for i, (batch_size, num_classes, input_length, target_length) in enumerate(test_configs):
            print(f"\n--- Test {i+1}/{len(test_configs)} ({input_type}) ---")
            try:
                results = compare_ctc_forward_losses(
                    batch_size, num_classes, input_length, target_length, input_type=input_type
                )
                results['config'] = (batch_size, num_classes, input_length, target_length)
                results['input_type'] = input_type
                all_results.append(results)
            except Exception as e:
                print(f"Test failed: {e}")
                continue
            print("-" * 50)

    # Summary analysis
    print("\n=== SUMMARY ANALYSIS ===")
    if not all_results:
        print("No successful tests to analyze")
        return
    # Group results by input type
    random_results = [r for r in all_results if r['input_type'] == 'random']
    log_simplex_results = [r for r in all_results if r['input_type'] == 'log_simplex']
    for input_type, results in [('random', random_results), ('log_simplex', log_simplex_results)]:
        if not results:
            continue
        print(f"\n--- {input_type.upper()} Input Analysis ---")
        # Check if CUDNN was actually used in any test
        cudnn_used_count = sum(1 for r in results if r['cudnn_kernel_used'])
        print(f"CUDNN kernel used in {cudnn_used_count}/{len(results)} tests")
        if cudnn_used_count == 0:
            print("WARNING: CUDNN kernel was never used! Check implementation constraints.")
        # Analyze loss differences
        cudnn_cpu_diffs = [r['diff_cudnn_vs_cpu'] for r in results]
        cudnn_cuda_diffs = [r['diff_cudnn_vs_cuda_native'] for r in results]
        print(f"CUDNN vs CPU loss differences: {min(cudnn_cpu_diffs):.2e} to {max(cudnn_cpu_diffs):.2e}")
        print(f"CUDNN vs CUDA native loss differences: {min(cudnn_cuda_diffs):.2e} to {max(cudnn_cuda_diffs):.2e}")
        # Determine if differences are likely precision vs mathematical errors
        large_diff_threshold = 1e-3
        precision_threshold = 1e-5
        large_diffs = [d for d in cudnn_cpu_diffs if d > large_diff_threshold]
        precision_diffs = [d for d in cudnn_cpu_diffs if d <= precision_threshold]
        print(f"Differences > {large_diff_threshold} (likely mathematical errors): {len(large_diffs)}")
        print(f"Differences ≤ {precision_threshold} (likely precision issues): {len(precision_diffs)}")
        if len(large_diffs) > 0:
            print("CONCLUSION: CUDNN implementation likely has mathematical errors")
        elif len(precision_diffs) == len(cudnn_cpu_diffs):
            print("CONCLUSION: Differences likely due to precision issues only")
        else:
            print("CONCLUSION: Mixed results - some precision, some potentially mathematical issues")


if __name__ == "__main__":
    run_comprehensive_tests()
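For quick experimentation, here is a minimal standalone sketch of the same three-way comparison on properly normalized inputs (log_softmax of random logits); it is not part of the generated script and assumes a CUDA device with cuDNN available:

import torch
import torch.nn as nn

T, N, C, S = 50, 4, 10, 30  # input length, batch size, num classes, target length
log_probs = torch.randn(T, N, C, device='cuda').log_softmax(dim=2)
targets = torch.randint(1, C, (N * S,), device='cuda', dtype=torch.int32)  # concatenated targets
input_lengths = torch.full((N,), T, device='cuda', dtype=torch.int32)
target_lengths = torch.full((N,), S, device='cuda', dtype=torch.int32)

ctc = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
loss_cudnn = ctc(log_probs, targets, input_lengths, target_lengths)        # cuDNN path, if eligible
with torch.backends.cudnn.flags(enabled=False):
    loss_native = ctc(log_probs, targets, input_lengths, target_lengths)   # native CUDA kernel
loss_cpu = ctc(log_probs.cpu(), targets.cpu(), input_lengths.cpu(), target_lengths.cpu())
print(loss_cudnn.item(), loss_native.item(), loss_cpu.item())

The output of a full run of the generated script follows.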
=== Comprehensive CUDNN CTC Forward Loss Tests ===
=== Testing with RANDOM inputs ===
--- Test 1/5 (random) ---
Testing with batch_size=2, num_classes=5, input_length=10, target_length=5, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.2869324684143066
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.884143590927124
Testing CPU implementation...
CPU loss: -1.884143590927124
Loss Comparison Results:
Diff CUDNN vs CUDA native: 4.17e+00
Diff CUDNN vs CPU: 4.17e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 1.82e+00
Relative diff CUDNN vs CPU: 1.82e+00
--------------------------------------------------
--- Test 2/5 (random) ---
Testing with batch_size=4, num_classes=10, input_length=50, target_length=30, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.8773937225341797
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.6434334516525269
Testing CPU implementation...
CPU loss: -1.6434335708618164
Loss Comparison Results:
Diff CUDNN vs CUDA native: 4.52e+00
Diff CUDNN vs CPU: 4.52e+00
Diff CUDA native vs CPU: 1.19e-07
Relative diff CUDNN vs CUDA native: 1.57e+00
Relative diff CUDNN vs CPU: 1.57e+00
--------------------------------------------------
--- Test 3/5 (random) ---
Testing with batch_size=8, num_classes=20, input_length=100, target_length=60, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 3.8359289169311523
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.9019420146942139
Testing CPU implementation...
CPU loss: -1.9019420146942139
Loss Comparison Results:
Diff CUDNN vs CUDA native: 5.74e+00
Diff CUDNN vs CPU: 5.74e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 1.50e+00
Relative diff CUDNN vs CPU: 1.50e+00
--------------------------------------------------
--- Test 4/5 (random) ---
Testing with batch_size=1, num_classes=10, input_length=200, target_length=150, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.4502780437469482
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.1409305334091187
Testing CPU implementation...
CPU loss: -1.1409305334091187
Loss Comparison Results:
Diff CUDNN vs CUDA native: 3.59e+00
Diff CUDNN vs CPU: 3.59e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 1.47e+00
Relative diff CUDNN vs CPU: 1.47e+00
--------------------------------------------------
--- Test 5/5 (random) ---
Testing with batch_size=3, num_classes=15, input_length=80, target_length=256, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: False
CUDNN loss: 0.0
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 0.0
Testing CPU implementation...
CPU loss: 0.0
Loss Comparison Results:
Diff CUDNN vs CUDA native: 0.00e+00
Diff CUDNN vs CPU: 0.00e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 0.00e+00
Relative diff CUDNN vs CPU: 0.00e+00
--------------------------------------------------
=== Testing with LOG_SIMPLEX inputs ===
--- Test 1/5 (log_simplex) ---
Testing with batch_size=2, num_classes=5, input_length=10, target_length=5, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 1.802871584892273
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 1.8028714656829834
Testing CPU implementation...
CPU loss: 1.8028714656829834
Loss Comparison Results:
Diff CUDNN vs CUDA native: 1.19e-07
Diff CUDNN vs CPU: 1.19e-07
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 6.61e-08
Relative diff CUDNN vs CPU: 6.61e-08
--------------------------------------------------
--- Test 2/5 (log_simplex) ---
Testing with batch_size=4, num_classes=10, input_length=50, target_length=30, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.71065616607666
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 2.71065616607666
Testing CPU implementation...
CPU loss: 2.7106564044952393
Loss Comparison Results:
Diff CUDNN vs CUDA native: 0.00e+00
Diff CUDNN vs CPU: 2.38e-07
Diff CUDA native vs CPU: 2.38e-07
Relative diff CUDNN vs CUDA native: 0.00e+00
Relative diff CUDNN vs CPU: 8.80e-08
--------------------------------------------------
--- Test 3/5 (log_simplex) ---
Testing with batch_size=8, num_classes=20, input_length=100, target_length=60, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 3.7439799308776855
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 3.7439794540405273
Testing CPU implementation...
CPU loss: 3.7439799308776855
Loss Comparison Results:
Diff CUDNN vs CUDA native: 4.77e-07
Diff CUDNN vs CPU: 0.00e+00
Diff CUDA native vs CPU: 4.77e-07
Relative diff CUDNN vs CUDA native: 1.27e-07
Relative diff CUDNN vs CPU: 0.00e+00
--------------------------------------------------
--- Test 4/5 (log_simplex) ---
Testing with batch_size=1, num_classes=10, input_length=200, target_length=150, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.4955265522003174
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 2.4955267906188965
Testing CPU implementation...
CPU loss: 2.4955267906188965
Loss Comparison Results:
Diff CUDNN vs CUDA native: 2.38e-07
Diff CUDNN vs CPU: 2.38e-07
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 9.55e-08
Relative diff CUDNN vs CPU: 9.55e-08
--------------------------------------------------
--- Test 5/5 (log_simplex) ---
Testing with batch_size=3, num_classes=15, input_length=80, target_length=256, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: False
CUDNN loss: 0.0
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 0.0
Testing CPU implementation...
CPU loss: 0.0
Loss Comparison Results:
Diff CUDNN vs CUDA native: 0.00e+00
Diff CUDNN vs CPU: 0.00e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 0.00e+00
Relative diff CUDNN vs CPU: 0.00e+00
--------------------------------------------------
=== SUMMARY ANALYSIS ===
--- RANDOM Input Analysis ---
CUDNN kernel used in 4/5 tests
CUDNN vs CPU loss differences: 0.00e+00 to 5.74e+00
CUDNN vs CUDA native loss differences: 0.00e+00 to 5.74e+00
Differences > 0.001 (likely mathematical errors): 4
Differences ≤ 1e-05 (likely precision issues): 1
CONCLUSION: CUDNN implementation likely has mathematical errors
--- LOG_SIMPLEX Input Analysis ---
CUDNN kernel used in 4/5 tests
CUDNN vs CPU loss differences: 0.00e+00 to 2.38e-07
CUDNN vs CUDA native loss differences: 0.00e+00 to 4.77e-07
Differences > 0.001 (likely mathematical errors): 0
Differences ≤ 1e-05 (likely precision issues): 5
CONCLUSION: Differences likely due to precision issues only