@redwrasse
Created August 16, 2025 18:56
Claude Code-generated test of PyTorch CTC loss forward-pass comparisons - R^n vs. log-simplex input space.
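The distinction under test is whether each time step forms a valid log-probability distribution over the classes: exponentiating a log-simplex row sums to 1, while a raw R^n row generally does not. A minimal illustration of that difference (separate from the generated script below):

import torch

# Raw R^n row: generally not a normalized log-probability distribution.
raw = torch.randn(5)
print(raw.exp().sum())            # arbitrary positive value, almost surely != 1

# Log-simplex row: log of a point on the probability simplex (sampled via Dirichlet).
simplex = torch.distributions.Dirichlet(torch.ones(5)).sample()
print(simplex.log().exp().sum())  # ~1.0 up to floating-point error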
# Claude Code-generated script that compares the forward-pass loss produced by the CUDNN, CUDA native, and CPU native CTC loss implementations
# for both random R^n inputs and random log-simplex inputs
import torch
import torch.nn as nn
import torch.profiler
import numpy as np
from typing import Tuple, List, Optional
import warnings

def create_ctc_inputs_random(
    batch_size: int,
    num_classes: int,
    input_length: int,
    target_length: int,
    device: str = 'cuda',
    dtype: torch.dtype = torch.float32
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create CTC inputs with random R^n inputs"""
    # Raw inputs (T, N, C) - random from R^n
    raw_inputs = torch.randn(input_length, batch_size, num_classes,
                             device=device, dtype=dtype)
    # Targets in concatenated format for CUDNN
    targets = []
    target_lengths_list = []
    for _ in range(batch_size):
        # Generate random target sequence (excluding blank=0)
        target_seq = torch.randint(1, num_classes, (target_length,), device=device, dtype=torch.int32)
        targets.append(target_seq)
        target_lengths_list.append(target_length)
    # Concatenate targets for CUDNN format
    targets = torch.cat(targets).to(device)
    # Input lengths - all must be T for CUDNN (input_length for each batch)
    input_lengths = torch.full((batch_size,), input_length, device=device, dtype=torch.int32)
    target_lengths = torch.tensor(target_lengths_list, device=device, dtype=torch.int32)
    return raw_inputs, targets, input_lengths, target_lengths

def create_ctc_inputs_log_simplex(
    batch_size: int,
    num_classes: int,
    input_length: int,
    target_length: int,
    device: str = 'cuda',
    dtype: torch.dtype = torch.float32
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create CTC inputs with log-simplex distributed inputs"""
    # Generate random simplex points then take log
    # Sample from Dirichlet distribution (uniform over simplex)
    alpha = torch.ones(num_classes)
    simplex_samples = torch.distributions.Dirichlet(alpha).sample((input_length, batch_size))
    # Take log to get log-simplex inputs
    raw_inputs = torch.log(simplex_samples).to(device=device, dtype=dtype)
    # Targets in concatenated format for CUDNN
    targets = []
    target_lengths_list = []
    for _ in range(batch_size):
        # Generate random target sequence (excluding blank=0)
        target_seq = torch.randint(1, num_classes, (target_length,), device=device, dtype=torch.int32)
        targets.append(target_seq)
        target_lengths_list.append(target_length)
    # Concatenate targets for CUDNN format
    targets = torch.cat(targets).to(device)
    # Input lengths - all must be T for CUDNN (input_length for each batch)
    input_lengths = torch.full((batch_size,), input_length, device=device, dtype=torch.int32)
    target_lengths = torch.tensor(target_lengths_list, device=device, dtype=torch.int32)
    return raw_inputs, targets, input_lengths, target_lengths

def compare_ctc_forward_losses(
    batch_size: int = 4,
    num_classes: int = 10,
    input_length: int = 50,
    target_length: int = 30,
    blank: int = 0,
    reduction: str = 'mean',
    input_type: str = 'random'
) -> dict:
    """Compare CTC forward pass losses across CUDNN, CUDA native, and CPU implementations"""
    results = {}
    # Ensure target_length satisfies CUDNN constraint (≤ 256)
    if target_length > 256:
        raise ValueError(f"target_length {target_length} exceeds CUDNN limit of 256")
    print(f"Testing with batch_size={batch_size}, num_classes={num_classes}, "
          f"input_length={input_length}, target_length={target_length}, input_type={input_type}")

    # Create inputs based on type
    if input_type == 'log_simplex':
        log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda = create_ctc_inputs_log_simplex(
            batch_size, num_classes, input_length, target_length, device='cuda', dtype=torch.float32
        )
    else:  # 'random'
        log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda = create_ctc_inputs_random(
            batch_size, num_classes, input_length, target_length, device='cuda', dtype=torch.float32
        )
    # Create inputs for CPU (copy to CPU)
    log_probs_cpu = log_probs_cuda.detach().cpu()
    targets_cpu = targets_cuda.cpu()
    input_lengths_cpu = input_lengths_cuda.cpu()
    target_lengths_cpu = target_lengths_cuda.cpu()

    # Test CUDNN implementation
    print("Testing CUDNN implementation...")
    ctc_loss_cudnn = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True
    ) as prof_cudnn:
        loss_cudnn = ctc_loss_cudnn(log_probs_cuda, targets_cuda, input_lengths_cuda, target_lengths_cuda)
    # Check if CUDNN kernel was used
    cudnn_kernel_used = any('cudnn' in str(event).lower() for event in prof_cudnn.events())
    results['cudnn_kernel_used'] = cudnn_kernel_used
    results['cudnn_loss'] = loss_cudnn.item()
    print(f"CUDNN kernel used: {cudnn_kernel_used}")
    print(f"CUDNN loss: {loss_cudnn.item()}")

    # Test CUDA native implementation (force no CUDNN)
    print("Testing CUDA native implementation...")
    log_probs_cuda_native = log_probs_cuda.detach()
    # Disable CUDNN for this test
    with torch.backends.cudnn.flags(enabled=False):
        ctc_loss_cuda_native = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
        with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CUDA],
            record_shapes=True
        ) as prof_cuda_native:
            loss_cuda_native = ctc_loss_cuda_native(log_probs_cuda_native, targets_cuda, input_lengths_cuda, target_lengths_cuda)
    # Verify no CUDNN kernel was used
    cuda_native_kernel_used = any('cudnn' in str(event).lower() for event in prof_cuda_native.events())
    results['cuda_native_kernel_used'] = cuda_native_kernel_used
    results['cuda_native_loss'] = loss_cuda_native.item()
    print(f"CUDA native (no CUDNN) kernel used CUDNN: {cuda_native_kernel_used}")
    print(f"CUDA native loss: {loss_cuda_native.item()}")

    # Test CPU implementation
    print("Testing CPU implementation...")
    ctc_loss_cpu = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=True)
    loss_cpu = ctc_loss_cpu(log_probs_cpu, targets_cpu, input_lengths_cpu, target_lengths_cpu)
    results['cpu_loss'] = loss_cpu.item()
    print(f"CPU loss: {loss_cpu.item()}")

    # Compare losses
    cudnn_loss = results['cudnn_loss']
    cuda_native_loss = results['cuda_native_loss']
    cpu_loss = results['cpu_loss']
    # Compute differences
    cudnn_vs_cuda_native_diff = abs(cudnn_loss - cuda_native_loss)
    cudnn_vs_cpu_diff = abs(cudnn_loss - cpu_loss)
    cuda_native_vs_cpu_diff = abs(cuda_native_loss - cpu_loss)
    results['diff_cudnn_vs_cuda_native'] = cudnn_vs_cuda_native_diff
    results['diff_cudnn_vs_cpu'] = cudnn_vs_cpu_diff
    results['diff_cuda_native_vs_cpu'] = cuda_native_vs_cpu_diff
    # Relative differences (use CUDNN loss as reference)
    if abs(cudnn_loss) > 1e-10:
        results['relative_diff_cudnn_vs_cuda_native'] = cudnn_vs_cuda_native_diff / abs(cudnn_loss)
        results['relative_diff_cudnn_vs_cpu'] = cudnn_vs_cpu_diff / abs(cudnn_loss)
    else:
        results['relative_diff_cudnn_vs_cuda_native'] = float('inf') if cudnn_vs_cuda_native_diff > 0 else 0
        results['relative_diff_cudnn_vs_cpu'] = float('inf') if cudnn_vs_cpu_diff > 0 else 0

    print("\nLoss Comparison Results:")
    print(f"Diff CUDNN vs CUDA native: {results['diff_cudnn_vs_cuda_native']:.2e}")
    print(f"Diff CUDNN vs CPU: {results['diff_cudnn_vs_cpu']:.2e}")
    print(f"Diff CUDA native vs CPU: {results['diff_cuda_native_vs_cpu']:.2e}")
    print(f"Relative diff CUDNN vs CUDA native: {results['relative_diff_cudnn_vs_cuda_native']:.2e}")
    print(f"Relative diff CUDNN vs CPU: {results['relative_diff_cudnn_vs_cpu']:.2e}")
    return results

def run_comprehensive_tests():
    """Run tests with various parameter configurations and input types"""
    if not torch.cuda.is_available():
        print("CUDA not available, skipping tests")
        return
    print("=== Comprehensive CUDNN CTC Forward Loss Tests ===\n")
    test_configs = [
        # (batch_size, num_classes, input_length, target_length)
        (2, 5, 10, 5),       # Small test
        (4, 10, 50, 30),     # Medium test
        (8, 20, 100, 60),    # Larger test
        (1, 10, 200, 150),   # Long target length test
        (3, 15, 80, 256),    # Maximum target length for CUDNN
    ]
    input_types = ['random', 'log_simplex']
    all_results = []
    for input_type in input_types:
        print(f"\n=== Testing with {input_type.upper()} inputs ===")
        for i, (batch_size, num_classes, input_length, target_length) in enumerate(test_configs):
            print(f"\n--- Test {i+1}/{len(test_configs)} ({input_type}) ---")
            try:
                results = compare_ctc_forward_losses(
                    batch_size, num_classes, input_length, target_length, input_type=input_type
                )
                results['config'] = (batch_size, num_classes, input_length, target_length)
                results['input_type'] = input_type
                all_results.append(results)
            except Exception as e:
                print(f"Test failed: {e}")
                continue
            print("-" * 50)

    # Summary analysis
    print("\n=== SUMMARY ANALYSIS ===")
    if not all_results:
        print("No successful tests to analyze")
        return
    # Group results by input type
    random_results = [r for r in all_results if r['input_type'] == 'random']
    log_simplex_results = [r for r in all_results if r['input_type'] == 'log_simplex']
    for input_type, results in [('random', random_results), ('log_simplex', log_simplex_results)]:
        if not results:
            continue
        print(f"\n--- {input_type.upper()} Input Analysis ---")
        # Check if CUDNN was actually used in any test
        cudnn_used_count = sum(1 for r in results if r['cudnn_kernel_used'])
        print(f"CUDNN kernel used in {cudnn_used_count}/{len(results)} tests")
        if cudnn_used_count == 0:
            print("WARNING: CUDNN kernel was never used! Check implementation constraints.")
        # Analyze loss differences
        cudnn_cpu_diffs = [r['diff_cudnn_vs_cpu'] for r in results]
        cudnn_cuda_diffs = [r['diff_cudnn_vs_cuda_native'] for r in results]
        print(f"CUDNN vs CPU loss differences: {min(cudnn_cpu_diffs):.2e} to {max(cudnn_cpu_diffs):.2e}")
        print(f"CUDNN vs CUDA native loss differences: {min(cudnn_cuda_diffs):.2e} to {max(cudnn_cuda_diffs):.2e}")
        # Determine if differences are likely precision vs mathematical errors
        large_diff_threshold = 1e-3
        precision_threshold = 1e-5
        large_diffs = [d for d in cudnn_cpu_diffs if d > large_diff_threshold]
        precision_diffs = [d for d in cudnn_cpu_diffs if d <= precision_threshold]
        print(f"Differences > {large_diff_threshold} (likely mathematical errors): {len(large_diffs)}")
        print(f"Differences ≤ {precision_threshold} (likely precision issues): {len(precision_diffs)}")
        if len(large_diffs) > 0:
            print("CONCLUSION: CUDNN implementation likely has mathematical errors")
        elif len(precision_diffs) == len(cudnn_cpu_diffs):
            print("CONCLUSION: Differences likely due to precision issues only")
        else:
            print("CONCLUSION: Mixed results - some precision, some potentially mathematical issues")


if __name__ == "__main__":
    run_comprehensive_tests()
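For quick experimentation, here is a minimal standalone sketch of the same three-way comparison on properly normalized inputs (log_softmax of random logits); it is not part of the generated script and assumes a CUDA device with cuDNN available:

import torch
import torch.nn as nn

T, N, C, S = 50, 4, 10, 30  # input length, batch size, num classes, target length
log_probs = torch.randn(T, N, C, device='cuda').log_softmax(dim=2)
targets = torch.randint(1, C, (N * S,), device='cuda', dtype=torch.int32)  # concatenated targets
input_lengths = torch.full((N,), T, device='cuda', dtype=torch.int32)
target_lengths = torch.full((N,), S, device='cuda', dtype=torch.int32)

ctc = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
loss_cudnn = ctc(log_probs, targets, input_lengths, target_lengths)        # cuDNN path, if eligible
with torch.backends.cudnn.flags(enabled=False):
    loss_native = ctc(log_probs, targets, input_lengths, target_lengths)   # native CUDA kernel
loss_cpu = ctc(log_probs.cpu(), targets.cpu(), input_lengths.cpu(), target_lengths.cpu())
print(loss_cudnn.item(), loss_native.item(), loss_cpu.item())

The output of a full run of the generated script follows.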
=== Comprehensive CUDNN CTC Forward Loss Tests ===
=== Testing with RANDOM inputs ===
--- Test 1/5 (random) ---
Testing with batch_size=2, num_classes=5, input_length=10, target_length=5, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.2869324684143066
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.884143590927124
Testing CPU implementation...
CPU loss: -1.884143590927124
Loss Comparison Results:
Diff CUDNN vs CUDA native: 4.17e+00
Diff CUDNN vs CPU: 4.17e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 1.82e+00
Relative diff CUDNN vs CPU: 1.82e+00
--------------------------------------------------
--- Test 2/5 (random) ---
Testing with batch_size=4, num_classes=10, input_length=50, target_length=30, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.8773937225341797
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.6434334516525269
Testing CPU implementation...
CPU loss: -1.6434335708618164
Loss Comparison Results:
Diff CUDNN vs CUDA native: 4.52e+00
Diff CUDNN vs CPU: 4.52e+00
Diff CUDA native vs CPU: 1.19e-07
Relative diff CUDNN vs CUDA native: 1.57e+00
Relative diff CUDNN vs CPU: 1.57e+00
--------------------------------------------------
--- Test 3/5 (random) ---
Testing with batch_size=8, num_classes=20, input_length=100, target_length=60, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 3.8359289169311523
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.9019420146942139
Testing CPU implementation...
CPU loss: -1.9019420146942139
Loss Comparison Results:
Diff CUDNN vs CUDA native: 5.74e+00
Diff CUDNN vs CPU: 5.74e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 1.50e+00
Relative diff CUDNN vs CPU: 1.50e+00
--------------------------------------------------
--- Test 4/5 (random) ---
Testing with batch_size=1, num_classes=10, input_length=200, target_length=150, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.4502780437469482
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: -1.1409305334091187
Testing CPU implementation...
CPU loss: -1.1409305334091187
Loss Comparison Results:
Diff CUDNN vs CUDA native: 3.59e+00
Diff CUDNN vs CPU: 3.59e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 1.47e+00
Relative diff CUDNN vs CPU: 1.47e+00
--------------------------------------------------
--- Test 5/5 (random) ---
Testing with batch_size=3, num_classes=15, input_length=80, target_length=256, input_type=random
Testing CUDNN implementation...
CUDNN kernel used: False
CUDNN loss: 0.0
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 0.0
Testing CPU implementation...
CPU loss: 0.0
Loss Comparison Results:
Diff CUDNN vs CUDA native: 0.00e+00
Diff CUDNN vs CPU: 0.00e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 0.00e+00
Relative diff CUDNN vs CPU: 0.00e+00
--------------------------------------------------
=== Testing with LOG_SIMPLEX inputs ===
--- Test 1/5 (log_simplex) ---
Testing with batch_size=2, num_classes=5, input_length=10, target_length=5, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 1.802871584892273
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 1.8028714656829834
Testing CPU implementation...
CPU loss: 1.8028714656829834
Loss Comparison Results:
Diff CUDNN vs CUDA native: 1.19e-07
Diff CUDNN vs CPU: 1.19e-07
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 6.61e-08
Relative diff CUDNN vs CPU: 6.61e-08
--------------------------------------------------
--- Test 2/5 (log_simplex) ---
Testing with batch_size=4, num_classes=10, input_length=50, target_length=30, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.71065616607666
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 2.71065616607666
Testing CPU implementation...
CPU loss: 2.7106564044952393
Loss Comparison Results:
Diff CUDNN vs CUDA native: 0.00e+00
Diff CUDNN vs CPU: 2.38e-07
Diff CUDA native vs CPU: 2.38e-07
Relative diff CUDNN vs CUDA native: 0.00e+00
Relative diff CUDNN vs CPU: 8.80e-08
--------------------------------------------------
--- Test 3/5 (log_simplex) ---
Testing with batch_size=8, num_classes=20, input_length=100, target_length=60, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 3.7439799308776855
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 3.7439794540405273
Testing CPU implementation...
CPU loss: 3.7439799308776855
Loss Comparison Results:
Diff CUDNN vs CUDA native: 4.77e-07
Diff CUDNN vs CPU: 0.00e+00
Diff CUDA native vs CPU: 4.77e-07
Relative diff CUDNN vs CUDA native: 1.27e-07
Relative diff CUDNN vs CPU: 0.00e+00
--------------------------------------------------
--- Test 4/5 (log_simplex) ---
Testing with batch_size=1, num_classes=10, input_length=200, target_length=150, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: True
CUDNN loss: 2.4955265522003174
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 2.4955267906188965
Testing CPU implementation...
CPU loss: 2.4955267906188965
Loss Comparison Results:
Diff CUDNN vs CUDA native: 2.38e-07
Diff CUDNN vs CPU: 2.38e-07
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 9.55e-08
Relative diff CUDNN vs CPU: 9.55e-08
--------------------------------------------------
--- Test 5/5 (log_simplex) ---
Testing with batch_size=3, num_classes=15, input_length=80, target_length=256, input_type=log_simplex
Testing CUDNN implementation...
CUDNN kernel used: False
CUDNN loss: 0.0
Testing CUDA native implementation...
CUDA native (no CUDNN) kernel used CUDNN: False
CUDA native loss: 0.0
Testing CPU implementation...
CPU loss: 0.0
Loss Comparison Results:
Diff CUDNN vs CUDA native: 0.00e+00
Diff CUDNN vs CPU: 0.00e+00
Diff CUDA native vs CPU: 0.00e+00
Relative diff CUDNN vs CUDA native: 0.00e+00
Relative diff CUDNN vs CPU: 0.00e+00
--------------------------------------------------
=== SUMMARY ANALYSIS ===
--- RANDOM Input Analysis ---
CUDNN kernel used in 4/5 tests
CUDNN vs CPU loss differences: 0.00e+00 to 5.74e+00
CUDNN vs CUDA native loss differences: 0.00e+00 to 5.74e+00
Differences > 0.001 (likely mathematical errors): 4
Differences ≤ 1e-05 (likely precision issues): 1
CONCLUSION: CUDNN implementation likely has mathematical errors
--- LOG_SIMPLEX Input Analysis ---
CUDNN kernel used in 4/5 tests
CUDNN vs CPU loss differences: 0.00e+00 to 2.38e-07
CUDNN vs CUDA native loss differences: 0.00e+00 to 4.77e-07
Differences > 0.001 (likely mathematical errors): 0
Differences ≤ 1e-05 (likely precision issues): 5
CONCLUSION: Differences likely due to precision issues only