quick torch with gpu: a short tour of PyTorch GPU usage covering availability checks, tensor operations and timing, model training, multi-GPU DataParallel, memory management, mixed precision (AMP), and device-agnostic code.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time


def check_gpu_availability():
    """Check and display GPU availability information"""
    print("=" * 50)
    print("GPU AVAILABILITY CHECK")
    print("=" * 50)
    if torch.cuda.is_available():
        print("✓ CUDA is available")
        print(f"✓ Number of GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
            print(f"  Memory Reserved (cached): {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
    else:
        print("✗ CUDA is not available. Running on CPU.")
    print(f"\nCurrent PyTorch version: {torch.__version__}")
    print("=" * 50 + "\n")


class SimpleNN(nn.Module):
    """Simple neural network for demonstration"""

    def __init__(self, input_size=784, hidden_size=256, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten to (batch, input_size)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x


def basic_gpu_operations():
    """Demonstrate basic tensor operations on GPU"""
    print("\n" + "=" * 50)
    print("BASIC GPU OPERATIONS")
    print("=" * 50)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    x_cpu = torch.randn(1000, 1000)
    y_cpu = torch.randn(1000, 1000)
    if torch.cuda.is_available():
        x_gpu = x_cpu.to('cuda')  # copy existing tensors to the GPU
        y_gpu = y_cpu.to('cuda')
        x_gpu_direct = torch.randn(1000, 1000, device='cuda')  # or allocate directly on the GPU

        start_cpu = time.time()
        z_cpu = torch.matmul(x_cpu, y_cpu)
        cpu_time = time.time() - start_cpu

        # Warm up before timing so one-time CUDA initialization and kernel
        # launch overhead are not counted, and synchronize before starting
        # the clock: CUDA kernels run asynchronously with respect to the CPU.
        torch.matmul(x_gpu, y_gpu)
        torch.cuda.synchronize()
        start_gpu = time.time()
        z_gpu = torch.matmul(x_gpu, y_gpu)
        torch.cuda.synchronize()
        gpu_time = time.time() - start_gpu

        print("\nMatrix multiplication (1000x1000):")
        print(f"  CPU time: {cpu_time*1000:.2f} ms")
        print(f"  GPU time: {gpu_time*1000:.2f} ms")
        print(f"  Speedup: {cpu_time/gpu_time:.2f}x")

        z_cpu_from_gpu = z_gpu.cpu()  # move the result back to host memory
        print(f"\nTensor on GPU shape: {z_gpu.shape}")
        print(f"Tensor device: {z_gpu.device}")


def train_model_on_gpu():
    """Demonstrate model training on GPU"""
    print("\n" + "=" * 50)
    print("MODEL TRAINING ON GPU")
    print("=" * 50)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training device: {device}")
    batch_size = 64
    input_size = 784
    num_classes = 10
    num_samples = 1000
    X = torch.randn(num_samples, input_size)
    y = torch.randint(0, num_classes, (num_samples,))
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = SimpleNN(input_size, 256, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print(f"\nModel parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Model on device: {next(model.parameters()).device}")
    model.train()
    num_epochs = 5
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch_X, batch_y in dataloader:
            # Each batch must be moved to the same device as the model
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    if torch.cuda.is_available():
        print(f"\nGPU memory used: {torch.cuda.memory_allocated(device) / 1024**2:.2f} MB")


def multi_gpu_example():
    """Demonstrate multi-GPU usage with DataParallel"""
    print("\n" + "=" * 50)
    print("MULTI-GPU EXAMPLE")
    print("=" * 50)
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
        model = SimpleNN(784, 512, 10)
        model = nn.DataParallel(model)  # replicates the model across GPUs
        model = model.cuda()
        batch_size = 256
        input_data = torch.randn(batch_size, 784).cuda()
        output = model(input_data)  # the batch is split across the GPUs
        print(f"Input shape: {input_data.shape}")
        print(f"Output shape: {output.shape}")
        print("\nPer-GPU memory usage:")
        for i in range(torch.cuda.device_count()):
            print(f"  GPU {i}: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
    else:
        print("Multi-GPU not available (need 2+ GPUs)")


def gpu_memory_management():
    """Demonstrate GPU memory management techniques"""
    print("\n" + "=" * 50)
    print("GPU MEMORY MANAGEMENT")
    print("=" * 50)
    if torch.cuda.is_available():
        print("Initial GPU memory:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        large_tensor = torch.randn(10000, 10000, device='cuda')
        print("\nAfter creating large tensor (10000x10000):")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        del large_tensor  # frees the allocation, but the caching allocator keeps the blocks reserved
        print("\nAfter deleting tensor:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        torch.cuda.empty_cache()  # returns cached blocks to the driver
        print("\nAfter clearing cache:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        with torch.no_grad():  # skip autograd bookkeeping during inference
            inference_tensor = torch.randn(5000, 5000, device='cuda')
            result = inference_tensor @ inference_tensor.T
        print("\nUsing torch.no_grad() for inference saves memory")


def mixed_precision_training():
    """Demonstrate mixed precision training for faster GPU computation"""
    print("\n" + "=" * 50)
    print("MIXED PRECISION TRAINING (AMP)")
    print("=" * 50)
    if torch.cuda.is_available():
        # torch.cuda.amp is deprecated in newer PyTorch releases in favor of
        # torch.amp (autocast('cuda') / GradScaler('cuda')); the older import
        # is kept here because it works across more versions.
        from torch.cuda.amp import autocast, GradScaler
        device = torch.device('cuda')
        model = SimpleNN().to(device)
        optimizer = optim.Adam(model.parameters())
        scaler = GradScaler()  # rescales the loss to avoid float16 gradient underflow
        X = torch.randn(32, 784).to(device)
        y = torch.randint(0, 10, (32,)).to(device)
        criterion = nn.CrossEntropyLoss()
        print("Training with Automatic Mixed Precision (AMP):")
        for step in range(3):
            optimizer.zero_grad()
            with autocast():  # runs the forward pass in float16 where safe
                output = model(X)
                loss = criterion(output, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            print(f"  Step {step+1}, Loss: {loss.item():.4f}")
        print("AMP training completed successfully!")


def device_agnostic_code():
    """Write device-agnostic code that works on both CPU and GPU"""
    print("\n" + "=" * 50)
    print("DEVICE-AGNOSTIC CODE")
    print("=" * 50)

    def get_default_device():
        """Get the default device (GPU if available, else CPU)"""
        if torch.cuda.is_available():
            return torch.device('cuda')
        elif torch.backends.mps.is_available():
            return torch.device('mps')
        else:
            return torch.device('cpu')

    def to_device(data, device):
        """Move tensor(s) to the specified device"""
        if isinstance(data, (list, tuple)):
            return [to_device(x, device) for x in data]
        return data.to(device, non_blocking=True)

    device = get_default_device()
    print(f"Selected device: {device}")
    data = torch.randn(100, 100)
    data = to_device(data, device)
    print(f"Data moved to: {data.device}")
    model = SimpleNN()
    model = to_device(model, device)
    print(f"Model moved to: {next(model.parameters()).device}")


if __name__ == "__main__":
    check_gpu_availability()
    basic_gpu_operations()
    train_model_on_gpu()
    multi_gpu_example()
    gpu_memory_management()
    if torch.cuda.is_available():
        mixed_precision_training()
    device_agnostic_code()
    print("\n" + "=" * 50)
    print("GPU OPERATIONS COMPLETED")
    print("=" * 50)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("\nFinal GPU memory cleaned up")