quick torch with gpu: a short tour of PyTorch GPU usage covering availability checks, tensor operations and timing, model training, multi-GPU DataParallel, memory management, mixed precision (AMP), and device-agnostic code.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time


def check_gpu_availability():
    """Check and display GPU availability information"""
    print("=" * 50)
    print("GPU AVAILABILITY CHECK")
    print("=" * 50)
    if torch.cuda.is_available():
        print("✓ CUDA is available")
        print(f"✓ Number of GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
            print(f"  Memory Reserved (cached): {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
    else:
        print("✗ CUDA is not available. Running on CPU.")
    print(f"\nCurrent PyTorch version: {torch.__version__}")
    print("=" * 50 + "\n")


class SimpleNN(nn.Module):
    """Simple neural network for demonstration"""

    def __init__(self, input_size=784, hidden_size=256, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten to (batch, input_size)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x


def basic_gpu_operations():
    """Demonstrate basic tensor operations on GPU"""
    print("\n" + "=" * 50)
    print("BASIC GPU OPERATIONS")
    print("=" * 50)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    x_cpu = torch.randn(1000, 1000)
    y_cpu = torch.randn(1000, 1000)
    if torch.cuda.is_available():
        x_gpu = x_cpu.to('cuda')  # copy existing tensors to the GPU
        y_gpu = y_cpu.to('cuda')
        x_gpu_direct = torch.randn(1000, 1000, device='cuda')  # or allocate directly on the GPU

        start_cpu = time.time()
        z_cpu = torch.matmul(x_cpu, y_cpu)
        cpu_time = time.time() - start_cpu

        # Warm up before timing so one-time CUDA initialization and kernel
        # launch overhead are not counted, and synchronize before starting
        # the clock: CUDA kernels run asynchronously with respect to the CPU.
        torch.matmul(x_gpu, y_gpu)
        torch.cuda.synchronize()
        start_gpu = time.time()
        z_gpu = torch.matmul(x_gpu, y_gpu)
        torch.cuda.synchronize()
        gpu_time = time.time() - start_gpu

        print("\nMatrix multiplication (1000x1000):")
        print(f"  CPU time: {cpu_time*1000:.2f} ms")
        print(f"  GPU time: {gpu_time*1000:.2f} ms")
        print(f"  Speedup: {cpu_time/gpu_time:.2f}x")

        z_cpu_from_gpu = z_gpu.cpu()  # move the result back to host memory
        print(f"\nTensor on GPU shape: {z_gpu.shape}")
        print(f"Tensor device: {z_gpu.device}")


def train_model_on_gpu():
    """Demonstrate model training on GPU"""
    print("\n" + "=" * 50)
    print("MODEL TRAINING ON GPU")
    print("=" * 50)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training device: {device}")
    batch_size = 64
    input_size = 784
    num_classes = 10
    num_samples = 1000
    X = torch.randn(num_samples, input_size)
    y = torch.randint(0, num_classes, (num_samples,))
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = SimpleNN(input_size, 256, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print(f"\nModel parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Model on device: {next(model.parameters()).device}")
    model.train()
    num_epochs = 5
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch_X, batch_y in dataloader:
            # Each batch must be moved to the same device as the model
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    if torch.cuda.is_available():
        print(f"\nGPU memory used: {torch.cuda.memory_allocated(device) / 1024**2:.2f} MB")


def multi_gpu_example():
    """Demonstrate multi-GPU usage with DataParallel"""
    print("\n" + "=" * 50)
    print("MULTI-GPU EXAMPLE")
    print("=" * 50)
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
        model = SimpleNN(784, 512, 10)
        model = nn.DataParallel(model)  # replicates the model across GPUs
        model = model.cuda()
        batch_size = 256
        input_data = torch.randn(batch_size, 784).cuda()
        output = model(input_data)  # the batch is split across the GPUs
        print(f"Input shape: {input_data.shape}")
        print(f"Output shape: {output.shape}")
        print("\nPer-GPU memory usage:")
        for i in range(torch.cuda.device_count()):
            print(f"  GPU {i}: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
    else:
        print("Multi-GPU not available (need 2+ GPUs)")


def gpu_memory_management():
    """Demonstrate GPU memory management techniques"""
    print("\n" + "=" * 50)
    print("GPU MEMORY MANAGEMENT")
    print("=" * 50)
    if torch.cuda.is_available():
        print("Initial GPU memory:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        large_tensor = torch.randn(10000, 10000, device='cuda')
        print("\nAfter creating large tensor (10000x10000):")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        del large_tensor  # frees the allocation, but the caching allocator keeps the blocks reserved
        print("\nAfter deleting tensor:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        torch.cuda.empty_cache()  # returns cached blocks to the driver
        print("\nAfter clearing cache:")
        print(f"  Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"  Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        with torch.no_grad():  # skip autograd bookkeeping during inference
            inference_tensor = torch.randn(5000, 5000, device='cuda')
            result = inference_tensor @ inference_tensor.T
        print("\nUsing torch.no_grad() for inference saves memory")


def mixed_precision_training():
    """Demonstrate mixed precision training for faster GPU computation"""
    print("\n" + "=" * 50)
    print("MIXED PRECISION TRAINING (AMP)")
    print("=" * 50)
    if torch.cuda.is_available():
        # torch.cuda.amp is deprecated in newer PyTorch releases in favor of
        # torch.amp (autocast('cuda') / GradScaler('cuda')); the older import
        # is kept here because it works across more versions.
        from torch.cuda.amp import autocast, GradScaler
        device = torch.device('cuda')
        model = SimpleNN().to(device)
        optimizer = optim.Adam(model.parameters())
        scaler = GradScaler()  # rescales the loss to avoid float16 gradient underflow
        X = torch.randn(32, 784).to(device)
        y = torch.randint(0, 10, (32,)).to(device)
        criterion = nn.CrossEntropyLoss()
        print("Training with Automatic Mixed Precision (AMP):")
        for step in range(3):
            optimizer.zero_grad()
            with autocast():  # runs the forward pass in float16 where safe
                output = model(X)
                loss = criterion(output, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            print(f"  Step {step+1}, Loss: {loss.item():.4f}")
        print("AMP training completed successfully!")


def device_agnostic_code():
    """Write device-agnostic code that works on both CPU and GPU"""
    print("\n" + "=" * 50)
    print("DEVICE-AGNOSTIC CODE")
    print("=" * 50)

    def get_default_device():
        """Get the default device (GPU if available, else CPU)"""
        if torch.cuda.is_available():
            return torch.device('cuda')
        elif torch.backends.mps.is_available():
            return torch.device('mps')
        else:
            return torch.device('cpu')

    def to_device(data, device):
        """Move tensor(s) to the specified device"""
        if isinstance(data, (list, tuple)):
            return [to_device(x, device) for x in data]
        return data.to(device, non_blocking=True)

    device = get_default_device()
    print(f"Selected device: {device}")
    data = torch.randn(100, 100)
    data = to_device(data, device)
    print(f"Data moved to: {data.device}")
    model = SimpleNN()
    model = to_device(model, device)
    print(f"Model moved to: {next(model.parameters()).device}")


if __name__ == "__main__":
    check_gpu_availability()
    basic_gpu_operations()
    train_model_on_gpu()
    multi_gpu_example()
    gpu_memory_management()
    if torch.cuda.is_available():
        mixed_precision_training()
    device_agnostic_code()
    print("\n" + "=" * 50)
    print("GPU OPERATIONS COMPLETED")
    print("=" * 50)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("\nFinal GPU memory cleaned up")