Adagrad(fused=True) benchmark script
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyModel(nn.Module):
    def __init__(self, D=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(D, D), nn.ReLU(), nn.Linear(D, D))

    def forward(self, x):
        return self.net(x)


def get_model_and_optimizer(D=128, lr=1e-2, eps=1e-10, fused=False, foreach=False, dtype=torch.float32):
    model = TinyModel(D).to(dtype=dtype, device="cuda")
    optimizer = torch.optim.Adagrad(
        model.parameters(),
        lr=lr,
        eps=eps,
        fused=fused,
        foreach=foreach,
    )
    return model, optimizer


def step_model(model, optimizer, x, target):
    optimizer.zero_grad(set_to_none=True)
    loss = F.mse_loss(model(x), target)
    loss.backward()
    optimizer.step()
    return loss.item()


def test_correctness(dtype=torch.float32):
    torch.manual_seed(0)
    D = 64
    lr = 1e-2
    eps = 1e-10
    x = torch.randn(32, D, device="cuda", dtype=dtype)
    target = torch.randn(32, D, device="cuda", dtype=dtype)

    # Reference
    ref_model, ref_opt = get_model_and_optimizer(D, lr, eps, fused=False, foreach=False, dtype=dtype)
    foreach_model, foreach_opt = get_model_and_optimizer(D, lr, eps, fused=False, foreach=True, dtype=dtype)
    fused_model, fused_opt = get_model_and_optimizer(D, lr, eps, fused=True, foreach=False, dtype=dtype)

    # Sync initial weights
    for p_ref, p_foreach, p_fused in zip(ref_model.parameters(), foreach_model.parameters(), fused_model.parameters()):
        p_foreach.data.copy_(p_ref.data)
        p_fused.data.copy_(p_ref.data)

    configs = [
        ("foreach=False, fused=False", ref_model, ref_opt),
        ("foreach=True, fused=False", foreach_model, foreach_opt),
        ("foreach=False, fused=True ", fused_model, fused_opt),
    ]
    for name, model, optimizer in configs:
        losses = []
        for _ in range(5):
            losses.append(step_model(model, optimizer, x, target))
        print(f"{name} : {losses}")


if __name__ == "__main__":
    dtypes = [torch.float64, torch.float32, torch.float16, torch.bfloat16]
    for dtype in dtypes:
        print(f"\ndtype={dtype}")
        test_correctness(dtype=dtype)
```
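For reference, all three code paths compared above (for-loop, foreach, fused) should implement the same single-tensor Adagrad update. Here is a minimal sketch of that update, assuming the script's defaults of `lr_decay=0` and `weight_decay=0` (the function name is illustrative, not from the gist):

```python
# Minimal reference sketch of the Adagrad update the correctness test exercises,
# assuming lr_decay=0 and weight_decay=0 (the defaults the script relies on).
import torch

def adagrad_step_reference(param, grad, state_sum, lr=1e-2, eps=1e-10):
    state_sum += grad * grad                       # accumulate squared gradients
    param -= lr * grad / (state_sum.sqrt() + eps)  # per-coordinate adaptive step
    return param, state_sum
```

If `fused=True` diverged from this beyond dtype-level rounding, the printed loss trajectories would drift apart. The second script below times the same three code paths across dtypes and shapes.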
```python
import torch
from itertools import product
from torch.optim import Adagrad
import torch.cuda.nvtx as nvtx
import argparse
import pandas as pd


def parse_args():
    parser = argparse.ArgumentParser(
        description="Adagrad GPU benchmark with optional Excel output"
    )
    parser.add_argument(
        "--xlsx",
        action="store_true",
        help="Write results to an Excel .xlsx file",
    )
    return parser.parse_args()


def ensure_cuda():
    if not torch.cuda.is_available():
        raise RuntimeError("This script requires CUDA, but CUDA is not available.")


def benchmark_adagrad(config, dtype, shape, num_steps=100):
    device = torch.device("cuda")
    params = [
        torch.randn(shape, dtype=dtype, device=device, requires_grad=True)
        for _ in range(10)
    ]
    try:
        optimizer = Adagrad(
            params, lr=0.01, fused=config["fused"], foreach=config["foreach"]
        )
    except Exception as e:
        return None, None, str(e)

    for _ in range(10):  # warm-up
        for p in params:
            p.grad = torch.randn_like(p)
        optimizer.step()
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats(device)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    total_time_ms = 0.0

    nvtx.range_push(f"{config['label']}-{dtype}-{shape}")
    for _ in range(num_steps):
        for p in params:
            p.grad = torch.randn_like(p)
        start_event.record()
        optimizer.step()
        end_event.record()
        torch.cuda.synchronize()
        total_time_ms += start_event.elapsed_time(end_event)
    nvtx.range_pop()

    peak_bytes = torch.cuda.max_memory_allocated(device)
    peak_mib = peak_bytes / (1024 ** 2)
    avg_time_s = (total_time_ms / num_steps) / 1000.0
    return avg_time_s, peak_mib, None


def main():
    args = parse_args()
    ensure_cuda()

    configs = [
        {"fused": False, "foreach": False, "label": "fused=False, foreach=False"},
        {"fused": False, "foreach": True, "label": "fused=False, foreach=True"},
        {"fused": True, "foreach": False, "label": "fused=True, foreach=False"},
    ]
    dtypes = [torch.float16, torch.float32, torch.float64, torch.bfloat16]
    shapes = [(50257, 768), (2048, 2048), (768, 3072), (4096,)]

    results = []
    for dtype, shape in product(dtypes, shapes):
        print(f"\nTesting Dtype: {dtype}, Shape: {shape}")
        for config in configs:
            print(
                f"Config: {config['label']} (fused={config['fused']}, foreach={config['foreach']})"
            )
            avg_time, peak_mem, error = benchmark_adagrad(config, dtype, shape)
            status = "Success" if error is None else "Failed"
            print(
                f"  Time: {avg_time if avg_time else 'N/A'} s, Peak Mem: {peak_mem if peak_mem else 'N/A'} MiB, Status: {status}"
            )
            results.append(
                {
                    "dtype": str(dtype),
                    "shape": str(shape),
                    "config": config["label"],
                    "avg_time_s": avg_time,
                    "peak_memory_MiB": peak_mem,
                    "status": status,
                    "error": error,
                }
            )

    df = pd.DataFrame(results)
    print("\nRaw Results:")
    print(df)

    if args.xlsx:
        # Convert time to ms for display
        df["avg_time_ms"] = df["avg_time_s"] * 1000 if df["avg_time_s"].notnull().any() else df["avg_time_s"]
        # Pivot tables
        speed_table = df.pivot_table(
            index=["dtype", "shape"],
            columns="config",
            values="avg_time_ms",
        ).reset_index()
        memory_table = df.pivot_table(
            index=["dtype", "shape"],
            columns="config",
            values="peak_memory_MiB",
        ).reset_index()
        with pd.ExcelWriter("benchmark_results.xlsx") as writer:
            speed_table.to_excel(writer, sheet_name="speed_results", index=False)
            memory_table.to_excel(writer, sheet_name="memory_results", index=False)
        print("Saved Excel output to benchmark_results.xlsx")


if __name__ == "__main__":
    main()
```
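Two usage notes: the NVTX ranges only become visible under a profiler (e.g. `nsys profile python <script>.py`), and the `--xlsx` path needs an Excel engine such as `openpyxl` installed for `pd.ExcelWriter`. As a hypothetical post-processing step (not part of the gist), the results frame also makes it easy to express each config as a speedup over the for-loop baseline:

```python
# Hypothetical add-on: derive a speedup-vs-baseline column from the results df.
# "fused=False, foreach=False" is the for-loop (single-tensor) reference path.
baseline = df[df["config"] == "fused=False, foreach=False"].set_index(
    ["dtype", "shape"]
)["avg_time_s"]

def speedup_vs_forloop(row):
    ref = baseline.get((row["dtype"], row["shape"]))
    if ref and row["avg_time_s"]:
        return ref / row["avg_time_s"]
    return None

df["speedup_vs_forloop"] = df.apply(speedup_vs_forloop, axis=1)
```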
It is odd that the Adagrad fused results look different from the AdamW ones (where, as expected, fused > foreach > for-loop)...
Yep
I switched computers to an AMD EPYC 7B13 and an RTX 4090, so I'll attach retests for Adagrad as well.
Adagrad (lr=0.01):


AdamW (lr=0.01):


Also, I wanted to note that these timings vary from run to run; maybe on a larger model the differences would be more evident. One way to damp that noise is sketched below.
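As a hedged suggestion (not something benchmarked in this thread), `torch.utils.benchmark.Timer` can absorb some of that run-to-run variance, since it handles CUDA synchronization and warmup internally:

```python
# Sketch only: re-time a single optimizer step with torch.utils.benchmark.Timer,
# which synchronizes CUDA and manages warmup internally. The shape and
# hyperparameters here are illustrative, not the ones from the gist; assumes a
# PyTorch build where Adagrad(fused=True) is available.
import torch
from torch.utils.benchmark import Timer

params = [torch.randn(2048, 2048, device="cuda", requires_grad=True) for _ in range(10)]
for p in params:
    p.grad = torch.randn_like(p)
opt = torch.optim.Adagrad(params, lr=0.01, fused=True)

timer = Timer(stmt="opt.step()", globals={"opt": opt})
print(timer.timeit(100))  # Measurement with the mean time per step
```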