```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyModel(nn.Module):
    def __init__(self, D=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(D, D), nn.ReLU(), nn.Linear(D, D))

    def forward(self, x):
        return self.net(x)


def get_model_and_optimizer(D=128, lr=1e-2, eps=1e-10, fused=False, foreach=False, dtype=torch.float32):
    model = TinyModel(D).to(dtype=dtype, device="cuda")
    optimizer = torch.optim.Adagrad(
        model.parameters(),
        lr=lr,
        eps=eps,
        fused=fused,
        foreach=foreach,
    )
    return model, optimizer


def step_model(model, optimizer, x, target):
    optimizer.zero_grad(set_to_none=True)
    loss = F.mse_loss(model(x), target)
    loss.backward()
    optimizer.step()
    return loss.item()


def test_correctness(dtype=torch.float32):
    torch.manual_seed(0)
    D = 64
    lr = 1e-2
    eps = 1e-10
    x = torch.randn(32, D, device="cuda", dtype=dtype)
    target = torch.randn(32, D, device="cuda", dtype=dtype)

    # Reference (for-loop), foreach, and fused variants
    ref_model, ref_opt = get_model_and_optimizer(D, lr, eps, fused=False, foreach=False, dtype=dtype)
    foreach_model, foreach_opt = get_model_and_optimizer(D, lr, eps, fused=False, foreach=True, dtype=dtype)
    fused_model, fused_opt = get_model_and_optimizer(D, lr, eps, fused=True, foreach=False, dtype=dtype)

    # Sync initial weights
    for p_ref, p_foreach, p_fused in zip(ref_model.parameters(), foreach_model.parameters(), fused_model.parameters()):
        p_foreach.data.copy_(p_ref.data)
        p_fused.data.copy_(p_ref.data)

    configs = [
        ("foreach=False, fused=False", ref_model, ref_opt),
        ("foreach=True, fused=False", foreach_model, foreach_opt),
        ("foreach=False, fused=True ", fused_model, fused_opt),
    ]
    for name, model, optimizer in configs:
        losses = []
        for _ in range(5):
            losses.append(step_model(model, optimizer, x, target))
        print(f"{name} : {losses}")


if __name__ == "__main__":
    dtypes = [torch.float64, torch.float32, torch.float16, torch.bfloat16]
    for dtype in dtypes:
        print(f"\ndtype={dtype}")
        test_correctness(dtype=dtype)
```
```python
import argparse
from itertools import product

import pandas as pd
import torch
import torch.cuda.nvtx as nvtx
from torch.optim import Adagrad


def parse_args():
    parser = argparse.ArgumentParser(
        description="Adagrad GPU benchmark with optional Excel output"
    )
    parser.add_argument(
        "--xlsx",
        action="store_true",
        help="Write results to an Excel .xlsx file",
    )
    return parser.parse_args()


def ensure_cuda():
    if not torch.cuda.is_available():
        raise RuntimeError("This script requires CUDA, but CUDA is not available.")


def benchmark_adagrad(config, dtype, shape, num_steps=100):
    device = torch.device("cuda")
    params = [
        torch.randn(shape, dtype=dtype, device=device, requires_grad=True)
        for _ in range(10)
    ]
    try:
        optimizer = Adagrad(
            params, lr=0.01, fused=config["fused"], foreach=config["foreach"]
        )
    except Exception as e:
        return None, None, str(e)

    # Warm-up so lazy state initialization does not pollute the timed region.
    for _ in range(10):
        for p in params:
            p.grad = torch.randn_like(p)
        optimizer.step()
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats(device)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    total_time_ms = 0.0

    nvtx.range_push(f"{config['label']}-{dtype}-{shape}")
    for _ in range(num_steps):
        for p in params:
            p.grad = torch.randn_like(p)
        start_event.record()
        optimizer.step()
        end_event.record()
        torch.cuda.synchronize()
        total_time_ms += start_event.elapsed_time(end_event)
    nvtx.range_pop()

    peak_bytes = torch.cuda.max_memory_allocated(device)
    peak_mib = peak_bytes / (1024 ** 2)
    avg_time_s = (total_time_ms / num_steps) / 1000.0
    return avg_time_s, peak_mib, None


def main():
    args = parse_args()
    ensure_cuda()
    configs = [
        {"fused": False, "foreach": False, "label": "fused=False, foreach=False"},
        {"fused": False, "foreach": True, "label": "fused=False, foreach=True"},
        {"fused": True, "foreach": False, "label": "fused=True, foreach=False"},
    ]
    dtypes = [torch.float16, torch.float32, torch.float64, torch.bfloat16]
    shapes = [(50257, 768), (2048, 2048), (768, 3072), (4096,)]
    results = []
    for dtype, shape in product(dtypes, shapes):
        print(f"\nTesting Dtype: {dtype}, Shape: {shape}")
        for config in configs:
            print(f"Config: {config['label']}")
            avg_time, peak_mem, error = benchmark_adagrad(config, dtype, shape)
            status = "Success" if error is None else "Failed"
            print(
                f"  Time: {avg_time if avg_time is not None else 'N/A'} s, "
                f"Peak Mem: {peak_mem if peak_mem is not None else 'N/A'} MiB, "
                f"Status: {status}"
            )
            results.append(
                {
                    "dtype": str(dtype),
                    "shape": str(shape),
                    "config": config["label"],
                    "avg_time_s": avg_time,
                    "peak_memory_MiB": peak_mem,
                    "status": status,
                    "error": error,
                }
            )
    df = pd.DataFrame(results)
    print("\nRaw Results:")
    print(df)
    if args.xlsx:
        # Convert time to ms for display (NaN entries stay NaN).
        df["avg_time_ms"] = df["avg_time_s"] * 1000
        # One pivot table per metric: rows are (dtype, shape), columns are configs.
        speed_table = df.pivot_table(
            index=["dtype", "shape"],
            columns="config",
            values="avg_time_ms",
        ).reset_index()
        memory_table = df.pivot_table(
            index=["dtype", "shape"],
            columns="config",
            values="peak_memory_MiB",
        ).reset_index()
        with pd.ExcelWriter("benchmark_results.xlsx") as writer:
            speed_table.to_excel(writer, sheet_name="speed_results", index=False)
            memory_table.to_excel(writer, sheet_name="memory_results", index=False)
        print("Saved Excel output to benchmark_results.xlsx")


if __name__ == "__main__":
    main()
```
There should be existing tests for correctness that you can follow; for a pointer, look at how Adam is tested in `test_optim.py` and `common_optimizers.py`.
@janeyx99 Aren't these automatically run? When I run `pytest test/test_optim.py -vv -k Adagrad`, the following tests run (and pass). Are these the tests you are referencing?
```
test/test_optim.py::TestOptimRenewedCUDA::test_fused_matches_forloop_Adagrad_cuda_bfloat16 PASSED [0.5672s] [ 71%]
test/test_optim.py::TestOptimRenewedCUDA::test_fused_matches_forloop_Adagrad_cuda_float16 PASSED [0.5884s] [ 72%]
test/test_optim.py::TestOptimRenewedCUDA::test_fused_matches_forloop_Adagrad_cuda_float32 PASSED [0.5869s] [ 72%]
test/test_optim.py::TestOptimRenewedCUDA::test_fused_matches_forloop_Adagrad_cuda_float64 PASSED [0.5778s] [ 73%]
```
Ah yes! That's great that our test infra just picked that up :D I'm guessing then that `test_fused_cpu_matches_cuda` is also passing?
For the benchmarks, do you mind computing a similar table for AdamW to see if the fused is slower in a similar pattern for bf16 and fp16 dtypes?
> I'm guessing then that `test_fused_cpu_matches_cuda` is also passing?
Yep
> For the benchmarks, do you mind computing a similar table for AdamW to see if the fused is slower in a similar pattern for bf16 and fp16 dtypes?
I switched computers to an AMD EPYC 7B13 and an RTX 4090, so I'll attach retests for Adagrad as well.
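For the AdamW table, only the optimizer constructor needs to change; a minimal sketch of the swap, where `make_optimizer` is a hypothetical helper and `AdamW` accepts the same `fused`/`foreach` flags as `Adagrad`:

```python
import torch
from torch.optim import Adagrad, AdamW

# Hypothetical generalization of benchmark_adagrad: take the optimizer class
# as a parameter instead of hard-coding Adagrad.
def make_optimizer(opt_cls, params, config, lr=0.01):
    return opt_cls(params, lr=lr, fused=config["fused"], foreach=config["foreach"])

# e.g. inside the benchmark:
params = [torch.randn(1024, 1024, device="cuda", requires_grad=True)]
config = {"fused": True, "foreach": False}
optimizer = make_optimizer(AdamW, params, config)  # pass Adagrad to reproduce the tables above
```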
Also, I wanted to note that these timings are variable from run to run. Maybe on a larger model, the differences would be more evident.
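One way to tighten the run-to-run variance is `torch.utils.benchmark.Timer`, which handles CUDA synchronization and adaptive replication; a minimal sketch under the same setup assumptions as the script above (sizes and lr are placeholders):

```python
import torch
from torch.optim import Adagrad
from torch.utils.benchmark import Timer

params = [torch.randn(2048, 2048, device="cuda", requires_grad=True) for _ in range(10)]
for p in params:
    p.grad = torch.randn_like(p)
optimizer = Adagrad(params, lr=0.01, fused=True)
optimizer.step()  # warm-up: materializes optimizer state before timing

t = Timer(stmt="optimizer.step()", globals={"optimizer": optimizer})
print(t.blocked_autorange(min_run_time=1.0))  # median over adaptively chosen runs
```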
It is weird that the Adagrad fused results look different from the AdamW ones (which show the expected ordering of fused > foreach > for-loop)...





Benchmark results on a Ryzen 9 7950X3D and NVIDIA GeForce RTX 4080.