Skip to content

Instantly share code, notes, and snippets.

@MeetThePatel
Last active May 13, 2025 17:30
Show Gist options
  • Select an option

  • Save MeetThePatel/b7e93f3d3b65a67a09f8b02440a8cef9 to your computer and use it in GitHub Desktop.

Select an option

Save MeetThePatel/b7e93f3d3b65a67a09f8b02440a8cef9 to your computer and use it in GitHub Desktop.
Adagrad(fused=True) benchmark script
import torch
import torch.nn as nn
import torch.nn.functional as F
class TinyModel(nn.Module):
    """Two-layer perceptron: ``Linear(D, D) -> ReLU -> Linear(D, D)``."""

    def __init__(self, D=128):
        """Create the layer stack; ``D`` is both the input and output width."""
        super().__init__()
        layers = [nn.Linear(D, D), nn.ReLU(), nn.Linear(D, D)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the output."""
        out = self.net(x)
        return out
def get_model_and_optimizer(
    D=128,
    lr=1e-2,
    eps=1e-10,
    fused=False,
    foreach=False,
    dtype=torch.float32,
    device="cuda",
):
    """Build a ``TinyModel`` and an Adagrad optimizer over its parameters.

    Args:
        D: width passed to ``TinyModel``.
        lr: learning rate forwarded to ``torch.optim.Adagrad``.
        eps: epsilon forwarded to ``torch.optim.Adagrad``.
        fused: forwarded to Adagrad's ``fused`` argument.
        foreach: forwarded to Adagrad's ``foreach`` argument.
        dtype: dtype the model is converted to.
        device: device the model is moved to. Defaults to ``"cuda"``, which
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    model = TinyModel(D).to(dtype=dtype, device=device)
    optimizer = torch.optim.Adagrad(
        model.parameters(),
        lr=lr,
        eps=eps,
        fused=fused,
        foreach=foreach,
    )
    return model, optimizer
def step_model(model, optimizer, x, target):
    """Perform one optimisation step and return the scalar MSE loss.

    Gradients are cleared (set to ``None``), the mean-squared error between
    ``model(x)`` and ``target`` is back-propagated, and the optimizer is
    stepped once.

    Returns:
        The loss as a Python float, computed before the parameter update.
    """
    optimizer.zero_grad(set_to_none=True)
    prediction = model(x)
    mse = F.mse_loss(prediction, target)
    mse.backward()
    optimizer.step()
    return mse.item()
def test_correctness(dtype=torch.float32):
    """Print five-step loss traces for the three Adagrad implementations.

    Three models are built via ``get_model_and_optimizer`` (for-loop,
    ``foreach=True`` and ``fused=True``), given identical initial weights,
    and each is stepped five times on the same random batch. The per-step
    losses are printed, one line per configuration, for visual comparison;
    nothing is asserted.

    Args:
        dtype: dtype used for the inputs, targets and model parameters.
    """
    torch.manual_seed(0)
    D = 64
    lr = 1e-2
    eps = 1e-10
    x = torch.randn(32, D, device="cuda", dtype=dtype)
    target = torch.randn(32, D, device="cuda", dtype=dtype)

    # Reference
    ref_model, ref_opt = get_model_and_optimizer(D, lr, eps, fused=False, foreach=False, dtype=dtype)
    foreach_model, foreach_opt = get_model_and_optimizer(D, lr, eps, fused=False, foreach=True, dtype=dtype)
    fused_model, fused_opt = get_model_and_optimizer(D, lr, eps, fused=True, foreach=False, dtype=dtype)

    # Sync initial weights. The in-place copy runs under no_grad instead of
    # going through ``.data``, which bypasses autograd's tracking.
    with torch.no_grad():
        for p_ref, p_foreach, p_fused in zip(
            ref_model.parameters(), foreach_model.parameters(), fused_model.parameters()
        ):
            p_foreach.copy_(p_ref)
            p_fused.copy_(p_ref)

    configs = [
        ("foreach=False, fused=False", ref_model, ref_opt),
        ("foreach=True, fused=False", foreach_model, foreach_opt),
        ("foreach=False, fused=True ", fused_model, fused_opt),
    ]
    for name, model, optimizer in configs:
        losses = [step_model(model, optimizer, x, target) for _ in range(5)]
        print(f"{name} : {losses}")
if __name__ == "__main__":
    # Run the comparison once per floating-point dtype.
    dtypes = (torch.float64, torch.float32, torch.float16, torch.bfloat16)
    for dtype in dtypes:
        print(f"\ndtype={dtype}")
        test_correctness(dtype=dtype)
import torch
from itertools import product
from torch.optim import Adagrad
import torch.cuda.nvtx as nvtx
import argparse
import pandas as pd
def parse_args():
    """Parse the benchmark's command-line options.

    Returns:
        An ``argparse.Namespace`` whose boolean ``xlsx`` attribute is True
        when ``--xlsx`` was passed on the command line.
    """
    cli = argparse.ArgumentParser(description="Adagrad GPU benchmark with optional Excel output")
    cli.add_argument("--xlsx", action="store_true", help="Write results to an Excel .xlsx file")
    namespace = cli.parse_args()
    return namespace
def ensure_cuda():
    """Raise ``RuntimeError`` unless ``torch.cuda.is_available()`` is true."""
    if torch.cuda.is_available():
        return
    raise RuntimeError("This script requires CUDA, but CUDA is not available.")
def benchmark_adagrad(config, dtype, shape, num_steps=100):
    """Time ``Adagrad.step()`` on CUDA for one optimizer configuration.

    Ten random tensors of the given ``shape`` and ``dtype`` serve as
    parameters. After ten untimed warm-up steps, ``num_steps`` steps are
    timed one by one with CUDA events. Fresh random gradients are assigned
    before each step, outside the timed region. Peak-memory statistics are
    reset after warm-up and read after the timed loop, so they include the
    gradient tensors allocated inside that loop.

    Args:
        config: mapping with keys ``"fused"`` and ``"foreach"`` (forwarded
            to ``Adagrad``) and ``"label"`` (used in the NVTX range name).
        dtype: dtype of the parameter tensors.
        shape: shape of each parameter tensor.
        num_steps: number of timed optimizer steps.

    Returns:
        ``(avg_time_s, peak_mib, None)`` on success, where ``avg_time_s`` is
        the mean step time in seconds and ``peak_mib`` the peak allocated
        CUDA memory in MiB. If constructing the optimizer raises, returns
        ``(None, None, str(exception))``.
    """
    device = torch.device("cuda")
    params = [
        torch.randn(shape, dtype=dtype, device=device, requires_grad=True)
        for _ in range(10)
    ]
    try:
        optimizer = Adagrad(
            params, lr=0.01, fused=config["fused"], foreach=config["foreach"]
        )
    except Exception as e:
        # Construction failures are reported to the caller as an error
        # string instead of aborting the run.
        return None, None, str(e)

    for _ in range(10):  # warm-up
        for p in params:
            p.grad = torch.randn_like(p)
        optimizer.step()
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats(device)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    total_time_ms = 0.0

    nvtx.range_push(f"{config['label']}-{dtype}-{shape}")
    try:
        for _ in range(num_steps):
            for p in params:
                p.grad = torch.randn_like(p)
            start_event.record()
            optimizer.step()
            end_event.record()
            # Wait for the step to finish so elapsed_time() is valid.
            torch.cuda.synchronize()
            total_time_ms += start_event.elapsed_time(end_event)
    finally:
        # Always close the NVTX range, even if a step raises, so profiler
        # ranges stay balanced.
        nvtx.range_pop()

    peak_bytes = torch.cuda.max_memory_allocated(device)
    peak_mib = peak_bytes / (1024 ** 2)
    avg_time_s = (total_time_ms / num_steps) / 1000.0
    return avg_time_s, peak_mib, None
def main():
    """Benchmark Adagrad across configurations, dtypes and shapes.

    Parses the command line, checks that CUDA is available, and runs
    ``benchmark_adagrad`` for every (dtype, shape, config) combination. The
    collected rows are printed as a DataFrame. With ``--xlsx``, speed (in
    ms) and peak-memory pivot tables are also written to
    ``benchmark_results.xlsx``.
    """
    args = parse_args()
    ensure_cuda()

    configs = [
        {"fused": False, "foreach": False, "label": "fused=False, foreach=False"},
        {"fused": False, "foreach": True, "label": "fused=False, foreach=True"},
        {"fused": True, "foreach": False, "label": "fused=True, foreach=False"},
    ]
    dtypes = [torch.float16, torch.float32, torch.float64, torch.bfloat16]
    shapes = [(50257, 768), (2048, 2048), (768, 3072), (4096,)]

    results = []
    for dtype, shape in product(dtypes, shapes):
        print(f"\nTesting Dtype: {dtype}, Shape: {shape}")
        for config in configs:
            print(
                f"Config: {config['label']} (fused={config['fused']}, foreach={config['foreach']})"
            )
            avg_time, peak_mem, error = benchmark_adagrad(config, dtype, shape)
            status = "Success" if error is None else "Failed"
            # Test against None explicitly so a legitimate 0.0 measurement
            # is not reported as 'N/A'.
            time_text = avg_time if avg_time is not None else "N/A"
            mem_text = peak_mem if peak_mem is not None else "N/A"
            print(
                f" Time: {time_text} s, Peak Mem: {mem_text} MiB, Status: {status}"
            )
            results.append(
                {
                    "dtype": str(dtype),
                    "shape": str(shape),
                    "config": config["label"],
                    "avg_time_s": avg_time,
                    "peak_memory_MiB": peak_mem,
                    "status": status,
                    "error": error,
                }
            )

    df = pd.DataFrame(results)
    print("\nRaw Results:")
    print(df)

    if args.xlsx:
        # Convert time to ms for display. When every run failed, the column
        # holds only nulls and is copied through without scaling.
        if df["avg_time_s"].notnull().any():
            df["avg_time_ms"] = df["avg_time_s"] * 1000
        else:
            df["avg_time_ms"] = df["avg_time_s"]

        # Pivot tables
        speed_table = df.pivot_table(
            index=["dtype", "shape"],
            columns="config",
            values="avg_time_ms",
        ).reset_index()
        memory_table = df.pivot_table(
            index=["dtype", "shape"],
            columns="config",
            values="peak_memory_MiB",
        ).reset_index()

        with pd.ExcelWriter("benchmark_results.xlsx") as writer:
            speed_table.to_excel(writer, sheet_name="speed_results", index=False)
            memory_table.to_excel(writer, sheet_name="memory_results", index=False)
        print("Saved Excel output to benchmark_results.xlsx")


if __name__ == "__main__":
    main()
@janeyx99
Copy link

It is odd that the fused Adagrad results look different from the AdamW ones, where the expected ordering is fused > foreach > forloop...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment