Stress GPU VRAM
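For a quick sanity check before running the full script below, a minimal sketch along the same lines (assuming PyTorch with CUDA support and a GPU with at least ~4 GiB free; the 4 GiB figure is only an illustrative choice) claims a single fixed-size tensor and holds it until you press Enter:

import torch

# Minimal sketch: claim ~4 GiB of VRAM on cuda:0 and hold it until Enter is pressed.
buf = torch.empty(4 * 1024**3, dtype=torch.uint8, device="cuda:0")  # uint8 = 1 byte/element
buf[:1] = 0  # touch the tensor so the allocation is actually realized
input("Holding ~4 GiB of VRAM; press Enter to release and exit.")

The full script below does the same thing chunk by chunk, which makes it easy to see how far the allocation gets before an out-of-memory error is hit.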
#!/usr/bin/env python3
"""
allocate_vram.py: attempt to allocate ~80 GiB of VRAM with PyTorch.

Usage examples:
    python allocate_vram.py                   # try for 80 GiB on cuda:0
    python allocate_vram.py --target-gib 78   # pick a different target
    python allocate_vram.py --chunk-gib 2     # bigger allocation chunks
    python allocate_vram.py --device cuda:1   # choose another GPU
"""
import os

# Helps PyTorch reduce fragmentation for large allocations (set before torch import)
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import argparse
import time

import torch


def fmt_bytes(b: int) -> str:
    return f"{b / 1024**3:.2f} GiB"


def main():
    parser = argparse.ArgumentParser(description="Allocate lots of VRAM with PyTorch.")
    parser.add_argument("--device", type=str, default="cuda:0",
                        help="CUDA device like cuda:0, cuda:1, etc.")
    parser.add_argument("--target-gib", type=float, default=80.0,
                        help="Target VRAM to allocate in GiB (binary, 1024^3).")
    parser.add_argument("--chunk-gib", type=float, default=1.0,
                        help="Allocate in chunks of this many GiB.")
    parser.add_argument("--hold-seconds", type=float, default=10.0,
                        help="How long to keep memory allocated before freeing.")
    parser.add_argument("--dtype", type=str, default="uint8",
                        choices=["uint8", "float16", "bfloat16", "float32", "float64"],
                        help="Tensor dtype to use for allocations (uint8 = 1 byte/element).")
    args = parser.parse_args()

    if not torch.cuda.is_available():
        raise SystemExit("CUDA is not available. Make sure you have a CUDA-capable GPU and drivers installed.")

    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # Clear any cached allocator state for a clean start
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device)

    # Map dtype string to actual torch dtype
    dtype_map = {
        "uint8": torch.uint8,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
        "float64": torch.float64,
    }
    dtype = dtype_map[args.dtype]
    elem_size = torch.empty((), dtype=dtype).element_size()

    # Show device info
    props = torch.cuda.get_device_properties(device)
    total_vram = props.total_memory  # bytes
    free_before, _ = torch.cuda.mem_get_info()  # free/total as reported by the CUDA driver
    print(f"Device       : {props.name} ({args.device})")
    print(f"Total VRAM   : {fmt_bytes(total_vram)}")
    print(f"Free (start) : {fmt_bytes(free_before)} (driver-reported)\n")

    target_bytes = int(args.target_gib * (1024 ** 3))
    chunk_bytes = int(args.chunk_gib * (1024 ** 3))
    if chunk_bytes <= 0:
        raise SystemExit("--chunk-gib must be positive")

    print(f"Target alloc : {fmt_bytes(target_bytes)} with chunks of {fmt_bytes(chunk_bytes)}; "
          f"dtype={args.dtype} (elem={elem_size} B)")

    tensors = []
    allocated_bytes = 0
    num_chunks = 0
    try:
        while allocated_bytes < target_bytes:
            remaining = target_bytes - allocated_bytes
            this_bytes = min(chunk_bytes, remaining)
            # Convert bytes to element count (round up)
            numel = (this_bytes + elem_size - 1) // elem_size
            try:
                t = torch.empty(numel, dtype=dtype, device=device)
                # Access a small slice to ensure the allocation is realized
                t[:1] = 0
                tensors.append(t)
                num_chunks += 1
                added = t.numel() * elem_size
                allocated_bytes += added
                reserved_now = torch.cuda.memory_reserved(device)
                allocated_now = torch.cuda.memory_allocated(device)
                print(f"[{num_chunks:03d}] Allocated +{fmt_bytes(added)} "
                      f"(total allocated={fmt_bytes(allocated_now)}, reserved={fmt_bytes(reserved_now)})")
            except RuntimeError as e:
                # Typical PyTorch OOM is a RuntimeError containing "out of memory"
                if "out of memory" in str(e).lower():
                    print(f"OOM reached after {fmt_bytes(allocated_bytes)}. "
                          f"Cannot allocate next chunk of {fmt_bytes(this_bytes)}.")
                    break
                else:
                    raise

        torch.cuda.synchronize()
        peak_alloc = torch.cuda.max_memory_allocated(device)
        peak_res = torch.cuda.max_memory_reserved(device)
        free_after, _ = torch.cuda.mem_get_info()

        print("\n=== Summary ===")
        print(f"Requested      : {fmt_bytes(target_bytes)}")
        print(f"Allocated      : {fmt_bytes(allocated_bytes)} across {num_chunks} chunk(s)")
        print(f"Peak allocated : {fmt_bytes(peak_alloc)}")
        print(f"Peak reserved  : {fmt_bytes(peak_res)}")
        print(f"Free (end)     : {fmt_bytes(free_after)} (driver-reported)")

        if allocated_bytes >= target_bytes:
            print("\nSUCCESS: Target met or exceeded. Holding allocation briefly...")
        else:
            print("\nPARTIAL: Could not reach the full target (likely device limit or fragmentation).")
        time.sleep(max(0.0, args.hold_seconds))
    finally:
        # Cleanup
        tensors.clear()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("Freed allocations and cleared CUDA cache.")


if __name__ == "__main__":
    main()
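While the script is holding its allocation (see --hold-seconds), the claimed memory can be watched from a second terminal, for example with nvidia-smi or a small NVML loop. A minimal monitoring sketch, assuming the nvidia-ml-py package (imported as pynvml) is installed and that device index 0 matches the --device argument:

import time
import pynvml  # assumption: the nvidia-ml-py package is installed

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # assumption: index 0 matches --device cuda:0
try:
    for _ in range(30):  # sample once per second for 30 seconds
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(f"used={info.used / 1024**3:.2f} GiB  free={info.free / 1024**3:.2f} GiB")
        time.sleep(1.0)
finally:
    pynvml.nvmlShutdown()

The driver-level "used" figure reported this way should roughly track the reserved figure printed by the script, since PyTorch's caching allocator reserves memory from the driver and then hands it out to individual tensors.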