@traversaro
Last active October 17, 2025 13:16
Stress GPU VRAM
#!/usr/bin/env python3
"""
allocate_vram.py — attempt to allocate ~80 GiB of VRAM with PyTorch.
Usage examples:
python allocate_vram.py # try for 80 GiB on cuda:0
python allocate_vram.py --target-gib 78 # pick a different target
python allocate_vram.py --chunk-gib 2 # bigger allocation chunks
python allocate_vram.py --device cuda:1 # choose another GPU
"""
import os
# Helps PyTorch reduce fragmentation for large allocations (set before torch import)
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
import argparse
import time
import torch
def fmt_bytes(b: int) -> str:
    return f"{b/1024**3:.2f} GiB"


def main():
    parser = argparse.ArgumentParser(description="Allocate lots of VRAM with PyTorch.")
    parser.add_argument("--device", type=str, default="cuda:0",
                        help="CUDA device like cuda:0, cuda:1, etc.")
    parser.add_argument("--target-gib", type=float, default=80.0,
                        help="Target VRAM to allocate in GiB (binary, 1024^3).")
    parser.add_argument("--chunk-gib", type=float, default=1.0,
                        help="Allocate in chunks of this many GiB.")
    parser.add_argument("--hold-seconds", type=float, default=10.0,
                        help="How long to keep memory allocated before freeing.")
    parser.add_argument("--dtype", type=str, default="uint8",
                        choices=["uint8", "float16", "bfloat16", "float32", "float64"],
                        help="Tensor dtype to use for allocations (uint8 = 1 byte/element).")
    args = parser.parse_args()

    if not torch.cuda.is_available():
        raise SystemExit("CUDA is not available. Make sure you have a CUDA-capable GPU and drivers installed.")

    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # Clear any cached allocator state for a clean start
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device)

    # Map dtype string to actual torch dtype
    dtype_map = {
        "uint8": torch.uint8,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
        "float64": torch.float64,
    }
    dtype = dtype_map[args.dtype]
    elem_size = torch.empty((), dtype=dtype).element_size()

    # Show device info
    props = torch.cuda.get_device_properties(device)
    total_vram = props.total_memory  # bytes
    free_before, total_reported = torch.cuda.mem_get_info()  # free/total usable by allocator
    print(f"Device       : {props.name} ({args.device})")
    print(f"Total VRAM   : {fmt_bytes(total_vram)}")
    print(f"Free (start) : {fmt_bytes(free_before)} (allocator-reported)\n")

    target_bytes = int(args.target_gib * (1024 ** 3))
    chunk_bytes = int(args.chunk_gib * (1024 ** 3))
    if chunk_bytes <= 0:
        raise SystemExit("--chunk-gib must be positive")

    print(f"Target alloc : {fmt_bytes(target_bytes)} with chunks of {fmt_bytes(chunk_bytes)}; dtype={args.dtype} (elem={elem_size} B)")

    tensors = []
    allocated_bytes = 0
    num_chunks = 0

    try:
        while allocated_bytes < target_bytes:
            remaining = target_bytes - allocated_bytes
            this_bytes = min(chunk_bytes, remaining)
            # Convert bytes to element count (round up)
            numel = (this_bytes + elem_size - 1) // elem_size
            try:
                t = torch.empty(numel, dtype=dtype, device=device)
                # Access a small slice to ensure the allocation is realized
                t[:1] = 0
                tensors.append(t)
                num_chunks += 1
                added = t.numel() * elem_size
                allocated_bytes += added
                reserved_now = torch.cuda.memory_reserved(device)
                allocated_now = torch.cuda.memory_allocated(device)
                print(f"[{num_chunks:03d}] Allocated +{fmt_bytes(added)} "
                      f"(total allocated={fmt_bytes(allocated_now)}, reserved={fmt_bytes(reserved_now)})")
            except RuntimeError as e:
                # Typical PyTorch OOM is RuntimeError with "out of memory"
                if "out of memory" in str(e).lower():
                    print(f"OOM reached after {fmt_bytes(allocated_bytes)}. "
                          f"Cannot allocate next chunk of {fmt_bytes(this_bytes)}.")
                    break
                else:
                    raise

        torch.cuda.synchronize()
        peak_alloc = torch.cuda.max_memory_allocated(device)
        peak_res = torch.cuda.max_memory_reserved(device)
        free_after, _ = torch.cuda.mem_get_info()

        print("\n=== Summary ===")
        print(f"Requested      : {fmt_bytes(target_bytes)}")
        print(f"Allocated      : {fmt_bytes(allocated_bytes)} across {num_chunks} chunk(s)")
        print(f"Peak allocated : {fmt_bytes(peak_alloc)}")
        print(f"Peak reserved  : {fmt_bytes(peak_res)}")
        print(f"Free (end)     : {fmt_bytes(free_after)} (allocator-reported)")

        if allocated_bytes >= target_bytes:
            print("\nSUCCESS: Target met or exceeded. Holding allocation briefly...")
        else:
            print("\nPARTIAL: Could not reach the full target (likely device limit or fragmentation).")

        time.sleep(max(0.0, args.hold_seconds))
    finally:
        # Cleanup
        tensors.clear()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("Freed allocations and cleared CUDA cache.")


if __name__ == "__main__":
    main()
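The 80 GiB default only fits on 80 GB-class cards; on anything smaller the run ends in the PARTIAL branch. Below is a minimal sketch of how one might size --target-gib from the free memory reported by torch.cuda.mem_get_info() before launching the script. The file name, helper name, the 1 GiB headroom, and the subprocess call are illustrative assumptions, not part of the gist (it only assumes the script above is saved locally as allocate_vram.py).

#!/usr/bin/env python3
"""size_target.py — sketch: derive a --target-gib that fits the current GPU."""
import subprocess
import torch


def suggested_target_gib(device: str = "cuda:0", headroom_gib: float = 1.0) -> float:
    """Return the free VRAM on `device` minus a small headroom, in GiB."""
    free_bytes, _total_bytes = torch.cuda.mem_get_info(torch.device(device))
    return max(0.0, free_bytes / 1024**3 - headroom_gib)


if __name__ == "__main__":
    if not torch.cuda.is_available():
        raise SystemExit("CUDA is not available.")
    target = suggested_target_gib()
    print(f"Suggested --target-gib: {target:.1f}")
    # Hand the value to the stress script (assumes it is saved as allocate_vram.py).
    subprocess.run(
        ["python", "allocate_vram.py", "--target-gib", f"{target:.1f}"],
        check=True,
    )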