


Garrett Goon (garrett361)

garrett361 / out.txt
Created August 16, 2025 15:47
block decomposition test
R=tensor([[ 0.0673+0.j, -0.6867+0.j, 0.7238+0.j],
[-0.4553+0.j, -0.6666+0.j, -0.5901+0.j],
[ 0.8878+0.j, -0.2899+0.j, -0.3575+0.j]])
R@R.T=tensor([[ 1.0000e+00+0.j, -2.7138e-08+0.j, -2.7308e-08+0.j],
[-2.7138e-08+0.j, 1.0000e+00+0.j, -3.7264e-08+0.j],
[-2.9802e-08+0.j, -4.4703e-08+0.j, 1.0000e+00+0.j]])
eigvals=tensor([ 1.0000+1.4901e-08j, -0.9785+2.0648e-01j, -0.9785-2.0648e-01j])
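
The output above is an orthogonality check: R times its transpose is the identity to within ~1e-8, and every eigenvalue has unit modulus, as expected for an orthogonal matrix. A minimal sketch of the same check (not the gist's code; the QR construction and seed are assumptions):

import torch

# Sketch only: build a random orthogonal matrix via QR, cast to complex to
# mirror the gist's printout, then verify R @ R.T ~ I and |eigvals| ~ 1.
torch.manual_seed(0)
R, _ = torch.linalg.qr(torch.randn(3, 3))  # Q factor is orthogonal
R = R.to(torch.complex64)

identity_err = (R @ R.T - torch.eye(3, dtype=R.dtype)).abs().max()
eigvals = torch.linalg.eigvals(R)
print(f"{identity_err=}")   # ~1e-7 at single precision
print(f"{eigvals.abs()=}")  # all ~1.0 for an orthogonal matrix
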
garrett361 / out.txt
Created June 5, 2025 14:35
set_reduce_scatter_divide_factor Error
torchrun --nproc-per-node 2 set_div_err.py
W0605 14:34:10.112000 783116 torch/distributed/run.py:766]
W0605 14:34:10.112000 783116 torch/distributed/run.py:766] *****************************************
W0605 14:34:10.112000 783116 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0605 14:34:10.112000 783116 torch/distributed/run.py:766] *****************************************
Running with trivial mp_policy
Passed on RANK=0 with mp_policy=MixedPrecisionPolicy(param_dtype=None, reduce_dtype=None, output_dtype=None, cast_forward_inputs=True)
Passed on RANK=1 with mp_policy=MixedPrecisionPolicy(param_dtype=None, reduce_dtype=None, output_dtype=None, cast_forward_inputs=True)
Running with non-trivial mp_policy
FAILED on RANK=0 with mp_policy=MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, output
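
The passing case is the default (all-None) policy; the failure appears only with the bf16 MixedPrecisionPolicy. A hypothetical minimal shape of such a repro, assuming FSDP2's fully_shard and the set_reduce_scatter_divide_factor method named in the gist title (set_div_err.py itself is not shown):

import torch
import torch.distributed as dist
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard

# Hypothetical sketch: shard a tiny model with a bf16 policy, set a
# reduce-scatter divide factor (method name taken from the gist title),
# then run forward/backward. Launch: torchrun --nproc-per-node 2 repro.py
if __name__ == "__main__":
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    model = torch.nn.Linear(8, 8, device="cuda")
    mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16)
    fully_shard(model, mp_policy=mp_policy)
    model.set_reduce_scatter_divide_factor(2.0)

    model(torch.randn(4, 8, device="cuda")).sum().backward()
    dist.destroy_process_group()
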
garrett361 / output.txt
Created April 21, 2025 18:19
test torch._grouped_mm
torch.__version__='2.8.0.dev20250421+cu126'
Passed first test
Traceback (most recent call last):
File "/app/torchtitan/gemm_test.py", line 38, in <module>
test_gemm_backward_fails()
File "/app/torchtitan/gemm_test.py", line 31, in test_gemm_backward_fails
out.sum().backward()
File "/opt/conda/envs/ai/lib/python3.11/site-packages/torch/_tensor.py", line 648, in backward
torch.autograd.backward(
File "/opt/conda/envs/ai/lib/python3.11/site-packages/torch/autograd/__init__.py", line 354, in backward
garrett361 / test_dtensor_slice.py
Last active March 10, 2025 19:49
DTensor slicing
import os
import torch
import torch.distributed as dist
from torch.distributed.tensor import Shard, distribute_tensor
if __name__ == "__main__":
try:
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ["LOCAL_RANK"])
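
The preview cuts off before the DTensor is built. A minimal sketch of sharding a tensor and slicing it (assumptions: gloo/CPU, slicing along the non-sharded dim; not the gist's actual test):

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# Sketch only: shard dim 0 across the world, slice along the non-sharded dim 1,
# and compare against slicing the unsharded tensor.
# Launch: torchrun --nproc-per-node 2 test_dtensor_slice.py
if __name__ == "__main__":
    dist.init_process_group("gloo")
    mesh = init_device_mesh("cpu", (dist.get_world_size(),))

    full = torch.arange(16, dtype=torch.float32).reshape(8, 2)
    dtensor = distribute_tensor(full, mesh, placements=[Shard(0)])

    sliced = dtensor[:, :1]  # slicing a DTensor returns a DTensor
    assert torch.equal(sliced.full_tensor(), full[:, :1])
    dist.destroy_process_group()
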
garrett361 / main.py
Created December 11, 2024 03:03
DTensor double-sharded random
import multiprocessing as mp
import os
import torch
import torch.distributed as dist
from torch.distributed.tensor import distribute_tensor
from torch.distributed.tensor.placement_types import (
Shard,
)
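
"Double-sharded" here presumably means the same tensor dim sharded over both axes of a 2-D device mesh. The sketch below shows that layout on CPU (mesh names and sizes are assumptions, and the gist's randomness check is not reproduced):

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor
from torch.distributed.tensor.placement_types import Shard

# Sketch only: dim 0 sharded over both mesh axes ("double-sharded").
# Launch: torchrun --nproc-per-node 4 main.py
if __name__ == "__main__":
    dist.init_process_group("gloo")
    mesh = init_device_mesh("cpu", (2, 2), mesh_dim_names=("outer", "inner"))

    full = torch.randn(8, 3)
    dtensor = distribute_tensor(full, mesh, placements=[Shard(0), Shard(0)])

    # 8 rows / (2 * 2) ranks = a (2, 3) local shard on each rank.
    print(f"[rank={dist.get_rank()}] local shape={dtensor.to_local().shape}")
    dist.destroy_process_group()
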
garrett361 / overwrite.py
Created November 27, 2024 22:07
overwrite reduce scatter
import argparse
import multiprocessing as mp
import os
import torch
import torch.distributed as dist
def print_rank(s: str) -> None:
s = f'[rank={os.environ["RANK"]}] ' + s
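
The preview stops at the logging helper. For the collective itself, a minimal sketch of reduce_scatter_tensor overwriting a preallocated output buffer (CUDA/NCCL assumed; not the gist's script):

import os

import torch
import torch.distributed as dist

# Sketch only: reduce_scatter_tensor writes into an existing output buffer,
# overwriting whatever it held. Launch: torchrun --nproc-per-node 2 overwrite.py
if __name__ == "__main__":
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    dist.init_process_group("nccl")
    rank, world_size = dist.get_rank(), dist.get_world_size()

    chunk = 4
    inp = torch.full((world_size * chunk,), float(rank), device="cuda")
    out = torch.full((chunk,), -1.0, device="cuda")  # sentinel values to be overwritten
    dist.reduce_scatter_tensor(out, inp, op=dist.ReduceOp.SUM)

    # Every element becomes 0 + 1 + ... + (world_size - 1).
    print(f"[rank={rank}] {out=}")
    dist.destroy_process_group()
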
garrett361 / launch_mpi_min.sh
Last active June 20, 2024 15:40
Sunspot MPI Torch Launch
#!/bin/bash -l
# Minimal mpiexec-based launch script following https://docs.alcf.anl.gov/aurora/data-science/frameworks/pytorch/
# Usage:
#
# qsub [-v [SCRIPT_PATH=your_script_path] [ARGS=...] ] launch_mpi_min.sh
#
# where your_script_path is the absolute path args will be passed to the script
#
garrett361 / build_one_ccl_examples.sh
Created June 7, 2024 18:43
oneCCL build and run benchmark sunspot
#!/bin/bash -l
#PBS -A Aurora_deployment
#PBS -l filesystems=home:gila
#PBS -l select=2
#PBS -l place=scatter
#PBS -l walltime=00:15:00
#PBS -q workq
#PBS -j oe
#PBS -k doe
garrett361 / mp_torch_reduce_scatter.py
Created June 6, 2024 13:28
mp reduce scatter xpu
"""
Launch single-node reduce scatter with multiprocessing.
python3 mp_torch_reduce_scatter.py
"""
import os
import socket
from concurrent.futures import ProcessPoolExecutor
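
A sketch of the same single-node launch pattern on CPU (an all_reduce stands in for the gist's XPU reduce scatter so the example runs anywhere; the port and world size are arbitrary assumptions):

import os
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.distributed as dist

WORLD_SIZE = 2

def worker(rank: int) -> None:
    # Each child process joins the same single-node process group via env vars.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29512"  # arbitrary free port, an assumption
    dist.init_process_group("gloo", rank=rank, world_size=WORLD_SIZE)
    t = torch.full((4,), float(rank))
    dist.all_reduce(t)  # stands in for the gist's reduce scatter
    print(f"[rank={rank}] {t=}")
    dist.destroy_process_group()

if __name__ == "__main__":
    # One process per rank, mirroring the gist's ProcessPoolExecutor launch.
    with ProcessPoolExecutor(max_workers=WORLD_SIZE) as ex:
        list(ex.map(worker, range(WORLD_SIZE)))
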
garrett361 / collective.py
Created June 4, 2024 18:22
Torch Profile Comms Compute Overlap
from abc import ABC, abstractmethod
import torch
import torch.distributed as dist
if torch.cuda.is_available():
accel = torch.cuda
DEVICE_TYPE = "cuda"
BACKEND = "nccl"
else:
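
The preview ends at the device-selection branch. A sketch of the profiling pattern the title describes, i.e. capturing an async collective overlapped with a matmul in the torch profiler (CUDA/NCCL assumed; not the gist's collective.py):

import torch
import torch.distributed as dist
from torch.profiler import ProfilerActivity, profile

# Sketch only: profile a matmul overlapped with an async all_reduce, then dump
# a Chrome trace to inspect comm/compute overlap.
# Launch: torchrun --nproc-per-node 2 collective_profile.py
if __name__ == "__main__":
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    comm_tensor = torch.randn(1 << 20, device="cuda")
    a = torch.randn(2048, 2048, device="cuda")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        work = dist.all_reduce(comm_tensor, async_op=True)  # communication kernel
        b = a @ a                                           # compute kernel
        work.wait()
        torch.cuda.synchronize()

    prof.export_chrome_trace(f"trace_rank{dist.get_rank()}.json")
    dist.destroy_process_group()
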