


Garrett Goon (garrett361)

garrett361 / out.txt
Created August 16, 2025 15:47
block decomposition test
R=tensor([[ 0.0673+0.j, -0.6867+0.j, 0.7238+0.j],
[-0.4553+0.j, -0.6666+0.j, -0.5901+0.j],
[ 0.8878+0.j, -0.2899+0.j, -0.3575+0.j]])
R@R.T=tensor([[ 1.0000e+00+0.j, -2.7138e-08+0.j, -2.7308e-08+0.j],
[-2.7138e-08+0.j, 1.0000e+00+0.j, -3.7264e-08+0.j],
[-2.9802e-08+0.j, -4.4703e-08+0.j, 1.0000e+00+0.j]])
eigvals=tensor([ 1.0000+1.4901e-08j, -0.9785+2.0648e-01j, -0.9785-2.0648e-01j])
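
The output above is an orthogonality check: R times its transpose is the identity to within ~1e-8, and every eigenvalue has unit modulus, as expected for an orthogonal matrix. A minimal sketch of the same check (not the gist's code; the QR construction and seed are assumptions):

import torch

# Sketch only: build a random orthogonal matrix via QR, cast to complex to
# mirror the gist's printout, then verify R @ R.T ~ I and |eigvals| ~ 1.
torch.manual_seed(0)
R, _ = torch.linalg.qr(torch.randn(3, 3))  # Q factor is orthogonal
R = R.to(torch.complex64)

identity_err = (R @ R.T - torch.eye(3, dtype=R.dtype)).abs().max()
eigvals = torch.linalg.eigvals(R)
print(f"{identity_err=}")   # ~1e-7 at single precision
print(f"{eigvals.abs()=}")  # all ~1.0 for an orthogonal matrix
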
garrett361 / out.txt
Created June 5, 2025 14:35
set_reduce_scatter_divide_factor Error
torchrun --nproc-per-node 2 set_div_err.py
W0605 14:34:10.112000 783116 torch/distributed/run.py:766]
W0605 14:34:10.112000 783116 torch/distributed/run.py:766] *****************************************
W0605 14:34:10.112000 783116 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0605 14:34:10.112000 783116 torch/distributed/run.py:766] *****************************************
Running with trivial mp_policy
Passed on RANK=0 with mp_policy=MixedPrecisionPolicy(param_dtype=None, reduce_dtype=None, output_dtype=None, cast_forward_inputs=True)
Passed on RANK=1 with mp_policy=MixedPrecisionPolicy(param_dtype=None, reduce_dtype=None, output_dtype=None, cast_forward_inputs=True)
Running with non-trivial mp_policy
FAILED on RANK=0 with mp_policy=MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, output
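
The passing case is the default (all-None) policy; the failure appears only with the bf16 MixedPrecisionPolicy. A hypothetical minimal shape of such a repro, assuming FSDP2's fully_shard and the set_reduce_scatter_divide_factor method named in the gist title (set_div_err.py itself is not shown):

import torch
import torch.distributed as dist
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard

# Hypothetical sketch: shard a tiny model with a bf16 policy, set a
# reduce-scatter divide factor (method name taken from the gist title),
# then run forward/backward. Launch: torchrun --nproc-per-node 2 repro.py
if __name__ == "__main__":
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    model = torch.nn.Linear(8, 8, device="cuda")
    mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16)
    fully_shard(model, mp_policy=mp_policy)
    model.set_reduce_scatter_divide_factor(2.0)

    model(torch.randn(4, 8, device="cuda")).sum().backward()
    dist.destroy_process_group()
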
garrett361 / output.txt
Created April 21, 2025 18:19
test torch._grouped_mm
torch.__version__='2.8.0.dev20250421+cu126'
Passed first test
Traceback (most recent call last):
File "/app/torchtitan/gemm_test.py", line 38, in <module>
test_gemm_backward_fails()
File "/app/torchtitan/gemm_test.py", line 31, in test_gemm_backward_fails
out.sum().backward()
File "/opt/conda/envs/ai/lib/python3.11/site-packages/torch/_tensor.py", line 648, in backward
torch.autograd.backward(
File "/opt/conda/envs/ai/lib/python3.11/site-packages/torch/autograd/__init__.py", line 354, in backward
garrett361 / test_dtensor_slice.py
Last active March 10, 2025 19:49
DTensor slicing
import os
import torch
import torch.distributed as dist
from torch.distributed.tensor import Shard, distribute_tensor
if __name__ == "__main__":
try:
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ["LOCAL_RANK"])
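
The preview cuts off before the DTensor is built. A minimal sketch of sharding a tensor and slicing it (assumptions: gloo/CPU, slicing along the non-sharded dim; not the gist's actual test):

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# Sketch only: shard dim 0 across the world, slice along the non-sharded dim 1,
# and compare against slicing the unsharded tensor.
# Launch: torchrun --nproc-per-node 2 test_dtensor_slice.py
if __name__ == "__main__":
    dist.init_process_group("gloo")
    mesh = init_device_mesh("cpu", (dist.get_world_size(),))

    full = torch.arange(16, dtype=torch.float32).reshape(8, 2)
    dtensor = distribute_tensor(full, mesh, placements=[Shard(0)])

    sliced = dtensor[:, :1]  # slicing a DTensor returns a DTensor
    assert torch.equal(sliced.full_tensor(), full[:, :1])
    dist.destroy_process_group()
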
garrett361 / main.py
Created December 11, 2024 03:03
DTensor double-sharded random
import multiprocessing as mp
import os
import torch
import torch.distributed as dist
from torch.distributed.tensor import distribute_tensor
from torch.distributed.tensor.placement_types import (
Shard,
)
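
"Double-sharded" here presumably means the same tensor dim sharded over both axes of a 2-D device mesh. The sketch below shows that layout on CPU (mesh names and sizes are assumptions, and the gist's randomness check is not reproduced):

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor
from torch.distributed.tensor.placement_types import Shard

# Sketch only: dim 0 sharded over both mesh axes ("double-sharded").
# Launch: torchrun --nproc-per-node 4 main.py
if __name__ == "__main__":
    dist.init_process_group("gloo")
    mesh = init_device_mesh("cpu", (2, 2), mesh_dim_names=("outer", "inner"))

    full = torch.randn(8, 3)
    dtensor = distribute_tensor(full, mesh, placements=[Shard(0), Shard(0)])

    # 8 rows / (2 * 2) ranks = a (2, 3) local shard on each rank.
    print(f"[rank={dist.get_rank()}] local shape={dtensor.to_local().shape}")
    dist.destroy_process_group()
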
garrett361 / overwrite.py
Created November 27, 2024 22:07
overwrite reduce scatter
import argparse
import multiprocessing as mp
import os
import torch
import torch.distributed as dist
def print_rank(s: str) -> None:
s = f'[rank={os.environ["RANK"]}] ' + s
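
The preview stops at the logging helper. For the collective itself, a minimal sketch of reduce_scatter_tensor overwriting a preallocated output buffer (CUDA/NCCL assumed; not the gist's script):

import os

import torch
import torch.distributed as dist

# Sketch only: reduce_scatter_tensor writes into an existing output buffer,
# overwriting whatever it held. Launch: torchrun --nproc-per-node 2 overwrite.py
if __name__ == "__main__":
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    dist.init_process_group("nccl")
    rank, world_size = dist.get_rank(), dist.get_world_size()

    chunk = 4
    inp = torch.full((world_size * chunk,), float(rank), device="cuda")
    out = torch.full((chunk,), -1.0, device="cuda")  # sentinel values to be overwritten
    dist.reduce_scatter_tensor(out, inp, op=dist.ReduceOp.SUM)

    # Every element becomes 0 + 1 + ... + (world_size - 1).
    print(f"[rank={rank}] {out=}")
    dist.destroy_process_group()
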
garrett361 / launch_mpi_min.sh
Last active June 20, 2024 15:40
Sunspot MPI Torch Launch
#!/bin/bash -l
# Minimal mpiexec-based launch script following https://docs.alcf.anl.gov/aurora/data-science/frameworks/pytorch/
# Usage:
#
# qsub [-v [SCRIPT_PATH=your_script_path] [ARGS=...] ] launch_mpi_min.sh
#
# where your_script_path is the absolute path args will be passed to the script
#
garrett361 / build_one_ccl_examples.sh
Created June 7, 2024 18:43
oneCCL build and run benchmark sunspot
#!/bin/bash -l
#PBS -A Aurora_deployment
#PBS -l filesystems=home:gila
#PBS -l select=2
#PBS -l place=scatter
#PBS -l walltime=00:15:00
#PBS -q workq
#PBS -j oe
#PBS -k doe
garrett361 / mp_torch_reduce_scatter.py
Created June 6, 2024 13:28
mp reduce scatter xpu
"""
Launch single-node reduce scatter with multiprocessing.
python3 mp_torch_reduce_scatter.py
"""
import os
import socket
from concurrent.futures import ProcessPoolExecutor
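
A sketch of the same single-node launch pattern on CPU (an all_reduce stands in for the gist's XPU reduce scatter so the example runs anywhere; the port and world size are arbitrary assumptions):

import os
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.distributed as dist

WORLD_SIZE = 2

def worker(rank: int) -> None:
    # Each child process joins the same single-node process group via env vars.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29512"  # arbitrary free port, an assumption
    dist.init_process_group("gloo", rank=rank, world_size=WORLD_SIZE)
    t = torch.full((4,), float(rank))
    dist.all_reduce(t)  # stands in for the gist's reduce scatter
    print(f"[rank={rank}] {t=}")
    dist.destroy_process_group()

if __name__ == "__main__":
    # One process per rank, mirroring the gist's ProcessPoolExecutor launch.
    with ProcessPoolExecutor(max_workers=WORLD_SIZE) as ex:
        list(ex.map(worker, range(WORLD_SIZE)))
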
garrett361 / collective.py
Created June 4, 2024 18:22
Torch Profile Comms Compute Overlap
from abc import ABC, abstractmethod
import torch
import torch.distributed as dist
if torch.cuda.is_available():
accel = torch.cuda
DEVICE_TYPE = "cuda"
BACKEND = "nccl"
else:
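
The preview ends at the device-selection branch. A sketch of the profiling pattern the title describes, i.e. capturing an async collective overlapped with a matmul in the torch profiler (CUDA/NCCL assumed; not the gist's collective.py):

import torch
import torch.distributed as dist
from torch.profiler import ProfilerActivity, profile

# Sketch only: profile a matmul overlapped with an async all_reduce, then dump
# a Chrome trace to inspect comm/compute overlap.
# Launch: torchrun --nproc-per-node 2 collective_profile.py
if __name__ == "__main__":
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    comm_tensor = torch.randn(1 << 20, device="cuda")
    a = torch.randn(2048, 2048, device="cuda")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        work = dist.all_reduce(comm_tensor, async_op=True)  # communication kernel
        b = a @ a                                           # compute kernel
        work.wait()
        torch.cuda.synchronize()

    prof.export_chrome_trace(f"trace_rank{dist.get_rank()}.json")
    dist.destroy_process_group()
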