Hao Zhuang zhuangh

🎯

Focusing

Tesla AI Dojo autopilot; @google qkeras, edge TPU, brain Video ML; UCSD CS PhD, PKU EECS

zhuangh / 5d_parallelism.py

Created December 2, 2025 16:35

5d_parallelism

	import os
	import math
	import torch
	import torch.nn as nn
	import torch.optim as optim
	import torch.multiprocessing as mp
	import torch.distributed as dist

	from torch.distributed.device_mesh import init_device_mesh
	from torch.distributed.fsdp import fully_shard

zhuangh / 3d_parallelism.py

Created December 2, 2025 16:18

3d_parallelism

	import os
	import torch
	import torch.nn as nn
	import torch.optim as optim
	import torch.multiprocessing as mp
	import torch.distributed as dist

	from torch.distributed.device_mesh import init_device_mesh
	from torch.distributed.fsdp import fully_shard
	from torch.distributed.tensor.parallel import (

zhuangh / mqa_reshape_go_faster.py

Last active May 5, 2024 02:00

MQA reshape_go_faster.py

zhuangh / vs-config-gdbserver.json

Created May 4, 2024 23:58

vscode-beyond gdb - gdb server

	// https://marketplace.visualstudio.com/items?itemName=coolchyni.beyond-debug
	{
	"inputs": [
	{
	"id": "hostname",
	"description": "xxx",
	"default": "localhost",
	"type": "promptString"
	},
	{

zhuangh / single_gpu_ddp.py

Last active May 4, 2024 23:51

single_gpu_ddp.py

	# python single_gpu_ddp.py
	# https://discuss.pytorch.org/t/single-machine-single-gpu-distributed-best-practices/169243
	import torch
	import torch.distributed as dist
	import torch.nn as nn
	import torch.multiprocessing as mp
	from torch.nn.parallel import DistributedDataParallel as DDP
	import os

	def setup(rank, world_size):

zhuangh / matmul_gtx1060.ir

Created November 22, 2023 07:54

matmul_gtx1060.ir

	IR module {
	tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
	%c16_i32 = arith.constant 16 : i32
	%c1024_i32 = arith.constant 1024 : i32
	%c0_i32 = arith.constant 0 : i32
	%cst = arith.constant dense<16> : tensor<16x16xi32>
	%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32>
	%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
	%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
	%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32>

zhuangh / run_matmul_gtx1060.py

Created November 22, 2023 07:52

run_matmul_gtx1060.py

	import torch
	import triton
	import triton.language as tl
	import torch.nn.functional as F

	@triton.jit
	def matmul_kernel(
	a_ptr, b_ptr, c_ptr,
	stride_am, stride_ak,
	stride_bk, stride_bn,

zhuangh / run_triton.py

Last active November 22, 2023 01:50

run_triton.py

	import torch
	import triton
	import triton.language as tl
	import torch.nn.functional as F
	import time

	@triton.jit
	def add_kernel(x_ptr, y_ptr, output_ptr, N,
	BLOCK_SIZE: tl.constexpr):
	pid = tl.program_id(0)

zhuangh / cudagraph_decorator.py

Last active November 19, 2023 06:46

cudagraph_decorator.py

	import torch

	# acknowledgement: https://gist.github.com/bwasti/7e4cb9bd1aaddeb09bd360b570a486b1

	def cudagraph(f):
	_graphs = {}
	def f_(*args):
	key = hash(tuple(tuple(a.shape) for a in args))
	if key in _graphs:
	wrapped, *_ = _graphs[key]

zhuangh / vec2dIterator.cc

Created April 16, 2018 05:33

2d iterator with remove()

	class Vector2D {
	private:
	vector<vector<int>>::iterator row, iBegin, iEnd;
	vector<int>::iterator col;
	public:
	Vector2D(vector<vector<int>>& vec2d) {
	iBegin = row = vec2d.begin();
	iEnd = vec2d.end();
	if(vec2d.size())
	col = row->begin();