David Berard (davidberard98)
  • Anthropic
  • San Francisco, CA
import argparse
import multiprocessing
import os
from time import sleep
import torch
import triton
import triton.language as tl
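The gist body is cut off in this preview; judging by the imports (argparse, multiprocessing, torch, triton), it runs Triton work across separate processes. Below is a minimal hedged sketch of that pattern, not the original script: the kernel _copy_kernel and helper _bench are made-up names.

# Hypothetical sketch (not the original gist): time a trivial Triton kernel in a
# spawned subprocess so each measurement gets a fresh process/CUDA context.
import multiprocessing

import torch
import triton
import triton.language as tl
from triton.testing import do_bench


@triton.jit
def _copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    val = tl.load(src_ptr + offs, mask=mask)
    tl.store(dst_ptr + offs, val, mask=mask)


def _bench(queue):
    x = torch.randn(1 << 20, device="cuda")
    y = torch.empty_like(x)
    grid = (triton.cdiv(x.numel(), 1024),)
    ms = do_bench(lambda: _copy_kernel[grid](x, y, x.numel(), BLOCK=1024))
    queue.put(ms)


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    q = ctx.Queue()
    p = ctx.Process(target=_bench, args=(q,))
    p.start()
    result = q.get()
    p.join()
    print(f"copy kernel: {result:.3f} ms")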
metric,side_a_speedup,side_b_speedup,ratio_b_over_a,improvement_percent
"tritonbench_rope_bwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup",3.872456642947213,2.7721369338334196,0.7158600313530245,-28.413996864697555
"tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup",80.91770255783995,65.44677234875213,0.8088066057235227,-19.11933942764773
"tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup",26.4510668357185,22.165953910167254,0.8379984840624729,-16.20015159375271
"tritonbench_flash_attention_fwd[x_(4, 48, 128, 128, 64)-triton_tutorial_flash_v2]_speedup",1.5028248392486048,1.2918659460892634,0.8596250955867761,-14.037490441322387
"tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup",68.91569269467183,61.62544280408594,0.8942149515512374,-10.578504844876257
"tritonbench_gemm_fwd[x_(1280, 1280, 1280)-triton_tutorial_matmul]_speedup",0.7517605254921877,0.6761268
metric,side_a_speedup,side_b_speedup,ratio_b_over_a,improvement_percent
"tritonbench_gemm_fwd[x_(2816, 2816, 2816)-triton_tutorial_matmul]_speedup",0.9140271292993862,0.6970010513896804,0.7625605729273494,-23.743942707265063
"tritonbench_layer_norm_bwd[x_(4096, 7680)-liger_layer_norm]_speedup",0.9977561823241726,0.7614043701113448,0.7631166647724796,-23.688333522752036
"tritonbench_gemm_fwd[x_(2688, 2688, 2688)-triton_tutorial_matmul]_speedup",0.9499290949872015,0.7257081424105393,0.7639603273971904,-23.603967260280957
"tritonbench_layer_norm_bwd[x_(4096, 6656)-liger_layer_norm]_speedup",0.9266213852456668,0.7133837487030577,0.7698761976164888,-23.012380238351117
"tritonbench_layer_norm_bwd[x_(4096, 7168)-liger_layer_norm]_speedup",0.9561411985507111,0.7366730997908861,0.770464760756583,-22.953523924341702
"tritonbench_gemm_fwd[x_(4096, 4096, 4096)-triton_tutorial_matmul]_speedup",0.932466383549378,0.7262403068410183,0.7788380574928906,-22.11619425071094
"tritonbench_gemm_fwd[x_(3328, 3328, 3328)-triton_tutor
# only skipped commits left to test
# possible first bad commit: [f0975f9d02f6d8b69146aea84b8ee7c6e81c4ed5] [AMD][Build] Fix build issue with AMD lld (#7608)
# possible first bad commit: [620c59165a1452ddd5dd685054b84485a35dc92e] [AMD][NFC] Group scheduling functions in StreamPipeliner (#7607)
# possible first bad commit: [16b25e1620e87f02f7d16ff5b9da2d425c6e99a6] [BACKEND] combineRedundantWaitOps should not combine across loops/branches (#7593)
# possible first bad commit: [fdd694d48a2b05c49bd9658fbedc3204d04654e1] [Triton][Gluon] Add `map_elementwise` (#7564)
# possible first bad commit: [03cdcdb38a4812f8c371ff7de433e6c7b605b1ef] [KERNELS] Fix `bench_mlp.py` for AMD (#7600)
# possible first bad commit: [ef72c317570de66807d83648bde1bff64e2898f8] [Tests] Improve regex for test_compile_only_dot (#7602)
# possible first bad commit: [a7a89c7c9262ed3d761ea771a907d53f4caba92a] [FRONTEND] Refactor unsplat to use new op (#7586)
# possible first bad commit: [6415039bf96145fdabca0ca0ac5f05b9d42cf45f] [AMD] Support 4x6
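If one wanted to post-process bisect output like the lines above, the format is regular enough to parse. A hypothetical helper (not from any gist here) that pulls out the candidate commit hashes, titles, and PR numbers:

# Hypothetical helper: extract (hash, pr_number, title) from
# "possible first bad commit" lines emitted by git bisect.
import re

_LINE_RE = re.compile(
    r"possible first bad commit: \[([0-9a-f]{40})\] (.*?) \(#(\d+)\)"
)

def parse_candidates(text):
    return [(m.group(1), int(m.group(3)), m.group(2))
            for m in _LINE_RE.finditer(text)]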
python: /data/users/dberard/triton-env/triton/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp:501: const ValueT &mlir::(anonymous namespace)::FatPointers::at(const_arg_type_t<KeyT>) const: Assertion `pointerAttrs.contains(k) && "expected fatPtrs to contain remapped fat pointer"' failed.
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @
# AOT ID: ['0_backward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
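The header above is from Inductor-generated code for a compiled backward graph (AOT ID '0_backward'). As a hedged aside rather than the original repro: output like this can be printed by running a compiled model under the TORCH_LOGS="output_code" logging switch; the function f and the shapes below are made up for illustration.

# Hypothetical sketch: make Inductor print its generated forward/backward code.
# Run as:  TORCH_LOGS="output_code" python repro.py
import torch

def f(x, w):
    return torch.nn.functional.layer_norm(x, (x.shape[-1],)) @ w

x = torch.randn(4096, 4096, device="cuda", requires_grad=True)
w = torch.randn(4096, 4096, device="cuda")
torch.compile(f)(x, w).sum().backward()  # exercises the compiled backward graph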
TORCHINDUCTOR_FORCE_DISABLE_CACHES=1
# Restore the patched build script whenever the script exits.
function finish() {
  pushd /home/dberard/local/pytorch-env7/triton
  git checkout -- scripts/build-llvm-project.sh
}
trap finish EXIT
# Apply the local patch, then rebuild.
git apply /home/dberard/local/pytorch-env7/diff.patch
make dev-install-llvm
code=$?
if [ $code -ne 0 ]
# USAGE:
# Put this in your triton repo directory.
# 1. Update the [BUILD COMMAND]
# 2. Update the [PYTORCH PATH]
# 3. Update the [TEST COMMAND]
# 4. Run the bisect:
# $ git bisect start
# $ git checkout [known good commit]
# $ git bisect good
# $ git checkout [known bad commit]
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];