David Berard (davidberard98)
  • Anthropic
  • San Francisco, CA
import argparse
import multiprocessing
import os
from time import sleep
import torch
import triton
import triton.language as tl
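The gist body is cut off in this preview; judging by the imports (argparse, multiprocessing, torch, triton), it runs Triton work across separate processes. Below is a minimal hedged sketch of that pattern, not the original script: the kernel _copy_kernel and helper _bench are made-up names.

# Hypothetical sketch (not the original gist): time a trivial Triton kernel in a
# spawned subprocess so each measurement gets a fresh process/CUDA context.
import multiprocessing

import torch
import triton
import triton.language as tl
from triton.testing import do_bench


@triton.jit
def _copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    val = tl.load(src_ptr + offs, mask=mask)
    tl.store(dst_ptr + offs, val, mask=mask)


def _bench(queue):
    x = torch.randn(1 << 20, device="cuda")
    y = torch.empty_like(x)
    grid = (triton.cdiv(x.numel(), 1024),)
    ms = do_bench(lambda: _copy_kernel[grid](x, y, x.numel(), BLOCK=1024))
    queue.put(ms)


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    q = ctx.Queue()
    p = ctx.Process(target=_bench, args=(q,))
    p.start()
    result = q.get()
    p.join()
    print(f"copy kernel: {result:.3f} ms")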
metric,side_a_speedup,side_b_speedup,ratio_b_over_a,improvement_percent
"tritonbench_rope_bwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup",3.872456642947213,2.7721369338334196,0.7158600313530245,-28.413996864697555
"tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup",80.91770255783995,65.44677234875213,0.8088066057235227,-19.11933942764773
"tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup",26.4510668357185,22.165953910167254,0.8379984840624729,-16.20015159375271
"tritonbench_flash_attention_fwd[x_(4, 48, 128, 128, 64)-triton_tutorial_flash_v2]_speedup",1.5028248392486048,1.2918659460892634,0.8596250955867761,-14.037490441322387
"tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup",68.91569269467183,61.62544280408594,0.8942149515512374,-10.578504844876257
"tritonbench_gemm_fwd[x_(1280, 1280, 1280)-triton_tutorial_matmul]_speedup",0.7517605254921877,0.6761268
metric,side_a_speedup,side_b_speedup,ratio_b_over_a,improvement_percent
"tritonbench_gemm_fwd[x_(2816, 2816, 2816)-triton_tutorial_matmul]_speedup",0.9140271292993862,0.6970010513896804,0.7625605729273494,-23.743942707265063
"tritonbench_layer_norm_bwd[x_(4096, 7680)-liger_layer_norm]_speedup",0.9977561823241726,0.7614043701113448,0.7631166647724796,-23.688333522752036
"tritonbench_gemm_fwd[x_(2688, 2688, 2688)-triton_tutorial_matmul]_speedup",0.9499290949872015,0.7257081424105393,0.7639603273971904,-23.603967260280957
"tritonbench_layer_norm_bwd[x_(4096, 6656)-liger_layer_norm]_speedup",0.9266213852456668,0.7133837487030577,0.7698761976164888,-23.012380238351117
"tritonbench_layer_norm_bwd[x_(4096, 7168)-liger_layer_norm]_speedup",0.9561411985507111,0.7366730997908861,0.770464760756583,-22.953523924341702
"tritonbench_gemm_fwd[x_(4096, 4096, 4096)-triton_tutorial_matmul]_speedup",0.932466383549378,0.7262403068410183,0.7788380574928906,-22.11619425071094
"tritonbench_gemm_fwd[x_(3328, 3328, 3328)-triton_tutor
# only skipped commits left to test
# possible first bad commit: [f0975f9d02f6d8b69146aea84b8ee7c6e81c4ed5] [AMD][Build] Fix build issue with AMD lld (#7608)
# possible first bad commit: [620c59165a1452ddd5dd685054b84485a35dc92e] [AMD][NFC] Group scheduling functions in StreamPipeliner (#7607)
# possible first bad commit: [16b25e1620e87f02f7d16ff5b9da2d425c6e99a6] [BACKEND] combineRedundantWaitOps should not combine across loops/branches (#7593)
# possible first bad commit: [fdd694d48a2b05c49bd9658fbedc3204d04654e1] [Triton][Gluon] Add `map_elementwise` (#7564)
# possible first bad commit: [03cdcdb38a4812f8c371ff7de433e6c7b605b1ef] [KERNELS] Fix `bench_mlp.py` for AMD (#7600)
# possible first bad commit: [ef72c317570de66807d83648bde1bff64e2898f8] [Tests] Improve regex for test_compile_only_dot (#7602)
# possible first bad commit: [a7a89c7c9262ed3d761ea771a907d53f4caba92a] [FRONTEND] Refactor unsplat to use new op (#7586)
# possible first bad commit: [6415039bf96145fdabca0ca0ac5f05b9d42cf45f] [AMD] Support 4x6
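If one wanted to post-process bisect output like the lines above, the format is regular enough to parse. A hypothetical helper (not from any gist here) that pulls out the candidate commit hashes, titles, and PR numbers:

# Hypothetical helper: extract (hash, pr_number, title) from
# "possible first bad commit" lines emitted by git bisect.
import re

_LINE_RE = re.compile(
    r"possible first bad commit: \[([0-9a-f]{40})\] (.*?) \(#(\d+)\)"
)

def parse_candidates(text):
    return [(m.group(1), int(m.group(3)), m.group(2))
            for m in _LINE_RE.finditer(text)]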
python: /data/users/dberard/triton-env/triton/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp:501: const ValueT &mlir::(anonymous namespace)::FatPointers::at(const_arg_type_t<KeyT>) const: Assertion `pointerAttrs.contains(k) && "expected fatPtrs to contain remapped fat pointer"' failed.
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @
# AOT ID: ['0_backward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
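The header above is from Inductor-generated code for a compiled backward graph (AOT ID '0_backward'). As a hedged aside rather than the original repro: output like this can be printed by running a compiled model under the TORCH_LOGS="output_code" logging switch; the function f and the shapes below are made up for illustration.

# Hypothetical sketch: make Inductor print its generated forward/backward code.
# Run as:  TORCH_LOGS="output_code" python repro.py
import torch

def f(x, w):
    return torch.nn.functional.layer_norm(x, (x.shape[-1],)) @ w

x = torch.randn(4096, 4096, device="cuda", requires_grad=True)
w = torch.randn(4096, 4096, device="cuda")
torch.compile(f)(x, w).sum().backward()  # exercises the compiled backward graph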
TORCHINDUCTOR_FORCE_DISABLE_CACHES=1
# Restore the patched build script whenever the script exits.
function finish() {
  pushd /home/dberard/local/pytorch-env7/triton
  git checkout -- scripts/build-llvm-project.sh
}
trap finish EXIT
# Apply the local patch, then rebuild.
git apply /home/dberard/local/pytorch-env7/diff.patch
make dev-install-llvm
code=$?
if [ $code -ne 0 ]
# USAGE:
# Put this in your triton repo directory.
# 1. Update the [BUILD COMMAND]
# 2. Update the [PYTORCH PATH]
# 3. Update the [TEST COMMAND]
# 4. Run the bisect:
# $ git bisect start
# $ git checkout [known good commit]
# $ git bisect good
# $ git checkout [known bad commit]
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];