@davidberard98
Created September 4, 2025 17:29
python: /data/users/dberard/triton-env/triton/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp:501: const ValueT &mlir::(anonymous namespace)::FatPointers::at(const_arg_type_t<KeyT>) const: Assertion `pointerAttrs.contains(k) && "expected fatPtrs to contain remapped fat pointer"' failed.
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1(%arg0: !tt.ptr<i1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: i32, %arg5: i32) attributes {noinline = false} {
%cst = arith.constant 2.000000e-02 : f32
%c10_i32 = arith.constant 10 : i32
%c5_i32 = arith.constant 5 : i32
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked>
%cst_1 = arith.constant dense<0> : tensor<1x16xi8, #blocked1>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x16xf32, #blocked1>
%cst_3 = arith.constant dense<10> : tensor<1x16xi32, #blocked1>
%0 = tt.get_program_id x : i32
%1 = arith.cmpi slt, %0, %c5_i32 : i32
%2 = tt.splat %1 : i1 -> tensor<1x1xi1, #blocked>
%3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked2>
%4 = ttg.convert_layout %3 : tensor<16xi32, #blocked2> -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked3}>>
%5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x16xi32, #blocked3>
%6 = ttg.convert_layout %5 : tensor<1x16xi32, #blocked3> -> tensor<1x16xi32, #blocked1>
%7 = arith.cmpi slt, %6, %cst_3 : tensor<1x16xi32, #blocked1>
%8 = arith.muli %0, %c10_i32 : i32
%9 = tt.splat %8 : i32 -> tensor<1x16xi32, #blocked1>
%10 = arith.addi %6, %9 : tensor<1x16xi32, #blocked1>
%11 = tt.splat %arg0 : !tt.ptr<i1> -> tensor<1x16x!tt.ptr<i1>, #blocked1>
%12 = tt.addptr %11, %10 : tensor<1x16x!tt.ptr<i1>, #blocked1>, tensor<1x16xi32, #blocked1>
%13 = tt.splat %1 : i1 -> tensor<1x16xi1, #blocked1>
%14 = arith.andi %7, %13 : tensor<1x16xi1, #blocked1>
%15 = tt.bitcast %12 : tensor<1x16x!tt.ptr<i1>, #blocked1> -> tensor<1x16x!tt.ptr<i8>, #blocked1>
%16 = tt.load %15, %14, %cst_1 : tensor<1x16x!tt.ptr<i8>, #blocked1>
%17 = arith.cmpi ne, %16, %cst_1 : tensor<1x16xi8, #blocked1>
%18 = tt.addptr %arg1, %0 : !tt.ptr<i64>, i32
%19 = tt.splat %18 : !tt.ptr<i64> -> tensor<1x1x!tt.ptr<i64>, #blocked>
%20 = tt.load %19, %2 evictionPolicy = evict_last : tensor<1x1x!tt.ptr<i64>, #blocked>
%21 = tt.load %arg2 : !tt.ptr<f32>
%22 = arith.extsi %6 : tensor<1x16xi32, #blocked1> to tensor<1x16xi64, #blocked1>
%23 = tt.broadcast %20 : tensor<1x1xi64, #blocked> -> tensor<1x16xi64, #blocked>
%24 = ttg.convert_layout %23 : tensor<1x16xi64, #blocked> -> tensor<1x16xi64, #blocked1>
%25 = arith.cmpi ne, %22, %24 : tensor<1x16xi64, #blocked1>
%26 = arith.mulf %21, %cst : f32
%27 = tt.splat %26 : f32 -> tensor<1x16xf32, #blocked1>
%28 = arith.select %25, %27, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
%29 = arith.select %17, %28, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
%30 = arith.select %14, %29, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
%31 = "tt.reduce"(%30) <{axis = 1 : i32}> ({
^bb0(%arg6: f32, %arg7: f32):
%43 = arith.addf %arg6, %arg7 : f32
tt.reduce.return %43 : f32
}) : (tensor<1x16xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>>
%32 = ttg.convert_layout %31 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xf32, #blocked2>
%33 = ttg.convert_layout %32 : tensor<1xf32, #blocked2> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked4}>>
%34 = tt.expand_dims %33 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1x1xf32, #blocked4>
%35 = ttg.convert_layout %34 : tensor<1x1xf32, #blocked4> -> tensor<1x1xf32, #blocked>
%36 = arith.subf %cst_0, %35 : tensor<1x1xf32, #blocked>
%37 = arith.extsi %8 : i32 to i64
%38 = tt.splat %37 : i64 -> tensor<1x1xi64, #blocked>
%39 = arith.addi %20, %38 : tensor<1x1xi64, #blocked>
%40 = tt.splat %arg3 : !tt.ptr<f32> -> tensor<1x1x!tt.ptr<f32>, #blocked>
%41 = tt.addptr %40, %39 : tensor<1x1x!tt.ptr<f32>, #blocked>, tensor<1x1xi64, #blocked>
%42 = tt.atomic_rmw fadd, relaxed, gpu, %41, %36, %2 : (tensor<1x1x!tt.ptr<f32>, #blocked>, tensor<1x1xf32, #blocked>, tensor<1x1xi1, #blocked>) -> tensor<1x1xf32, #blocked>
tt.return
}
}
{-#
external_resources: {
mlir_reproducer: {
pipeline: "builtin.module(tritongpu-coalesce, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritonamdgpu-accelerate-matmul{arch-generation-name=gfx942 kPack=1 matrix-instruction-size=0}, tritongpu-remove-layout-conversions, tritonamdgpu-optimize-epilogue, tritonamdgpu-optimize-dot-operands{arch-generation-name=gfx942}, tt.func(tritonamdgpu-hoist-layout-conversions), tritongpu-fuse-nested-loops, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-licm, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritonamdgpu-stream-pipeline{global_prefetch=0 local_prefetch=0 num_stages=1 use_async_copy=false use_pingpong=true}, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritongpu-remove-layout-conversions, tritongpu-reduce-data-duplication, tt.func(tritonamdgpu-in-thread-transpose), tritongpu-remove-layout-conversions, tritonamdgpu-reorder-instructions, tt.func(tritonamdgpu-canonicalize-pointers), canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritonamdgpu-convert-buffer-ops{allow-buffer-atomics=true arch-generation-name=gfx942}, tritonamdgpu-fold-true-cmpi, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, cse, symbol-dce)",
disable_threading: false,
verify_each: true
}
}
#-}
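The mlir_reproducer block above records the pass pipeline that was being run when the assertion fired. Assuming a triton-opt binary from the same build (triton-opt is based on MlirOptMain, which provides MLIR's --run-reproducer support), the failure can in principle be replayed standalone by saving the dump (from the first #blocked line through #-}) as repro.mlir and running: triton-opt --run-reproducer repro.mlir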
/tmp/tmp90ym5rqn/2b/c2bniqfv7fdtefhrqbgqx3opxqba2unxatqmotxsupk5uym54xvk.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline
/tmp/tmp90ym5rqn/2b/c2bniqfv7fdtefhrqbgqx3opxqba2unxatqmotxsupk5uym54xvk.py:18:0: note: Pipeline failed while executing [`TritonAMDGPUCanonicalizePointers` on 'tt.func' operation: @triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] Triton compilation failed: triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] def triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1(in_ptr0, in_ptr1, in_ptr2, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xnumel = 5
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_numel = 10
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] R0_BLOCK: tl.constexpr = 16
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] rnumel = r0_numel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] RBLOCK: tl.constexpr = R0_BLOCK
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xoffset = tl.program_id(0) * XBLOCK
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xmask = xindex < xnumel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_index = tl.arange(0, R0_BLOCK)[None, :]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_offset = 0
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_mask = r0_index < r0_numel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] roffset = r0_offset
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] rindex = r0_index
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_1 = r0_index
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] x0 = xindex
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp0 = tl.load(in_ptr0 + (r0_1 + 10*x0), r0_mask & xmask, other=0.0)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp0.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp1.dtype == tl.int64)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp4 = tl.load(in_ptr2 + (0))
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp4.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp5 = tl.broadcast_to(tmp4, [XBLOCK, R0_BLOCK])
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp5.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp2 = r0_1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp2.dtype == tl.int32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp3 = tmp2 != tmp1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp3.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp3.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp6 = 0.02
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp6.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp6.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp7 = tmp5 * tmp6
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp7.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp7.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp8 = 0.0
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp9 = tl.where(tmp3, tmp7, tmp8)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp9.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp9.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp10 = tl.where(tmp0, tmp9, tmp8)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp10.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp10.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp11.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp13 = tl.where(r0_mask & xmask, tmp11, 0)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp13.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp14 = tl.sum(tmp13, 1)[:, None].to(tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp14.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.device_assert(((0 <= tmp1) & (tmp1 < 10)) | ~(xmask), "index out of bounds: 0 <= tmp1 < 10")
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp16 = -tmp14
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp16.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp16.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.atomic_add(out_ptr1 + (tmp1 + 10*x0), tmp16, xmask, sem='relaxed')
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] metadata: {'signature': {'in_ptr0': '*i1', 'in_ptr1': '*i64', 'in_ptr2': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': 0, 'constants': {'XBLOCK': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'device_type': 'hip', 'num_warps': 1, 'num_stages': 1, 'debug': False, 'cc': 'gfx942'}
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] Traceback (most recent call last):
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 758, in _precompile_config
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] binary = triton.compile(*compile_args, **compile_kwargs)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/triton/python/triton/compiler/compiler.py", line 319, in compile
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] next_module = compile_ir(module, metadata)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 451, in <lambda>
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 258, in make_ttgir
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] pm.run(mod)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] RuntimeError: PassManager::run failed
E
======================================================================
ERROR: test_comprehensive_nn_functional_multi_margin_loss_cuda_float32 (__main__.TestInductorOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1135, in test_wrapper
return test(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1434, in only_fn
return fn(self, *args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 2361, in wrapper
fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
return fn(slf, *args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
return fn(slf, *args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1645, in wrapper
fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1560, in wrapper
fn(*args, **kwargs)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/unittest/mock.py", line 1379, in patched
return func(*newargs, **newkeywargs)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1118, in inner
raise e
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1110, in inner
fn(self, device, dtype, op)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1375, in test_comprehensive
raise e
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1350, in test_comprehensive
self.check_model_gpu(
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 685, in check_model_gpu
check_model(
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 633, in check_model
actual_grad = compute_grads(example_inputs, kwargs, actual, grads)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 408, in compute_grads
return torch.autograd.grad(
File "/data/users/dberard/triton-env/pytorch/torch/autograd/__init__.py", line 503, in grad
result = _engine_run_backward(
File "/data/users/dberard/triton-env/pytorch/torch/autograd/graph.py", line 841, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/data/users/dberard/triton-env/pytorch/torch/autograd/function.py", line 315, in apply
return user_fn(self, *args)
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2303, in backward
return impl_fn()
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2289, in impl_fn
out = CompiledFunction._backward_impl(ctx, all_args)
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2394, in _backward_impl
CompiledFunction.compiled_bw = aot_config.bw_compiler(
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/schemas.py", line 1267, in __call__
return self.compiler_fn(gm, example_inputs)
File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/backends/common.py", line 82, in _wrapped_bw_compiler
disable(
File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/eval_frame.py", line 1035, in _fn
return fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/_utils_internal.py", line 92, in wrapper_function
return function(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 2581, in bw_compiler
return compile_fx_backward(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 2306, in compile_fx_backward
return inner_compile(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 781, in compile_fx_inner
return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/repro/after_aot.py", line 144, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 989, in _compile_fx_inner
raise InductorError(e, currentframe()).with_traceback(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 973, in _compile_fx_inner
mb_compiled_graph = fx_codegen_and_compile(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 1694, in fx_codegen_and_compile
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 1504, in codegen_and_compile
compiled_module = graph.compile_to_module()
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2319, in compile_to_module
return self._compile_to_module()
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2329, in _compile_to_module
mod = self._compile_to_module_lines(wrapper_code)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2397, in _compile_to_module_lines
mod = PyCodeCache.load_by_key_path(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/codecache.py", line 3527, in load_by_key_path
mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/compile_tasks.py", line 33, in _reload_python_module
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/tmp90ym5rqn/2u/c2urrlft4jn52zqrfqvwugrn7ztdmv52kwh5xrbulkq4gskiabt6.py", line 98, in <module>
triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1 = async_compile.triton('triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1', '''
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/async_compile.py", line 498, in triton
kernel.precompile(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 445, in precompile
self._precompile_worker()
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 467, in _precompile_worker
compile_results.append(self._precompile_config(c))
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 758, in _precompile_config
binary = triton.compile(*compile_args, **compile_kwargs)
File "/data/users/dberard/triton-env/triton/python/triton/compiler/compiler.py", line 319, in compile
next_module = compile_ir(module, metadata)
File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 451, in <lambda>
stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 258, in make_ttgir
pm.run(mod)
torch._inductor.exc.InductorError: RuntimeError: PassManager::run failed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
method(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
method(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
result = test(self, **param_kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1645, in wrapper
fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1147, in test_wrapper
raise e_tracked from e
Exception: Caused by sample input at index 3: SampleInput(input=Tensor[size=(5, 10), device="cuda:0", dtype=torch.float32], args=TensorList[Tensor[size=(5,), device="cuda:0", dtype=torch.int64]], kwargs={'margin': '1.0'}, broadcasts_input=False, name='')
To execute this test, run the following from the base repo dir:
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=3 python test/inductor/test_torchinductor_opinfo.py TestInductorOpInfoCUDA.test_comprehensive_nn_functional_multi_margin_loss_cuda_float32
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
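Outside the OpInfo harness, the crash can likely also be reproduced with a small torch.compile script built from the SampleInput above (a minimal sketch; shapes, dtypes, and margin are taken from the sample input, and it is not verified to produce the exact same Inductor kernel):

import torch
import torch.nn.functional as F

# SampleInput at index 3: input (5, 10) float32, target (5,) int64, margin=1.0
x = torch.randn(5, 10, device="cuda", dtype=torch.float32, requires_grad=True)
target = torch.randint(0, 10, (5,), device="cuda", dtype=torch.int64)

def fn(inp, tgt):
    return F.multi_margin_loss(inp, tgt, margin=1.0)

out = torch.compile(fn)(x, target)
# The failing kernel belongs to the backward graph, so a backward pass is required.
torch.autograd.grad(out, x, torch.ones_like(out))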
----------------------------------------------------------------------
Ran 1 test in 5.137s
FAILED (errors=1)