Triton compilation failure on gfx942: assertion in TritonAMDGPUCanonicalizePointers while compiling the Inductor backward kernel for nn.functional.multi_margin_loss.
python: /data/users/dberard/triton-env/triton/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp:501: const ValueT &mlir::(anonymous namespace)::FatPointers::at(const_arg_type_t<KeyT>) const: Assertion `pointerAttrs.contains(k) && "expected fatPtrs to contain remapped fat pointer"' failed.
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1(%arg0: !tt.ptr<i1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: i32, %arg5: i32) attributes {noinline = false} {
    %cst = arith.constant 2.000000e-02 : f32
    %c10_i32 = arith.constant 10 : i32
    %c5_i32 = arith.constant 5 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked>
    %cst_1 = arith.constant dense<0> : tensor<1x16xi8, #blocked1>
    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x16xf32, #blocked1>
    %cst_3 = arith.constant dense<10> : tensor<1x16xi32, #blocked1>
    %0 = tt.get_program_id x : i32
    %1 = arith.cmpi slt, %0, %c5_i32 : i32
    %2 = tt.splat %1 : i1 -> tensor<1x1xi1, #blocked>
    %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked2>
    %4 = ttg.convert_layout %3 : tensor<16xi32, #blocked2> -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked3}>>
    %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x16xi32, #blocked3>
    %6 = ttg.convert_layout %5 : tensor<1x16xi32, #blocked3> -> tensor<1x16xi32, #blocked1>
    %7 = arith.cmpi slt, %6, %cst_3 : tensor<1x16xi32, #blocked1>
    %8 = arith.muli %0, %c10_i32 : i32
    %9 = tt.splat %8 : i32 -> tensor<1x16xi32, #blocked1>
    %10 = arith.addi %6, %9 : tensor<1x16xi32, #blocked1>
    %11 = tt.splat %arg0 : !tt.ptr<i1> -> tensor<1x16x!tt.ptr<i1>, #blocked1>
    %12 = tt.addptr %11, %10 : tensor<1x16x!tt.ptr<i1>, #blocked1>, tensor<1x16xi32, #blocked1>
    %13 = tt.splat %1 : i1 -> tensor<1x16xi1, #blocked1>
    %14 = arith.andi %7, %13 : tensor<1x16xi1, #blocked1>
    %15 = tt.bitcast %12 : tensor<1x16x!tt.ptr<i1>, #blocked1> -> tensor<1x16x!tt.ptr<i8>, #blocked1>
    %16 = tt.load %15, %14, %cst_1 : tensor<1x16x!tt.ptr<i8>, #blocked1>
    %17 = arith.cmpi ne, %16, %cst_1 : tensor<1x16xi8, #blocked1>
    %18 = tt.addptr %arg1, %0 : !tt.ptr<i64>, i32
    %19 = tt.splat %18 : !tt.ptr<i64> -> tensor<1x1x!tt.ptr<i64>, #blocked>
    %20 = tt.load %19, %2 evictionPolicy = evict_last : tensor<1x1x!tt.ptr<i64>, #blocked>
    %21 = tt.load %arg2 : !tt.ptr<f32>
    %22 = arith.extsi %6 : tensor<1x16xi32, #blocked1> to tensor<1x16xi64, #blocked1>
    %23 = tt.broadcast %20 : tensor<1x1xi64, #blocked> -> tensor<1x16xi64, #blocked>
    %24 = ttg.convert_layout %23 : tensor<1x16xi64, #blocked> -> tensor<1x16xi64, #blocked1>
    %25 = arith.cmpi ne, %22, %24 : tensor<1x16xi64, #blocked1>
    %26 = arith.mulf %21, %cst : f32
    %27 = tt.splat %26 : f32 -> tensor<1x16xf32, #blocked1>
    %28 = arith.select %25, %27, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
    %29 = arith.select %17, %28, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
    %30 = arith.select %14, %29, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
    %31 = "tt.reduce"(%30) <{axis = 1 : i32}> ({
    ^bb0(%arg6: f32, %arg7: f32):
      %43 = arith.addf %arg6, %arg7 : f32
      tt.reduce.return %43 : f32
    }) : (tensor<1x16xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>>
    %32 = ttg.convert_layout %31 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xf32, #blocked2>
    %33 = ttg.convert_layout %32 : tensor<1xf32, #blocked2> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked4}>>
    %34 = tt.expand_dims %33 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1x1xf32, #blocked4>
    %35 = ttg.convert_layout %34 : tensor<1x1xf32, #blocked4> -> tensor<1x1xf32, #blocked>
    %36 = arith.subf %cst_0, %35 : tensor<1x1xf32, #blocked>
    %37 = arith.extsi %8 : i32 to i64
    %38 = tt.splat %37 : i64 -> tensor<1x1xi64, #blocked>
    %39 = arith.addi %20, %38 : tensor<1x1xi64, #blocked>
    %40 = tt.splat %arg3 : !tt.ptr<f32> -> tensor<1x1x!tt.ptr<f32>, #blocked>
    %41 = tt.addptr %40, %39 : tensor<1x1x!tt.ptr<f32>, #blocked>, tensor<1x1xi64, #blocked>
    %42 = tt.atomic_rmw fadd, relaxed, gpu, %41, %36, %2 : (tensor<1x1x!tt.ptr<f32>, #blocked>, tensor<1x1xf32, #blocked>, tensor<1x1xi1, #blocked>) -> tensor<1x1xf32, #blocked>
    tt.return
  }
}
{-#
  external_resources: {
    mlir_reproducer: {
      pipeline: "builtin.module(tritongpu-coalesce, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritonamdgpu-accelerate-matmul{arch-generation-name=gfx942 kPack=1 matrix-instruction-size=0}, tritongpu-remove-layout-conversions, tritonamdgpu-optimize-epilogue, tritonamdgpu-optimize-dot-operands{arch-generation-name=gfx942}, tt.func(tritonamdgpu-hoist-layout-conversions), tritongpu-fuse-nested-loops, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-licm, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritonamdgpu-stream-pipeline{global_prefetch=0 local_prefetch=0 num_stages=1 use_async_copy=false use_pingpong=true}, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritongpu-remove-layout-conversions, tritongpu-reduce-data-duplication, tt.func(tritonamdgpu-in-thread-transpose), tritongpu-remove-layout-conversions, tritonamdgpu-reorder-instructions, tt.func(tritonamdgpu-canonicalize-pointers), canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritonamdgpu-convert-buffer-ops{allow-buffer-atomics=true arch-generation-name=gfx942}, tritonamdgpu-fold-true-cmpi, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, cse, symbol-dce)",
      disable_threading: false,
      verify_each: true
    }
  }
#-}
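Aside: the {-# ... #-} trailer above is a standard MLIR reproducer, so the failing pass pipeline can likely be replayed in isolation. A minimal sketch, assuming the IR above (including the trailer) is saved as repro.ttgir and that triton-opt, like upstream mlir-opt, supports the --run-reproducer flag (an assumption; check your build):

import subprocess

# Replay the embedded reproducer: --run-reproducer reads the pass pipeline from the
# external_resources block in the file, so no pipeline flags need to be passed by hand.
# Assumes "triton-opt" from the same Triton checkout is on PATH and "repro.ttgir" holds
# the dump above; adjust both as needed.
subprocess.run(["triton-opt", "--run-reproducer", "repro.ttgir"], check=True)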
/tmp/tmp90ym5rqn/2b/c2bniqfv7fdtefhrqbgqx3opxqba2unxatqmotxsupk5uym54xvk.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline
/tmp/tmp90ym5rqn/2b/c2bniqfv7fdtefhrqbgqx3opxqba2unxatqmotxsupk5uym54xvk.py:18:0: note: Pipeline failed while executing [`TritonAMDGPUCanonicalizePointers` on 'tt.func' operation: @triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] Triton compilation failed: triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] def triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1(in_ptr0, in_ptr1, in_ptr2, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     xnumel = 5
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     r0_numel = 10
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     R0_BLOCK: tl.constexpr = 16
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     rnumel = r0_numel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     RBLOCK: tl.constexpr = R0_BLOCK
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     xoffset = tl.program_id(0) * XBLOCK
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     xmask = xindex < xnumel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     r0_index = tl.arange(0, R0_BLOCK)[None, :]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     r0_offset = 0
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     r0_mask = r0_index < r0_numel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     roffset = r0_offset
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     rindex = r0_index
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     r0_1 = r0_index
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     x0 = xindex
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp0 = tl.load(in_ptr0 + (r0_1 + 10*x0), r0_mask & xmask, other=0.0)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp0.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp1.dtype == tl.int64)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp4 = tl.load(in_ptr2 + (0))
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp4.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp5 = tl.broadcast_to(tmp4, [XBLOCK, R0_BLOCK])
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp5.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp2 = r0_1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp2.dtype == tl.int32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp3 = tmp2 != tmp1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp3.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp3.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp6 = 0.02
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp6.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp6.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp7 = tmp5 * tmp6
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp7.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp7.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp8 = 0.0
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp9 = tl.where(tmp3, tmp7, tmp8)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp9.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp9.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp10 = tl.where(tmp0, tmp9, tmp8)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp10.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp10.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp11.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp13 = tl.where(r0_mask & xmask, tmp11, 0)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp13.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp14 = tl.sum(tmp13, 1)[:, None].to(tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp14.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.device_assert(((0 <= tmp1) & (tmp1 < 10)) | ~(xmask), "index out of bounds: 0 <= tmp1 < 10")
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tmp16 = -tmp14
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp16.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.static_assert(tmp16.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     tl.atomic_add(out_ptr1 + (tmp1 + 10*x0), tmp16, xmask, sem='relaxed')
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] metadata: {'signature': {'in_ptr0': '*i1', 'in_ptr1': '*i64', 'in_ptr2': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': 0, 'constants': {'XBLOCK': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'device_type': 'hip', 'num_warps': 1, 'num_stages': 1, 'debug': False, 'cc': 'gfx942'}
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] Traceback (most recent call last):
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]   File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 758, in _precompile_config
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     binary = triton.compile(*compile_args, **compile_kwargs)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]   File "/data/users/dberard/triton-env/triton/python/triton/compiler/compiler.py", line 319, in compile
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     next_module = compile_ir(module, metadata)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]   File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 451, in <lambda>
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]   File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 258, in make_ttgir
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]     pm.run(mod)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] RuntimeError: PassManager::run failed
E
======================================================================
ERROR: test_comprehensive_nn_functional_multi_margin_loss_cuda_float32 (__main__.TestInductorOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1135, in test_wrapper
    return test(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1434, in only_fn
    return fn(self, *args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 2361, in wrapper
    fn(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
    return fn(slf, *args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
    return fn(slf, *args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1645, in wrapper
    fn(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1560, in wrapper
    fn(*args, **kwargs)
  File "/home/dberard/.conda/envs/triton-env/lib/python3.10/unittest/mock.py", line 1379, in patched
    return func(*newargs, **newkeywargs)
  File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1118, in inner
    raise e
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1110, in inner
    fn(self, device, dtype, op)
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1375, in test_comprehensive
    raise e
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1350, in test_comprehensive
    self.check_model_gpu(
  File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 685, in check_model_gpu
    check_model(
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 633, in check_model
    actual_grad = compute_grads(example_inputs, kwargs, actual, grads)
  File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 408, in compute_grads
    return torch.autograd.grad(
  File "/data/users/dberard/triton-env/pytorch/torch/autograd/__init__.py", line 503, in grad
    result = _engine_run_backward(
  File "/data/users/dberard/triton-env/pytorch/torch/autograd/graph.py", line 841, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/data/users/dberard/triton-env/pytorch/torch/autograd/function.py", line 315, in apply
    return user_fn(self, *args)
  File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2303, in backward
    return impl_fn()
  File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2289, in impl_fn
    out = CompiledFunction._backward_impl(ctx, all_args)
  File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2394, in _backward_impl
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/schemas.py", line 1267, in __call__
    return self.compiler_fn(gm, example_inputs)
  File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/backends/common.py", line 82, in _wrapped_bw_compiler
    disable(
  File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/eval_frame.py", line 1035, in _fn
    return fn(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/_utils_internal.py", line 92, in wrapper_function
    return function(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 2581, in bw_compiler
    return compile_fx_backward(
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 2306, in compile_fx_backward
    return inner_compile(
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 781, in compile_fx_inner
    return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
  File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/repro/after_aot.py", line 144, in debug_wrapper
    inner_compiled_fn = compiler_fn(gm, example_inputs)
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 989, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 973, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 1694, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 1504, in codegen_and_compile
    compiled_module = graph.compile_to_module()
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2319, in compile_to_module
    return self._compile_to_module()
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2329, in _compile_to_module
    mod = self._compile_to_module_lines(wrapper_code)
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2397, in _compile_to_module_lines
    mod = PyCodeCache.load_by_key_path(
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/codecache.py", line 3527, in load_by_key_path
    mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/compile_tasks.py", line 33, in _reload_python_module
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/tmp90ym5rqn/2u/c2urrlft4jn52zqrfqvwugrn7ztdmv52kwh5xrbulkq4gskiabt6.py", line 98, in <module>
    triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1 = async_compile.triton('triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1', '''
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/async_compile.py", line 498, in triton
    kernel.precompile(
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 445, in precompile
    self._precompile_worker()
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 467, in _precompile_worker
    compile_results.append(self._precompile_config(c))
  File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 758, in _precompile_config
    binary = triton.compile(*compile_args, **compile_kwargs)
  File "/data/users/dberard/triton-env/triton/python/triton/compiler/compiler.py", line 319, in compile
    next_module = compile_ir(module, metadata)
  File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 451, in <lambda>
    stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
  File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 258, in make_ttgir
    pm.run(mod)
torch._inductor.exc.InductorError: RuntimeError: PassManager::run failed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
    method(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
    method(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
    result = test(self, **param_kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1645, in wrapper
    fn(*args, **kwargs)
  File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1147, in test_wrapper
    raise e_tracked from e
Exception: Caused by sample input at index 3: SampleInput(input=Tensor[size=(5, 10), device="cuda:0", dtype=torch.float32], args=TensorList[Tensor[size=(5,), device="cuda:0", dtype=torch.int64]], kwargs={'margin': '1.0'}, broadcasts_input=False, name='')

To execute this test, run the following from the base repo dir:

    PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=3 python test/inductor/test_torchinductor_opinfo.py TestInductorOpInfoCUDA.test_comprehensive_nn_functional_multi_margin_loss_cuda_float32

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

----------------------------------------------------------------------
Ran 1 test in 5.137s

FAILED (errors=1)
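For convenience, a minimal standalone sketch of the failing case, reconstructed from the SampleInput printed above (a (5, 10) float32 input, a (5,) int64 target, margin=1.0). The crash happens while compiling the backward kernel (the failure surfaces in bw_compiler), so gradients must be requested; this is an illustrative repro under those assumptions, not the test's exact code path:

import torch
import torch.nn.functional as F

def f(x, target):
    # Mirrors the OpInfo sample: margin=1.0, default 'mean' reduction -> scalar loss.
    return F.multi_margin_loss(x, target, margin=1.0)

x = torch.randn(5, 10, device="cuda", dtype=torch.float32, requires_grad=True)
target = torch.randint(0, 10, (5,), device="cuda", dtype=torch.int64)

loss = torch.compile(f)(x, target)
# Compiling the backward pass is what triggers the TritonAMDGPUCanonicalizePointers
# assertion on gfx942 in the log above; the forward pass alone may compile fine.
loss.backward()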