@davidberard98
Created September 4, 2025 17:29
python: /data/users/dberard/triton-env/triton/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp:501: const ValueT &mlir::(anonymous namespace)::FatPointers::at(const_arg_type_t<KeyT>) const: Assertion `pointerAttrs.contains(k) && "expected fatPtrs to contain remapped fat pointer"' failed.
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 16], warpsPerCTA = [1, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked4 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1(%arg0: !tt.ptr<i1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: i32, %arg5: i32) attributes {noinline = false} {
%cst = arith.constant 2.000000e-02 : f32
%c10_i32 = arith.constant 10 : i32
%c5_i32 = arith.constant 5 : i32
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked>
%cst_1 = arith.constant dense<0> : tensor<1x16xi8, #blocked1>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x16xf32, #blocked1>
%cst_3 = arith.constant dense<10> : tensor<1x16xi32, #blocked1>
%0 = tt.get_program_id x : i32
%1 = arith.cmpi slt, %0, %c5_i32 : i32
%2 = tt.splat %1 : i1 -> tensor<1x1xi1, #blocked>
%3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked2>
%4 = ttg.convert_layout %3 : tensor<16xi32, #blocked2> -> tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked3}>>
%5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x16xi32, #blocked3>
%6 = ttg.convert_layout %5 : tensor<1x16xi32, #blocked3> -> tensor<1x16xi32, #blocked1>
%7 = arith.cmpi slt, %6, %cst_3 : tensor<1x16xi32, #blocked1>
%8 = arith.muli %0, %c10_i32 : i32
%9 = tt.splat %8 : i32 -> tensor<1x16xi32, #blocked1>
%10 = arith.addi %6, %9 : tensor<1x16xi32, #blocked1>
%11 = tt.splat %arg0 : !tt.ptr<i1> -> tensor<1x16x!tt.ptr<i1>, #blocked1>
%12 = tt.addptr %11, %10 : tensor<1x16x!tt.ptr<i1>, #blocked1>, tensor<1x16xi32, #blocked1>
%13 = tt.splat %1 : i1 -> tensor<1x16xi1, #blocked1>
%14 = arith.andi %7, %13 : tensor<1x16xi1, #blocked1>
%15 = tt.bitcast %12 : tensor<1x16x!tt.ptr<i1>, #blocked1> -> tensor<1x16x!tt.ptr<i8>, #blocked1>
%16 = tt.load %15, %14, %cst_1 : tensor<1x16x!tt.ptr<i8>, #blocked1>
%17 = arith.cmpi ne, %16, %cst_1 : tensor<1x16xi8, #blocked1>
%18 = tt.addptr %arg1, %0 : !tt.ptr<i64>, i32
%19 = tt.splat %18 : !tt.ptr<i64> -> tensor<1x1x!tt.ptr<i64>, #blocked>
%20 = tt.load %19, %2 evictionPolicy = evict_last : tensor<1x1x!tt.ptr<i64>, #blocked>
%21 = tt.load %arg2 : !tt.ptr<f32>
%22 = arith.extsi %6 : tensor<1x16xi32, #blocked1> to tensor<1x16xi64, #blocked1>
%23 = tt.broadcast %20 : tensor<1x1xi64, #blocked> -> tensor<1x16xi64, #blocked>
%24 = ttg.convert_layout %23 : tensor<1x16xi64, #blocked> -> tensor<1x16xi64, #blocked1>
%25 = arith.cmpi ne, %22, %24 : tensor<1x16xi64, #blocked1>
%26 = arith.mulf %21, %cst : f32
%27 = tt.splat %26 : f32 -> tensor<1x16xf32, #blocked1>
%28 = arith.select %25, %27, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
%29 = arith.select %17, %28, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
%30 = arith.select %14, %29, %cst_2 : tensor<1x16xi1, #blocked1>, tensor<1x16xf32, #blocked1>
%31 = "tt.reduce"(%30) <{axis = 1 : i32}> ({
^bb0(%arg6: f32, %arg7: f32):
%43 = arith.addf %arg6, %arg7 : f32
tt.reduce.return %43 : f32
}) : (tensor<1x16xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>>
%32 = ttg.convert_layout %31 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xf32, #blocked2>
%33 = ttg.convert_layout %32 : tensor<1xf32, #blocked2> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked4}>>
%34 = tt.expand_dims %33 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1x1xf32, #blocked4>
%35 = ttg.convert_layout %34 : tensor<1x1xf32, #blocked4> -> tensor<1x1xf32, #blocked>
%36 = arith.subf %cst_0, %35 : tensor<1x1xf32, #blocked>
%37 = arith.extsi %8 : i32 to i64
%38 = tt.splat %37 : i64 -> tensor<1x1xi64, #blocked>
%39 = arith.addi %20, %38 : tensor<1x1xi64, #blocked>
%40 = tt.splat %arg3 : !tt.ptr<f32> -> tensor<1x1x!tt.ptr<f32>, #blocked>
%41 = tt.addptr %40, %39 : tensor<1x1x!tt.ptr<f32>, #blocked>, tensor<1x1xi64, #blocked>
%42 = tt.atomic_rmw fadd, relaxed, gpu, %41, %36, %2 : (tensor<1x1x!tt.ptr<f32>, #blocked>, tensor<1x1xf32, #blocked>, tensor<1x1xi1, #blocked>) -> tensor<1x1xf32, #blocked>
tt.return
}
}
{-#
external_resources: {
mlir_reproducer: {
pipeline: "builtin.module(tritongpu-coalesce, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritonamdgpu-accelerate-matmul{arch-generation-name=gfx942 kPack=1 matrix-instruction-size=0}, tritongpu-remove-layout-conversions, tritonamdgpu-optimize-epilogue, tritonamdgpu-optimize-dot-operands{arch-generation-name=gfx942}, tt.func(tritonamdgpu-hoist-layout-conversions), tritongpu-fuse-nested-loops, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-licm, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritonamdgpu-stream-pipeline{global_prefetch=0 local_prefetch=0 num_stages=1 use_async_copy=false use_pingpong=true}, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritongpu-remove-layout-conversions, tritongpu-reduce-data-duplication, tt.func(tritonamdgpu-in-thread-transpose), tritongpu-remove-layout-conversions, tritonamdgpu-reorder-instructions, tt.func(tritonamdgpu-canonicalize-pointers), canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritonamdgpu-convert-buffer-ops{allow-buffer-atomics=true arch-generation-name=gfx942}, tritonamdgpu-fold-true-cmpi, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, cse, symbol-dce)",
disable_threading: false,
verify_each: true
}
}
#-}
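The mlir_reproducer block above records the pass pipeline that was being run when the assertion fired. Assuming a triton-opt binary from the same build (triton-opt is based on MlirOptMain, which provides MLIR's --run-reproducer support), the failure can in principle be replayed standalone by saving the dump (from the first #blocked line through #-}) as repro.mlir and running: triton-opt --run-reproducer repro.mlir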
/tmp/tmp90ym5rqn/2b/c2bniqfv7fdtefhrqbgqx3opxqba2unxatqmotxsupk5uym54xvk.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline
/tmp/tmp90ym5rqn/2b/c2bniqfv7fdtefhrqbgqx3opxqba2unxatqmotxsupk5uym54xvk.py:18:0: note: Pipeline failed while executing [`TritonAMDGPUCanonicalizePointers` on 'tt.func' operation: @triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] Triton compilation failed: triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] def triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1(in_ptr0, in_ptr1, in_ptr2, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xnumel = 5
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_numel = 10
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] R0_BLOCK: tl.constexpr = 16
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] rnumel = r0_numel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] RBLOCK: tl.constexpr = R0_BLOCK
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xoffset = tl.program_id(0) * XBLOCK
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] xmask = xindex < xnumel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_index = tl.arange(0, R0_BLOCK)[None, :]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_offset = 0
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_mask = r0_index < r0_numel
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] roffset = r0_offset
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] rindex = r0_index
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] r0_1 = r0_index
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] x0 = xindex
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp0 = tl.load(in_ptr0 + (r0_1 + 10*x0), r0_mask & xmask, other=0.0)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp0.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp1.dtype == tl.int64)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp4 = tl.load(in_ptr2 + (0))
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp4.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp5 = tl.broadcast_to(tmp4, [XBLOCK, R0_BLOCK])
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp5.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp2 = r0_1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp2.dtype == tl.int32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp3 = tmp2 != tmp1
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp3.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp3.dtype == tl.int1)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp6 = 0.02
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp6.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp6.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp7 = tmp5 * tmp6
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp7.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp7.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp8 = 0.0
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp9 = tl.where(tmp3, tmp7, tmp8)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp9.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp9.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp8.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp10 = tl.where(tmp0, tmp9, tmp8)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp10.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp10.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp11.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp13 = tl.where(r0_mask & xmask, tmp11, 0)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp13.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp14 = tl.sum(tmp13, 1)[:, None].to(tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp14.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.device_assert(((0 <= tmp1) & (tmp1 < 10)) | ~(xmask), "index out of bounds: 0 <= tmp1 < 10")
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tmp16 = -tmp14
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp16.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.static_assert(tmp16.dtype == tl.float32)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] tl.atomic_add(out_ptr1 + (tmp1 + 10*x0), tmp16, xmask, sem='relaxed')
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0]
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] metadata: {'signature': {'in_ptr0': '*i1', 'in_ptr1': '*i64', 'in_ptr2': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': 0, 'constants': {'XBLOCK': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'device_type': 'hip', 'num_warps': 1, 'num_stages': 1, 'debug': False, 'cc': 'gfx942'}
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] Traceback (most recent call last):
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 758, in _precompile_config
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] binary = triton.compile(*compile_args, **compile_kwargs)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/triton/python/triton/compiler/compiler.py", line 319, in compile
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] next_module = compile_ir(module, metadata)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 451, in <lambda>
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 258, in make_ttgir
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] pm.run(mod)
E0904 10:26:23.881000 1271446 torch/_inductor/runtime/triton_heuristics.py:760] [0/0] RuntimeError: PassManager::run failed
E
======================================================================
ERROR: test_comprehensive_nn_functional_multi_margin_loss_cuda_float32 (__main__.TestInductorOpInfoCUDA)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1135, in test_wrapper
return test(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1434, in only_fn
return fn(self, *args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 2361, in wrapper
fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
return fn(slf, *args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
return fn(slf, *args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1645, in wrapper
fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1560, in wrapper
fn(*args, **kwargs)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/unittest/mock.py", line 1379, in patched
return func(*newargs, **newkeywargs)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1118, in inner
raise e
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1110, in inner
fn(self, device, dtype, op)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1375, in test_comprehensive
raise e
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor_opinfo.py", line 1350, in test_comprehensive
self.check_model_gpu(
File "/home/dberard/.conda/envs/triton-env/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 685, in check_model_gpu
check_model(
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 633, in check_model
actual_grad = compute_grads(example_inputs, kwargs, actual, grads)
File "/data/users/dberard/triton-env/pytorch/test/inductor/test_torchinductor.py", line 408, in compute_grads
return torch.autograd.grad(
File "/data/users/dberard/triton-env/pytorch/torch/autograd/__init__.py", line 503, in grad
result = _engine_run_backward(
File "/data/users/dberard/triton-env/pytorch/torch/autograd/graph.py", line 841, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/data/users/dberard/triton-env/pytorch/torch/autograd/function.py", line 315, in apply
return user_fn(self, *args)
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2303, in backward
return impl_fn()
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2289, in impl_fn
out = CompiledFunction._backward_impl(ctx, all_args)
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2394, in _backward_impl
CompiledFunction.compiled_bw = aot_config.bw_compiler(
File "/data/users/dberard/triton-env/pytorch/torch/_functorch/_aot_autograd/schemas.py", line 1267, in __call__
return self.compiler_fn(gm, example_inputs)
File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/backends/common.py", line 82, in _wrapped_bw_compiler
disable(
File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/eval_frame.py", line 1035, in _fn
return fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/_utils_internal.py", line 92, in wrapper_function
return function(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 2581, in bw_compiler
return compile_fx_backward(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 2306, in compile_fx_backward
return inner_compile(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 781, in compile_fx_inner
return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
File "/data/users/dberard/triton-env/pytorch/torch/_dynamo/repro/after_aot.py", line 144, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 989, in _compile_fx_inner
raise InductorError(e, currentframe()).with_traceback(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 973, in _compile_fx_inner
mb_compiled_graph = fx_codegen_and_compile(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 1694, in fx_codegen_and_compile
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/compile_fx.py", line 1504, in codegen_and_compile
compiled_module = graph.compile_to_module()
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2319, in compile_to_module
return self._compile_to_module()
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2329, in _compile_to_module
mod = self._compile_to_module_lines(wrapper_code)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/graph.py", line 2397, in _compile_to_module_lines
mod = PyCodeCache.load_by_key_path(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/codecache.py", line 3527, in load_by_key_path
mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/compile_tasks.py", line 33, in _reload_python_module
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/tmp90ym5rqn/2u/c2urrlft4jn52zqrfqvwugrn7ztdmv52kwh5xrbulkq4gskiabt6.py", line 98, in <module>
triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1 = async_compile.triton('triton_per_fused_arange_div_expand_ne_neg_new_zeros_scalar_tensor_scatter_add_sum_where_1', '''
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/async_compile.py", line 498, in triton
kernel.precompile(
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 445, in precompile
self._precompile_worker()
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 467, in _precompile_worker
compile_results.append(self._precompile_config(c))
File "/data/users/dberard/triton-env/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 758, in _precompile_config
binary = triton.compile(*compile_args, **compile_kwargs)
File "/data/users/dberard/triton-env/triton/python/triton/compiler/compiler.py", line 319, in compile
next_module = compile_ir(module, metadata)
File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 451, in <lambda>
stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
File "/data/users/dberard/triton-env/triton/python/triton/backends/amd/compiler.py", line 258, in make_ttgir
pm.run(mod)
torch._inductor.exc.InductorError: RuntimeError: PassManager::run failed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
method(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
method(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
result = test(self, **param_kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_utils.py", line 1645, in wrapper
fn(*args, **kwargs)
File "/data/users/dberard/triton-env/pytorch/torch/testing/_internal/common_device_type.py", line 1147, in test_wrapper
raise e_tracked from e
Exception: Caused by sample input at index 3: SampleInput(input=Tensor[size=(5, 10), device="cuda:0", dtype=torch.float32], args=TensorList[Tensor[size=(5,), device="cuda:0", dtype=torch.int64]], kwargs={'margin': '1.0'}, broadcasts_input=False, name='')
To execute this test, run the following from the base repo dir:
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=3 python test/inductor/test_torchinductor_opinfo.py TestInductorOpInfoCUDA.test_comprehensive_nn_functional_multi_margin_loss_cuda_float32
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
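Outside the OpInfo harness, the crash can likely also be reproduced with a small torch.compile script built from the SampleInput above (a minimal sketch; shapes, dtypes, and margin are taken from the sample input, and it is not verified to produce the exact same Inductor kernel):

import torch
import torch.nn.functional as F

# SampleInput at index 3: input (5, 10) float32, target (5,) int64, margin=1.0
x = torch.randn(5, 10, device="cuda", dtype=torch.float32, requires_grad=True)
target = torch.randint(0, 10, (5,), device="cuda", dtype=torch.int64)

def fn(inp, tgt):
    return F.multi_margin_loss(inp, tgt, margin=1.0)

out = torch.compile(fn)(x, target)
# The failing kernel belongs to the backward graph, so a backward pass is required.
torch.autograd.grad(out, x, torch.ones_like(out))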
----------------------------------------------------------------------
Ran 1 test in 5.137s
FAILED (errors=1)