HDCharles · April 1, 2025 09:36
diff --git a/gistfile1.txt b/gistfile1.txt
 /home/cdhernandez/.conda/envs/pytorch-3.12/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
  self.gen = func(*args, **kwds)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] Output code: 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # AOT ID: ['0_inference']
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import torch
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import math
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import random
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import os
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import tempfile
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from math import inf, nan
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from cmath import nanj
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch import device, empty_strided
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton.language as tl
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     grid,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     split_scan_grid,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     grid_combo_kernels,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     start_graph,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     end_graph,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] )
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] aten = torch.ops.aten
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nw/cnw55j7m7wguamk45hegxbqannr3ua2whtxlkam2qoztukgvzhaa.py
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Topologically Sorted Source Nodes: [float_1, mul, mean, add, rsqrt, mul_1, output, mul_2], Original ATen: [aten._to_copy, aten.mul, aten.mean, aten.add, aten.rsqrt]
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   add => add
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   float_1 => convert_element_type
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   mean => mean
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   mul => mul
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   mul_1 => mul_1
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   mul_2 => mul_2
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   output => convert_element_type_1
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   rsqrt => rsqrt
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Graph fragment:
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %convert_element_type : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %convert_element_type), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %mean : [num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%mul, [-1], True), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mean, 1e-05), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %rsqrt : [num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %convert_element_type_1 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] #   %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %arg1_1), kwargs = {})
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] triton_red_fused__to_copy_add_mean_mul_rsqrt_0 = async_compile.triton('triton_red_fused__to_copy_add_mean_mul_rsqrt_0', '''
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton.language as tl
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     size_hints={'x': 4, 'r0_': 4096},
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     filename=__file__,
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 4), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_add_mean_mul_rsqrt_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] )
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] @triton.jit
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] def triton_red_fused__to_copy_add_mean_mul_rsqrt_0(in_ptr0, in_ptr1, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     xnumel = 4
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     r0_numel = 4096
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     rnumel = r0_numel
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     rbase = r0_base
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     x0 = xindex
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         roffset = r0_offset
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         rindex = r0_index
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         r0_1 = r0_index
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp2 = tmp1 * tmp1
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp5 = _tmp4 + tmp3
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         _tmp4 = tl.where(r0_mask & xmask, tmp5, _tmp4)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     tmp4 = tl.sum(_tmp4, 1)[:, None]
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         roffset = r0_offset
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         rindex = r0_index
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         r0_1 = r0_index
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp6 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp15 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp8 = 4096.0
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp9 = tmp4 / tmp8
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp10 = 1e-05
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp11 = tmp9 + tmp10
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp12 = libdevice.rsqrt(tmp11)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp13 = tmp7 * tmp12
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp14 = tmp13.to(tl.float32)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tmp16 = tmp14 * tmp15
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         tl.store(out_ptr1 + (r0_1 + 4096*x0), tmp16, r0_mask & xmask)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] ''', device_str='cuda')
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] async_compile.wait(globals())
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] del async_compile
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] def call(args):
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     arg0_1, arg1_1 = args
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     args.clear()
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     assert_size_stride(arg0_1, (4, 1, 4096), (4096, 4096, 1))
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     assert_size_stride(arg1_1, (4096, ), (1, ))
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         buf1 = empty_strided_cuda((4, 1, 4096), (4096, 4096, 1), torch.bfloat16)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         # Topologically Sorted Source Nodes: [float_1, mul, mean, add, rsqrt, mul_1, output, mul_2], Original ATen: [aten._to_copy, aten.mul, aten.mean, aten.add, aten.rsqrt]
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         triton_red_fused__to_copy_add_mean_mul_rsqrt_0.run(arg0_1, arg1_1, buf1, 4, 4096, grid=grid(4), stream=stream0)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         del arg0_1
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]         del arg1_1
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     return (buf1, )
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     arg0_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     arg1_1 = rand_strided((4096, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     fn = lambda: call([arg0_1, arg1_1])
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] if __name__ == "__main__":
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] 
 V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1092] [2/0_1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/kv/ckvcpd3fzxrnga3hqj7ehqmfd7ztquytgkda3j6jajm6zzru7noz.py
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] Output code: 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # AOT ID: ['1_inference']
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import torch
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import math
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import random
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import os
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import tempfile
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from math import inf, nan
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from cmath import nanj
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch import device, empty_strided
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton.language as tl
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     grid,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     split_scan_grid,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     grid_combo_kernels,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     start_graph,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     end_graph,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] )
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] aten = torch.ops.aten
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/sz/cszgef2zhtwtfb5ukfdazs4v2uocs3t6l54we5tv4pgl32s324yr.py
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [expert_weights], Original ATen: [aten._softmax]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   expert_weights => amax, convert_element_type_2, convert_element_type_3, div, exp, sub, sum_1
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Graph fragment:
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %convert_element_type_2 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mm, torch.float32), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %amax : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%convert_element_type_2, [-1], True), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_2, %amax), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %exp : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub,), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp, [-1], True), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %div : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp, %sum_1), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %convert_element_type_3 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div, torch.bfloat16), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_per_fused__softmax_0 = async_compile.triton('triton_per_fused__softmax_0', '''
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton.language as tl
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton_heuristics.persistent_reduction(
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     size_hints={'x': 4, 'r0_': 8},
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     filename=__file__,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0,), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused__softmax_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 2, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] )
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton.jit
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def triton_per_fused__softmax_0(in_out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr):
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xnumel = 4
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     r0_numel = 8
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     R0_BLOCK: tl.constexpr = 8
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     rnumel = r0_numel
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     r0_index = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     r0_offset = 0
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     roffset = r0_offset
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     rindex = r0_index
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     r0_1 = r0_index
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     x0 = xindex
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (r0_1 + 8*x0), xmask, other=0.0).to(tl.float32)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp4 = tl.where(xmask, tmp2, float("-inf"))
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp5 = triton_helpers.max2(tmp4, 1)[:, None]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp6 = tmp1 - tmp5
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp7 = tl_math.exp(tmp6)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp10 = tl.where(xmask, tmp8, 0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp11 = tl.sum(tmp10, 1)[:, None]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp12 = tmp7 / tmp11
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp13 = tmp12.to(tl.float32)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tl.store(in_out_ptr0 + (r0_1 + 8*x0), tmp13, xmask)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/5g/c5gztvklis5blbr72aeuxee7ogqe3urzratlqymlsfl5rml5odfk.py
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [sum_1, expert_weights_2], Original ATen: [aten.sum, aten.div]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   expert_weights_2 => div_1
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   sum_1 => sum_2
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Graph fragment:
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%getitem, [-1], True), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] #   %div_1 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%getitem, %sum_2), kwargs = {})
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_poi_fused_div_sum_1 = async_compile.triton('triton_poi_fused_div_sum_1', '''
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton.language as tl
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     size_hints={'x': 8}, 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     filename=__file__,
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_div_sum_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] )
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton.jit
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def triton_poi_fused_div_sum_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xnumel = 8
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     x2 = xindex
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     x1 = xindex // 2
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp1 = tl.load(in_ptr0 + (2*x1), xmask, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp2 = tl.load(in_ptr0 + (1 + 2*x1), xmask, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp3 = tmp1 + tmp2
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tmp4 = tmp0 / tmp3
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     tl.store(out_ptr0 + (x2), tmp4, xmask)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] async_compile.wait(globals())
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] del async_compile
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def call(args):
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     arg0_1, arg1_1 = args
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     args.clear()
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     assert_size_stride(arg0_1, (4, 1, 4096), (4096, 4096, 1))
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     assert_size_stride(arg1_1, (8, 4096), (4096, 1))
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         buf0 = empty_strided_cuda((4, 8), (8, 1), torch.bfloat16)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         # Topologically Sorted Source Nodes: [scores], Original ATen: [aten.mm]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         extern_kernels.mm(reinterpret_tensor(arg0_1, (4, 4096), (4096, 1), 0), reinterpret_tensor(arg1_1, (4096, 8), (1, 4096), 0), out=buf0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         del arg1_1
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         buf3 = buf0; del buf0  # reuse
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         # Topologically Sorted Source Nodes: [expert_weights], Original ATen: [aten._softmax]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         triton_per_fused__softmax_0.run(buf3, 4, 8, grid=grid(4), stream=stream0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         # Topologically Sorted Source Nodes: [expert_weights, topk], Original ATen: [aten._softmax, aten.topk]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         buf4 = torch.ops.aten.topk.default(buf3, 2)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         del buf3
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         buf5 = buf4[0]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         assert_size_stride(buf5, (4, 2), (2, 1))
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         buf6 = buf4[1]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         assert_size_stride(buf6, (4, 2), (2, 1))
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         del buf4
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         buf7 = empty_strided_cuda((4, 2), (2, 1), torch.bfloat16)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         # Topologically Sorted Source Nodes: [sum_1, expert_weights_2], Original ATen: [aten.sum, aten.div]
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         triton_poi_fused_div_sum_1.run(buf5, buf7, 8, grid=grid(8), stream=stream0)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]         del buf5
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     return (reinterpret_tensor(arg0_1, (4, 4096), (4096, 1), 0), buf6, buf7, )
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     arg0_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     arg1_1 = rand_strided((8, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     fn = lambda: call([arg0_1, arg1_1])
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] if __name__ == "__main__":
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] 
 V0401 02:34:29.314000 3240940 site-packages/torch/_inductor/codecache.py:1092] [3/0_1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/sa/csa7hmt3qvuirshz463bgy4sd3olawgiy7zmlwnkhgzoiuxkvuas.py
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] Output code: 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # AOT ID: ['2_inference']
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import torch
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import math
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import random
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import os
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import tempfile
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from math import inf, nan
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from cmath import nanj
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch import device, empty_strided
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton.language as tl
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     grid,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     split_scan_grid,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     grid_combo_kernels,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     start_graph,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     end_graph,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] )
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] aten = torch.ops.aten
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/x5/cx5dcpwwkbcak7ec6f42gznrnizwakgvvysewhahd357r2afaoky.py
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [cumsum], Original ATen: [aten.cumsum]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   cumsum => cumsum
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Graph fragment:
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   %cumsum : [num_users=1] = call_function[target=torch.ops.aten.cumsum.default](args = (%histc, 0), kwargs = {})
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_per_fused_cumsum_0 = async_compile.triton('triton_per_fused_cumsum_0', '''
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton.language as tl
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton.jit
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def _triton_helper_fn_add0(arg0_0, arg1_0):
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp0 = arg0_0 + arg1_0
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     return tmp0
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton_heuristics.persistent_reduction(
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     size_hints={'x': 1, 'r0_': 16},
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     filename=__file__,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*i64', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0,), 'tt.equal_to': (1,)}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_cumsum_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] )
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton.jit
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def triton_per_fused_cumsum_0(in_out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr):
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xnumel = 1
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_numel = 9
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     R0_BLOCK: tl.constexpr = 16
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     rnumel = r0_numel
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_index = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_offset = 0
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_mask = r0_index < r0_numel
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     roffset = r0_offset
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     rindex = r0_index
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_0 = r0_index
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (r0_0), r0_mask, other=0.0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp1 = tmp0.to(tl.int64)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp3, = tl.associative_scan((tmp2,), 1, _triton_helper_fn_add0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tl.store(in_out_ptr0 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp3, r0_mask)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/2l/c2ly6tanvidk6eyla4mydbbwkltgwzimgveg677mmndzmp3d4dag.py
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [ordered_token_activations, div, floor, ordered_token_indices], Original ATen: [aten.sort, aten.div, aten.floor, aten._to_copy]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   div => div
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   floor => floor
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   ordered_token_activations => getitem_1, sort
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   ordered_token_indices => convert_element_type
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Graph fragment:
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   %sort : [num_users=1] = call_function[target=torch.ops.aten.sort.stable](args = (%view,), kwargs = {stable: True})
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   %getitem_1 : [num_users=2] = call_function[target=operator.getitem](args = (%sort, 1), kwargs = {})
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   %div : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%getitem_1, 2), kwargs = {})
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   %floor : [num_users=1] = call_function[target=torch.ops.aten.floor.default](args = (%div,), kwargs = {})
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] #   %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%floor, torch.int64), kwargs = {})
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_per_fused__to_copy_div_floor_sort_1 = async_compile.triton('triton_per_fused__to_copy_div_floor_sort_1', '''
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton.language as tl
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton_heuristics.persistent_reduction(
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     size_hints={'x': 1, 'r0_': 8},
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     filename=__file__,
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'out_ptr1': '*i64', 'out_ptr2': '*i64', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': (3,)}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_div_floor_sort_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] )
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton.jit
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def triton_per_fused__to_copy_div_floor_sort_1(in_ptr0, out_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr):
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xnumel = 1
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_numel = 8
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     R0_BLOCK: tl.constexpr = 8
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     rnumel = r0_numel
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_index = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_offset = 0
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     roffset = r0_offset
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     rindex = r0_index
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     r0_0 = r0_index
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp0 = tl.load(in_ptr0 + (r0_0), None)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp1 = r0_0
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp2 = tmp1.to(tl.int16)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp4 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp5, tmp6, = triton_helpers.sort_with_index(tmp3, tmp4, None, 1, stable=True, descending=False)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp7 = tmp6.to(tl.int64)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp8 = tmp7.to(tl.float32)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp9 = 0.5
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp10 = tmp8 * tmp9
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp11 = libdevice.floor(tmp10)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tmp12 = tmp11.to(tl.int64)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tl.store(out_ptr1 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp7, None)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     tl.store(out_ptr2 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp12, None)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] async_compile.wait(globals())
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] del async_compile
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def call(args):
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     arg0_1, = args
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     args.clear()
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     assert_size_stride(arg0_1, (4, 2), (2, 1))
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         # Topologically Sorted Source Nodes: [num_tokens_per_expert], Original ATen: [aten.histc]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         buf4 = torch.ops.aten.histc.default(arg0_1, 9, -1, 8)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         buf5 = buf4
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         assert_size_stride(buf5, (9, ), (1, ))
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         del buf4
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         buf6 = buf5; del buf5  # reuse
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         # Topologically Sorted Source Nodes: [cumsum], Original ATen: [aten.cumsum]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         triton_per_fused_cumsum_0.run(buf6, 1, 9, grid=grid(1), stream=stream0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         buf2 = empty_strided_cuda((8, ), (1, ), torch.int64)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         buf3 = empty_strided_cuda((8, ), (1, ), torch.int64)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         # Topologically Sorted Source Nodes: [ordered_token_activations, div, floor, ordered_token_indices], Original ATen: [aten.sort, aten.div, aten.floor, aten._to_copy]
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         triton_per_fused__to_copy_div_floor_sort_1.run(arg0_1, buf2, buf3, 1, 8, grid=grid(1), stream=stream0)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]         del arg0_1
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     return (buf2, buf3, buf6, )
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     arg0_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     fn = lambda: call([arg0_1])
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] if __name__ == "__main__":
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] 
 V0401 02:34:29.365000 3240940 site-packages/torch/_inductor/codecache.py:1092] [4/0_1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/iz/cizfitgcjxxo52caxetfsorvxrxg6uvxvkugusw3tk2kha2klabs.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] Output code: 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # AOT ID: ['3_inference']
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import torch
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import random
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import os
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import tempfile
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from math import inf, nan
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from cmath import nanj
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch import device, empty_strided
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     grid,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     split_scan_grid,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     grid_combo_kernels,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     start_graph,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     end_graph,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] aten = torch.ops.aten
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/o4/co4xnn2io22jfgnxz4ht5jvbf2llatrrh2jxbjcrgdchs4g4oage.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   cur_x => index
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %index : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg0_1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 12288
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x1 = xindex // 4096
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x2 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/yj/cyj7p7xqlcmuj23z3eb2yyzznvkflquhslgxtbejde24is7cpycm.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear => convert_element_type
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute, torch.bfloat16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_1 = async_compile.triton('triton_poi_fused__to_copy_1', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 58720256
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/sl/cslemfnvmno5niab2n5zknfoqfgbyu7zy6qecwrz2iim5mdzojk7.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear => mul
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   mul => mul_2
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   silu => convert_element_type_3, convert_element_type_4, mul_1, sigmoid
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %select_1), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_3 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.float32), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_3,), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %sigmoid), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %mm_1), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_2 = async_compile.triton('triton_poi_fused_mul_silu_2', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 65536}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_mul_silu_2(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 43008
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x2 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, xmask)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/64/c64hgrupdizact3b6llutwalvnx33ksi5ddojikmmstwkklowf3j.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_6, linear_7], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear_6 => mul_6, sum_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear_7 => mul_9, sum_2
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_9, [1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_red_fused_mm_3 = async_compile.triton('triton_red_fused_mm_3', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 16384, 'r0_': 4096},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     reduction_hint=ReductionHint.DEFAULT,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_red_fused_mm_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 14336
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     r0_numel = 4096
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     rnumel = r0_numel
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     rbase = r0_base
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (0))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         roffset = r0_offset
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         rindex = r0_index
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         r0_1 = r0_index
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp8 = tl.load(in_ptr2 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp14 = tl.load(in_ptr3 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp3 = tmp1 + tmp2
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp4 = tmp1 < 0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp5 = tl.where(tmp4, tmp3, tmp1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp13 = _tmp12 + tmp11
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp15 = tmp14.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp16 = tmp7 * tmp15
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp19 = _tmp18 + tmp17
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp12 = tl.sum(_tmp12, 1)[:, None]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp18 = tl.sum(_tmp18, 1)[:, None]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp12, xmask)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr1 + (x0), tmp18, xmask)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/p4/cp44ewu7oadqvpm7kqql2mch4jkm7suwnms7dg3lxm6kymm24pkq.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   cur_out_2 => mul_11, sum_3
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_11, [1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_red_fused_mm_4 = async_compile.triton('triton_red_fused_mm_4', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_red_fused_mm_4(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 4096
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     r0_numel = 14336
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     rnumel = r0_numel
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     rbase = r0_base
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         roffset = r0_offset
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         rindex = r0_index
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         r0_1 = r0_index
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp2 = tl.load(in_ptr1 + (28672 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp12 = tl.load(in_ptr3 + (117440512 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/hz/chzpp3oidskhj7g4vx6jgtqvyd4whdkmzx6oypxs2tyfu45hwcqw.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_x_3, cur_x_6], Original ATen: [aten.index]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   cur_x_3 => index_3
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   cur_x_6 => index_6
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %index_3 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg4_1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %index_6 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg7_1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_index_5 = async_compile.triton('triton_poi_fused_index_5', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 8192}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i64', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_index_5(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 8192
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x1 = xindex // 4096
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x2 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp6 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp7 = tmp6 + tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp8 = tmp6 < 0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp9 = tl.where(tmp8, tmp7, tmp6)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp10 = tl.load(in_ptr1 + (x0 + 4096*tmp9), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr1 + (x2), tmp10, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/la/clahscggfyxr4yqj5gamqksnyomqxgibxjiqa3ridzteknsvnwjm.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear_9 => convert_element_type_30
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_30 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_9, torch.bfloat16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_6 = async_compile.triton('triton_poi_fused__to_copy_6', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused__to_copy_6(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 58720256
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (176160768 + x0), None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/us/cusajh75vrnwh42pn5f4rv4knbjqkieyfcs5ogoltegkfmfkol4c.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear_9 => mul_12
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   mul_3 => mul_14
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   silu_3 => convert_element_type_33, convert_element_type_34, mul_13, sigmoid_3
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_6, %select_16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_33 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_12, torch.float32), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %sigmoid_3 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_33,), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_13 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_33, %sigmoid_3), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_34 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_13, torch.bfloat16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_14 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_34, %mm_7), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_7 = async_compile.triton('triton_poi_fused_mul_silu_7', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_mul_silu_7(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 28672
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x2 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.load(in_ptr0 + (43008 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/bn/cbntbuf2uowesjcvj7pijik2jtcococuwohpnparrrikin5bb3is.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear_18 => convert_element_type_57
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_57 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_18, torch.bfloat16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_8 = async_compile.triton('triton_poi_fused__to_copy_8', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 58720256
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (352321536 + x0), None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xk/cxkxa4iqtzpndenpxumel4jvuld773zjmzkir4txf2hm5n7vrya4.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   linear_18 => mul_21
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   mul_6 => mul_23
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   silu_6 => convert_element_type_60, convert_element_type_61, mul_22, sigmoid_6
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_15, %select_31), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_60 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_21, torch.float32), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %sigmoid_6 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_60,), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_22 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_60, %sigmoid_6), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %convert_element_type_61 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_22, torch.bfloat16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_61, %mm_16), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_9 = async_compile.triton('triton_poi_fused_mul_silu_9', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_mul_silu_9(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 28672
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x2 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.load(in_ptr0 + (86016 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ct/cctreepea7pb6izuwtwnqatvjccnzanpmpyx44n36pl3c3zqxuyk.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   getitem_32 => index_8
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   ordered_outs => cat
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   weighted_ordered_outs => mul_27
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%mm_2, %convert_element_type_29, %mm_8, %mm_17],), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg15_1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_cat_index_mul_view_10 = async_compile.triton('triton_poi_fused_cat_index_mul_view_10', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_10', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_cat_index_mul_view_10(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 32768
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x1 = xindex // 4096
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x2 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp26 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = x1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp1 = tl.full([1], 0, tl.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp2 = tmp0 >= tmp1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp3 = tl.full([1], 3, tl.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp4 = tmp0 < tmp3
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp5 = tl.load(in_ptr0 + (x0 + 4096*(x1)), tmp4, other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp6 = tmp0 >= tmp3
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp7 = tl.full([1], 4, tl.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp8 = tmp0 < tmp7
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp9 = tmp6 & tmp8
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp10 = tl.load(in_ptr1 + (x0), tmp9, eviction_policy='evict_last', other=0.0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp11 = tmp10.to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp12 = tl.full(tmp11.shape, 0.0, tmp11.dtype)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp13 = tl.where(tmp9, tmp11, tmp12)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp14 = tmp0 >= tmp7
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp15 = tl.full([1], 6, tl.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp16 = tmp0 < tmp15
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp17 = tmp14 & tmp16
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp18 = tl.load(in_ptr2 + (x0 + 4096*((-4) + x1)), tmp17, other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp19 = tmp0 >= tmp15
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp20 = tl.full([1], 8, tl.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp21 = tmp0 < tmp20
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp22 = tl.load(in_ptr3 + (x0 + 4096*((-6) + x1)), tmp19, other=0.0).to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp23 = tl.where(tmp17, tmp18, tmp22)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp24 = tl.where(tmp9, tmp13, tmp23)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp25 = tl.where(tmp4, tmp5, tmp24)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp27 = tl.full([XBLOCK], 8, tl.int32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp28 = tmp26 + tmp27
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp29 = tmp26 < 0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp30 = tl.where(tmp29, tmp28, tmp26)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp31 = tl.load(in_ptr5 + (tmp30), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp32 = tmp25 * tmp31
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x2), tmp32, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/lx/clxfjvsgrm7woaqpniy5p2traw2xugy7umu7g2rnslg6lyqzyhtu.py
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [final_out, final_out_1], Original ATen: [aten.zeros_like, aten.scatter_add]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   final_out => full_default
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   final_out_1 => scatter_add
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment:
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] #   %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_27), kwargs = {})
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_scatter_add_zeros_like_11 = async_compile.triton('triton_poi_fused_scatter_add_zeros_like_11', '''
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     filename=__file__,
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_scatter_add_zeros_like_11', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_scatter_add_zeros_like_11(out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xnumel = 16384
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     x0 = xindex
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tmp0 = 0.0
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp0, None)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] async_compile.wait(globals())
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del async_compile
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def call(args):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1 = args
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     args.clear()
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg0_1, (3, ), (1, ))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg1_1, (4, 4096), (4096, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg3_1, (1, ), (1, ))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg4_1, (2, ), (1, ))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg7_1, (2, ), (1, ))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg9_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg10_1, (8, 14336), (14336, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg11_1, (8, 14336), (14336, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg12_1, (8, 4096, 14336), (58720256, 14336, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg13_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg14_1, (4, 2), (2, 1))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg15_1, (8, ), (1, ))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     assert_size_stride(arg16_1, (8, ), (1, ))
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf0 = empty_strided_cuda((3, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_index_0.run(arg0_1, arg1_1, buf0, 12288, grid=grid(12288), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg0_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf1 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused__to_copy_1.run(arg9_1, buf1, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf2 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf0, buf1, out=buf2)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf3 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf0, reinterpret_tensor(arg13_1, (4096, 14336), (1, 4096), 0), out=buf3)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf4 = buf2; del buf2  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_mul_silu_2.run(buf4, arg10_1, buf3, 43008, grid=grid(43008), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf3
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf5 = buf0; del buf0  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear, silu, mul, cur_out], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf4, reinterpret_tensor(arg12_1, (14336, 4096), (1, 14336), 0), out=buf5)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf4
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_6, linear_7], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_red_fused_mm_3.run(arg3_1, arg1_1, arg9_1, arg13_1, buf6, buf7, 14336, 4096, grid=grid(14336), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg3_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_red_fused_mm_4.run(buf6, arg10_1, buf7, arg12_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf6
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf7
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf9 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf15 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [cur_x_3, cur_x_6], Original ATen: [aten.index]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_index_5.run(arg4_1, arg1_1, arg7_1, buf9, buf15, 8192, grid=grid(8192), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg1_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg4_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg7_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf10 = buf1; del buf1  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused__to_copy_6.run(arg9_1, buf10, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf11 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf9, buf10, out=buf11)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf12 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_10], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf9, reinterpret_tensor(arg13_1, (4096, 14336), (1, 4096), 176160768), out=buf12)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf13 = buf11; del buf11  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_mul_silu_7.run(buf13, arg10_1, buf12, 28672, grid=grid(28672), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf14 = buf9; del buf9  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3, cur_out_3], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf13, reinterpret_tensor(arg12_1, (14336, 4096), (1, 14336), 176160768), out=buf14)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf16 = buf10; del buf10  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused__to_copy_8.run(arg9_1, buf16, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg9_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf17 = buf13; del buf13  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf15, buf16, out=buf17)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf16
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf18 = buf12; del buf12  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_19], Original ATen: [aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf15, reinterpret_tensor(arg13_1, (4096, 14336), (1, 4096), 352321536), out=buf18)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg13_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf19 = buf17; del buf17  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_mul_silu_9.run(buf19, arg10_1, buf18, 28672, grid=grid(28672), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg10_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf18
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf20 = buf15; del buf15  # reuse
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6, cur_out_6], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         extern_kernels.mm(buf19, reinterpret_tensor(arg12_1, (14336, 4096), (1, 14336), 352321536), out=buf20)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg12_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf19
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf21 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_cat_index_mul_view_10.run(buf5, buf8, buf14, buf20, arg15_1, arg14_1, buf21, 32768, grid=grid(32768), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg14_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg15_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf14
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf20
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf5
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf8
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         buf22 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         # Topologically Sorted Source Nodes: [final_out, final_out_1], Original ATen: [aten.zeros_like, aten.scatter_add]
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         triton_poi_fused_scatter_add_zeros_like_11.run(buf22, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         aten.scatter_reduce_.two(buf22,0,reinterpret_tensor(arg16_1, (8, 4096), (1, 0), 0),buf21, reduce='sum', include_self=True)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del arg16_1
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]         del buf21
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     return (buf22, )
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg0_1 = rand_strided((3, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg2_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg3_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg4_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg5_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg6_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg7_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg8_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg9_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg10_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg11_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg12_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg13_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg14_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg15_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     arg16_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1])
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] if __name__ == "__main__":
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] 
 V0401 02:34:29.616000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/0] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/27/c27evdctcr5rhwr4lru3xk34rbtzfmvgxoznmbsawgwtxx5f6wwm.py
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] Output code: 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] # AOT ID: ['4_inference']
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import torch
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import math
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import random
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import os
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import tempfile
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from math import inf, nan
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from cmath import nanj
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch import device, empty_strided
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] aten = torch.ops.aten
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] async_compile.wait(globals())
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] del async_compile
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] def call(args):
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     arg0_1, = args
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     args.clear()
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     assert_size_stride(arg0_1, (4, 4096), (4096, 1))
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     return (reinterpret_tensor(arg0_1, (4, 1, 4096), (4096, 4096, 1), 0), )
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     arg0_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     fn = lambda: call([arg0_1])
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] if __name__ == "__main__":
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] 
 V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1092] [8/0] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/m5/cm5thqm6z4ynaqklrs7ic6cv4pgvjmeyyf3i3jrqp3u7zfrxvkbp.py
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] Output code: 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # AOT ID: ['5_inference']
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import torch
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import math
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import random
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import os
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import tempfile
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from math import inf, nan
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from cmath import nanj
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch import device, empty_strided
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton.language as tl
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     grid,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     split_scan_grid,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     grid_combo_kernels,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     start_graph,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     end_graph,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] )
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] aten = torch.ops.aten
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ri/crizb6pffeptnetkgqxd7sozjkjnq6ne5zh2c2mhrig66r6wdi7p.py
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Topologically Sorted Source Nodes: [out], Original ATen: [aten.add]
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] #   out => add
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Graph fragment:
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] #   %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg0_1, %arg1_1), kwargs = {})
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] triton_poi_fused_add_0 = async_compile.triton('triton_poi_fused_add_0', '''
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton.language as tl
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     filename=__file__,
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     min_elem_per_thread=0
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] )
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] @triton.jit
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] def triton_poi_fused_add_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     xnumel = 16384
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     x0 = xindex
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     tmp1 = tl.load(in_ptr1 + (x0), None).to(tl.float32)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     tl.store(out_ptr0 + (x0), tmp2, None)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] ''', device_str='cuda')
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] async_compile.wait(globals())
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] del async_compile
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] def call(args):
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     arg0_1, arg1_1 = args
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     args.clear()
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     assert_size_stride(arg0_1, (4, 1, 4096), (4096, 4096, 1))
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     assert_size_stride(arg1_1, (4, 1, 4096), (4096, 4096, 1))
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         buf0 = empty_strided_cuda((4, 1, 4096), (4096, 4096, 1), torch.bfloat16)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         # Topologically Sorted Source Nodes: [out], Original ATen: [aten.add]
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         triton_poi_fused_add_0.run(arg0_1, arg1_1, buf0, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         del arg0_1
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]         del arg1_1
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     return (buf0, )
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     arg0_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     arg1_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     fn = lambda: call([arg0_1, arg1_1])
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] if __name__ == "__main__":
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] 
 V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1092] [9/0] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/pr/cprgnlfeu2ugvkfc3eeslbdmpypbt4grnfduu3t5ayen3dpmlt23.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] Output code: 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # AOT ID: ['6_inference']
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import torch
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import random
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import os
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import tempfile
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from math import inf, nan
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from cmath import nanj
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch import device, empty_strided
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     grid,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     split_scan_grid,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     grid_combo_kernels,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     start_graph,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     end_graph,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] aten = torch.ops.aten
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xu/cxubyjb4f3dfq4lxjugwcaa5ojabp22ovyl6ijcs5lrkzaqqadka.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_3, linear_4, linear_12, linear_13, linear_15, linear_16, linear_18, linear_19], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_12 => mul_15, sum_4
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_13 => mul_18, sum_5
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_15 => mul_21, sum_7
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_16 => mul_24, sum_8
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_18 => mul_27, sum_10
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_19 => mul_30, sum_11
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_3 => mul_3, sum_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_4 => mul_6, sum_2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_21, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_24, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_18, %unsqueeze_19), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_10 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_27, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_20, %unsqueeze_21), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_11 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_0 = async_compile.triton('triton_red_fused_mm_0', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 16384, 'r0_': 4096},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     reduction_hint=ReductionHint.DEFAULT,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'in_ptr6': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'out_ptr6': '*fp32', 'out_ptr7': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 12, 'num_reduction': 8, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, out_ptr6, out_ptr7, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 14336
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_numel = 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rnumel = r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rbase = r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp0 = tl.load(in_ptr0 + (0))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp20 = tl.load(in_ptr4 + (0))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp39 = tl.load(in_ptr5 + (0))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp40 = tl.broadcast_to(tmp39, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp50 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp56 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp58 = tl.load(in_ptr6 + (0))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp59 = tl.broadcast_to(tmp58, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp69 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp75 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         roffset = r0_offset
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         rindex = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_1 = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp8 = tl.load(in_ptr2 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp14 = tl.load(in_ptr3 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp27 = tl.load(in_ptr2 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp33 = tl.load(in_ptr3 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp46 = tl.load(in_ptr2 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp52 = tl.load(in_ptr3 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp65 = tl.load(in_ptr2 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp71 = tl.load(in_ptr3 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp3 = tmp1 + tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp4 = tmp1 < 0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp5 = tl.where(tmp4, tmp3, tmp1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp13 = _tmp12 + tmp11
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp15 = tmp14.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp16 = tmp7 * tmp15
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp19 = _tmp18 + tmp17
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp22 = tmp21 + tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp23 = tmp21 < 0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp24 = tl.where(tmp23, tmp22, tmp21)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp26 = tmp25.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp28 = tmp27.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp29 = tmp26 * tmp28
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp32 = _tmp31 + tmp30
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp34 = tmp33.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp35 = tmp26 * tmp34
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp38 = _tmp37 + tmp36
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp41 = tmp40 + tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp42 = tmp40 < 0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp43 = tl.where(tmp42, tmp41, tmp40)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp44 = tl.load(in_ptr1 + (r0_1 + 4096*tmp43), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp45 = tmp44.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp47 = tmp46.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp48 = tmp45 * tmp47
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp49 = tl.broadcast_to(tmp48, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp51 = _tmp50 + tmp49
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp50 = tl.where(r0_mask & xmask, tmp51, _tmp50)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp53 = tmp52.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp54 = tmp45 * tmp53
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp55 = tl.broadcast_to(tmp54, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp57 = _tmp56 + tmp55
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp56 = tl.where(r0_mask & xmask, tmp57, _tmp56)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp60 = tmp59 + tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp61 = tmp59 < 0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp62 = tl.where(tmp61, tmp60, tmp59)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp63 = tl.load(in_ptr1 + (r0_1 + 4096*tmp62), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp64 = tmp63.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp66 = tmp65.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp67 = tmp64 * tmp66
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp68 = tl.broadcast_to(tmp67, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp70 = _tmp69 + tmp68
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp69 = tl.where(r0_mask & xmask, tmp70, _tmp69)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp72 = tmp71.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp73 = tmp64 * tmp72
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp74 = tl.broadcast_to(tmp73, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp76 = _tmp75 + tmp74
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp75 = tl.where(r0_mask & xmask, tmp76, _tmp75)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp12 = tl.sum(_tmp12, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp18 = tl.sum(_tmp18, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp31 = tl.sum(_tmp31, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp37 = tl.sum(_tmp37, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp50 = tl.sum(_tmp50, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp56 = tl.sum(_tmp56, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp69 = tl.sum(_tmp69, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp75 = tl.sum(_tmp75, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp12, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr1 + (x0), tmp18, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr2 + (x0), tmp31, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr3 + (x0), tmp37, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr4 + (x0), tmp50, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr5 + (x0), tmp56, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr6 + (x0), tmp69, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr7 + (x0), tmp75, xmask)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ur/curt75b3i4q2ncyfteskzogwmeozmkcorskoajpt6t5cxsox3shf.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   cur_out_1 => mul_8, sum_3
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_8 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_8, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_1 = async_compile.triton('triton_red_fused_mm_1', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_1(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_numel = 14336
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rnumel = r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rbase = r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         roffset = r0_offset
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         rindex = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_1 = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp2 = tl.load(in_ptr1 + (14336 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp12 = tl.load(in_ptr3 + (58720256 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nv/cnvigyarqkoqkrzao6xsn6e2aq3m2k6tu3qmtpxrwxa4r5jseolx.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   cur_out_4 => mul_20, sum_6
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_20 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_20, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_2 = async_compile.triton('triton_red_fused_mm_2', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_numel = 14336
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rnumel = r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rbase = r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         roffset = r0_offset
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         rindex = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_1 = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp2 = tl.load(in_ptr1 + (57344 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp12 = tl.load(in_ptr3 + (234881024 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/df/cdf5pfpbj4fujw6qckkl2cwjemrdxy2tlk77lleu3lglvjxmqoxu.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   cur_out_5 => mul_26, sum_9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_16, %unsqueeze_17), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_9 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_26, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_3 = async_compile.triton('triton_red_fused_mm_3', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_numel = 14336
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rnumel = r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rbase = r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         roffset = r0_offset
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         rindex = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_1 = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp2 = tl.load(in_ptr1 + (71680 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp12 = tl.load(in_ptr3 + (293601280 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/4e/c4ezrplevfovmql56fqmbjyg3ukwfiepbujmfc366pbac235zmy2.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   cur_out_6 => mul_32, sum_12
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_32 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_22, %unsqueeze_23), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sum_12 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_32, [1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_4 = async_compile.triton('triton_red_fused_mm_4', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_4(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_numel = 14336
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rnumel = r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     rbase = r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         roffset = r0_offset
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         rindex = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         r0_1 = r0_index
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp2 = tl.load(in_ptr1 + (86016 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp12 = tl.load(in_ptr3 + (352321536 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/iw/ciwidkcd3ljl5a5zq2hjsxygmbf64oghthcmaew3lnibva6zev7h.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_x_7], Original ATen: [aten.index]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   cur_x_7 => index_7
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %index_7 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg11_1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_index_5 = async_compile.triton('triton_poi_fused_index_5', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     min_elem_per_thread=0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_index_5(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 16384
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x1 = xindex // 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x2 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ui/cuircasukcqg6f73z26qb6qfe5gfxff4efagdgqou4pdcrmpi7nj.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_21 => convert_element_type_75
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %convert_element_type_75 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_21, torch.bfloat16), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused__to_copy_6 = async_compile.triton('triton_poi_fused__to_copy_6', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     min_elem_per_thread=0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused__to_copy_6(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 58720256
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp0 = tl.load(in_ptr0 + (411041792 + x0), None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/bk/cbknq2eijpxdvrpijconf66fgurvapv6anth5r24fs4ukfzsebkj.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   linear_21 => mul_33
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   mul_7 => mul_35
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   silu_7 => convert_element_type_78, convert_element_type_79, mul_34, sigmoid_7
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_9, %select_36), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %convert_element_type_78 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_33, torch.float32), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %sigmoid_7 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_78,), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_34 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_78, %sigmoid_7), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %convert_element_type_79 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_34, torch.bfloat16), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_79, %mm_10), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_mul_silu_7 = async_compile.triton('triton_poi_fused_mul_silu_7', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 65536}, 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     min_elem_per_thread=0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_mul_silu_7(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 57344
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x2 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp1 = tl.load(in_ptr0 + (100352 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ll/clljbokytk2lektwzfmhr7qpaxzbo5jvh6ocbzqat67rzsxp2hzg.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   getitem_32 => index_8
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   ordered_outs => cat
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   weighted_ordered_outs => mul_36
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_20, %convert_element_type_50, %convert_element_type_62, %convert_element_type_74, %mm_11],), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg18_1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_cat_index_mul_view_8 = async_compile.triton('triton_poi_fused_cat_index_mul_view_8', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'in_ptr5': '*i64', 'in_ptr6': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_8', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     min_elem_per_thread=0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_cat_index_mul_view_8(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 32768
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x1 = xindex // 4096
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x2 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp41 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp0 = x1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp1 = tl.full([1], 0, tl.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp2 = tmp0 >= tmp1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp3 = tl.full([1], 1, tl.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp4 = tmp0 < tmp3
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp5 = tl.load(in_ptr0 + (x0), tmp4, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp7 = tl.full(tmp6.shape, 0.0, tmp6.dtype)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp8 = tl.where(tmp4, tmp6, tmp7)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp9 = tmp0 >= tmp3
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp10 = tl.full([1], 2, tl.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp11 = tmp0 < tmp10
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp12 = tmp9 & tmp11
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp13 = tl.load(in_ptr1 + (x0), tmp12, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp14 = tmp13.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp15 = tl.full(tmp14.shape, 0.0, tmp14.dtype)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp16 = tl.where(tmp12, tmp14, tmp15)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp17 = tmp0 >= tmp10
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp18 = tl.full([1], 3, tl.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp19 = tmp0 < tmp18
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp20 = tmp17 & tmp19
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp21 = tl.load(in_ptr2 + (x0), tmp20, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp22 = tmp21.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp23 = tl.full(tmp22.shape, 0.0, tmp22.dtype)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp24 = tl.where(tmp20, tmp22, tmp23)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp25 = tmp0 >= tmp18
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp26 = tl.full([1], 4, tl.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp27 = tmp0 < tmp26
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp28 = tmp25 & tmp27
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp29 = tl.load(in_ptr3 + (x0), tmp28, eviction_policy='evict_last', other=0.0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp30 = tmp29.to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp31 = tl.full(tmp30.shape, 0.0, tmp30.dtype)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp32 = tl.where(tmp28, tmp30, tmp31)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp33 = tmp0 >= tmp26
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp34 = tl.full([1], 8, tl.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp35 = tmp0 < tmp34
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp36 = tl.load(in_ptr4 + (x0 + 4096*((-4) + x1)), tmp33, other=0.0).to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp37 = tl.where(tmp28, tmp32, tmp36)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp38 = tl.where(tmp20, tmp24, tmp37)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp39 = tl.where(tmp12, tmp16, tmp38)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp40 = tl.where(tmp4, tmp8, tmp39)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp42 = tl.full([XBLOCK], 8, tl.int32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp43 = tmp41 + tmp42
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp44 = tmp41 < 0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp45 = tl.where(tmp44, tmp43, tmp41)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp46 = tl.load(in_ptr6 + (tmp45), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp47 = tmp40 * tmp46
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp47, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/5v/c5vf3qxpfhadxjjk2iaok6tatmp3ma3zbvp3i55o72txamnxemhp.py
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   final_out => full_default
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   final_out_1 => scatter_add
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   getitem_32 => index_8
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   weighted_ordered_outs => mul_36
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment:
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg18_1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] #   %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_36), kwargs = {})
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_9 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_9', '''
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     filename=__file__,
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     min_elem_per_thread=0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_9(out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xnumel = 16384
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     x0 = xindex
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tmp0 = 0.0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     tl.store(out_ptr0 + (x0), tmp0, None)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda')
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] async_compile.wait(globals())
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del async_compile
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def call(args):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1 = args
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     args.clear()
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     s0 = arg6_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     s1 = arg8_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     s3 = arg10_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg1_1, (4, 4096), (4096, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg2_1, (1, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg5_1, (1, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg7_1, (1, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg9_1, (1, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg11_1, (4, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg12_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg13_1, (8, 14336), (14336, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg14_1, (8, 14336), (14336, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg15_1, (8, 4096, 14336), (58720256, 14336, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg16_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg17_1, (4, 2), (2, 1))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg18_1, (8, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     assert_size_stride(arg19_1, (8, ), (1, ))
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf0 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf1 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf3 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf4 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf9 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf10 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [linear_3, linear_4, linear_12, linear_13, linear_15, linear_16, linear_18, linear_19], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_red_fused_mm_0.run(arg2_1, arg1_1, arg12_1, arg16_1, arg5_1, arg7_1, arg9_1, buf0, buf1, buf3, buf4, buf6, buf7, buf9, buf10, 14336, 4096, grid=grid(14336), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg2_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg5_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg7_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg9_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf2 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_red_fused_mm_1.run(buf0, arg13_1, buf1, arg15_1, buf2, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf0
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf5 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_red_fused_mm_2.run(buf3, arg13_1, buf4, arg15_1, buf5, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf3
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf4
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_red_fused_mm_3.run(buf6, arg13_1, buf7, arg15_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf6
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf7
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf11 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_red_fused_mm_4.run(buf9, arg13_1, buf10, arg15_1, buf11, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf10
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf9
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf12 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [cur_x_7], Original ATen: [aten.index]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_poi_fused_index_5.run(arg11_1, arg1_1, buf12, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg11_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg1_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf13 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_poi_fused__to_copy_6.run(arg12_1, buf13, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg12_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf14 = empty_strided_cuda((4, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         extern_kernels.mm(buf12, buf13, out=buf14)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf13
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf15 = empty_strided_cuda((4, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [linear_22], Original ATen: [aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         extern_kernels.mm(buf12, reinterpret_tensor(arg16_1, (4096, 14336), (1, 4096), 411041792), out=buf15)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg16_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf16 = buf14; del buf14  # reuse
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_poi_fused_mul_silu_7.run(buf16, arg13_1, buf15, 57344, grid=grid(57344), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg13_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf15
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf17 = buf12; del buf12  # reuse
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7, cur_out_7], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         extern_kernels.mm(buf16, reinterpret_tensor(arg15_1, (14336, 4096), (1, 14336), 411041792), out=buf17)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg15_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf16
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf18 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf20 = buf18; del buf18  # reuse
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_poi_fused_cat_index_mul_view_8.run(buf20, buf2, buf5, buf8, buf11, buf17, arg18_1, arg17_1, 32768, grid=grid(32768), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg17_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg18_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf11
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf5
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf8
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         buf19 = buf17; del buf17  # reuse
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         triton_poi_fused_index_mul_scatter_add_view_zeros_like_9.run(buf19, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         aten.scatter_reduce_.two(buf19,0,reinterpret_tensor(arg19_1, (8, 4096), (1, 0), 0),buf20, reduce='sum', include_self=True)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del arg19_1
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]         del buf20
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     return (buf19, )
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg0_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg2_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg3_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg5_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg6_1 = 2
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg7_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg8_1 = 3
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg9_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg10_1 = 4
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg11_1 = rand_strided((4, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg12_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg13_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg14_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg15_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg16_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg17_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg18_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     arg19_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1])
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] if __name__ == "__main__":
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] 
 V0401 02:34:30.148000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/pf/cpfssnwtpri3hothbev44nsvvpmfegzizjh2thez24irbonmltkz.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] Output code: 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # AOT ID: ['7_inference']
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import torch
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import random
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import os
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import tempfile
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from math import inf, nan
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from cmath import nanj
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch import device, empty_strided
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     grid,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     split_scan_grid,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     grid_combo_kernels,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     start_graph,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     end_graph,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] aten = torch.ops.aten
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/47/c47usox3ylfenecsd3rpl5hb7bu7kaksuvekxjnzdtgvc77etdkx.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   cur_x => index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %index : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg2_1, [%arg1_1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 8192}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x1 = xindex // 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x2 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/oa/coajgi5zdctbara3th3yr7xwipnynkbqk3jhttvbn3efmnkmfqyf.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_x_4], Original ATen: [aten.index]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   cur_x_4 => index_4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %index_4 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg2_1, [%arg9_1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_1 = async_compile.triton('triton_poi_fused_index_1', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_index_1(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x1 = xindex // 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x2 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xt/cxtfkt422w6p7oijexohdeo6ybueo56ffm6ki2mwmywpjyghzgiu.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   final_out => full_default
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   final_out_1 => scatter_add
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   getitem_32 => index_8
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   weighted_ordered_outs => mul_77
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg21_1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_77 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_77), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_2 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_2', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_2(out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 16384
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = 0.0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp0, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ga/cgabqhaao3hxvgbvdsfjezkeg3qz23gm44ggolarwouffgf3nkvw.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear => convert_element_type
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute, torch.bfloat16), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused__to_copy_3 = async_compile.triton('triton_poi_fused__to_copy_3', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused__to_copy_3(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 58720256
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/n7/cn7or4u2r2ljcbjwejxdis6zjw7vfoadf2pemj3q4ntgk6cxrpwy.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.view, aten.silu]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear => mul_10, view_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   mul => mul_22
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   silu => convert_element_type_3, convert_element_type_4, mul_17, sigmoid
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %select_1), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %view_1 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%mul_10, [%arg0_1, 14336]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %convert_element_type_3 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_3,), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %sigmoid), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_17, torch.bfloat16), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_22 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %mm_1), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_4 = async_compile.triton('triton_poi_fused_mul_silu_view_4', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_view_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_mul_silu_view_4(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x2 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/3m/c3m7zthtl55aow3uylw7vzdvgth7ef6kt5ltrw7mdrr2etnqxobu.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12], Original ATen: [aten._to_copy]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_12 => convert_element_type_42
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %convert_element_type_42 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_12, torch.bfloat16), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused__to_copy_5 = async_compile.triton('triton_poi_fused__to_copy_5', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused__to_copy_5(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 58720256
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_ptr0 + (234881024 + x0), None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/24/c24pf2rswgtq45keb26c7ezzzrfmmnckx2dvvhk6h4hnrp57vy5k.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12, silu_4, mul_4], Original ATen: [aten.mul, aten.view, aten.silu]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_12 => mul_48, view_9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   mul_4 => mul_60
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   silu_4 => convert_element_type_45, convert_element_type_46, mul_55, sigmoid_4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_48 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_6, %select_21), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %view_9 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%mul_48, [%sym_size_int_1, 14336]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %convert_element_type_45 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_9, torch.float32), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sigmoid_4 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_45,), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_55 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_45, %sigmoid_4), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %convert_element_type_46 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_55, torch.bfloat16), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_60 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_46, %mm_7), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_6 = async_compile.triton('triton_poi_fused_mul_silu_view_6', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 65536}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_view_6', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_mul_silu_view_6(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x2 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tl.load(in_ptr0 + (57344 + x0), xmask, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/yo/cyozbv757sudj3555go56uooa4ozemydbyqiiau3krglmf7unlnn.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_6, linear_7, linear_9, linear_10, linear_18, linear_19], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_10 => mul_39, sum_5
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_18 => mul_68, sum_7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_19 => mul_71, sum_8
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_6 => mul_30, sum_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_7 => mul_33, sum_2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   linear_9 => mul_36, sum_4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_33, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_36, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_39, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_68 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_68, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_71 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_71, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_7 = async_compile.triton('triton_red_fused_mm_7', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 16384, 'r0_': 4096},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     reduction_hint=ReductionHint.DEFAULT,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 9, 'num_reduction': 6, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 14336
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_numel = 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rnumel = r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rbase = r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = tl.load(in_ptr0 + (0))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp20 = tl.load(in_ptr4 + (0))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp39 = tl.load(in_ptr5 + (0))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp40 = tl.broadcast_to(tmp39, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp50 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp56 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         roffset = r0_offset
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         rindex = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_1 = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp8 = tl.load(in_ptr2 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp14 = tl.load(in_ptr3 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp27 = tl.load(in_ptr2 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp33 = tl.load(in_ptr3 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp46 = tl.load(in_ptr2 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp52 = tl.load(in_ptr3 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp3 = tmp1 + tmp2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp4 = tmp1 < 0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp5 = tl.where(tmp4, tmp3, tmp1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp13 = _tmp12 + tmp11
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp15 = tmp14.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp16 = tmp7 * tmp15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp19 = _tmp18 + tmp17
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp22 = tmp21 + tmp2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp23 = tmp21 < 0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp24 = tl.where(tmp23, tmp22, tmp21)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp26 = tmp25.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp28 = tmp27.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp29 = tmp26 * tmp28
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp32 = _tmp31 + tmp30
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp34 = tmp33.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp35 = tmp26 * tmp34
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp38 = _tmp37 + tmp36
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp41 = tmp40 + tmp2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp42 = tmp40 < 0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp43 = tl.where(tmp42, tmp41, tmp40)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp44 = tl.load(in_ptr1 + (r0_1 + 4096*tmp43), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp45 = tmp44.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp47 = tmp46.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp48 = tmp45 * tmp47
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp49 = tl.broadcast_to(tmp48, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp51 = _tmp50 + tmp49
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp50 = tl.where(r0_mask & xmask, tmp51, _tmp50)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp53 = tmp52.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp54 = tmp45 * tmp53
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp55 = tl.broadcast_to(tmp54, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp57 = _tmp56 + tmp55
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp56 = tl.where(r0_mask & xmask, tmp57, _tmp56)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp12 = tl.sum(_tmp12, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp18 = tl.sum(_tmp18, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp31 = tl.sum(_tmp31, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp37 = tl.sum(_tmp37, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp50 = tl.sum(_tmp50, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp56 = tl.sum(_tmp56, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp12, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr1 + (x0), tmp18, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr2 + (x0), tmp31, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr3 + (x0), tmp37, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr4 + (x0), tmp50, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr5 + (x0), tmp56, xmask)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/py/cpyklkc22mkxjlh53mqbh3dd2w37zcfownerro63nvx6ixbipxmt.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   cur_out_2 => mul_35, sum_3
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_35, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_8 = async_compile.triton('triton_red_fused_mm_8', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_8(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_numel = 14336
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rnumel = r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rbase = r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         roffset = r0_offset
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         rindex = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_1 = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp2 = tl.load(in_ptr1 + (28672 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp12 = tl.load(in_ptr3 + (117440512 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/4j/c4jlady6omslbrb4jox42a2c5mvgk3ipozvtusxlzasl57vvucsk.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   cur_out_3 => mul_41, sum_6
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_41 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_41, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_9 = async_compile.triton('triton_red_fused_mm_9', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_numel = 14336
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rnumel = r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rbase = r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         roffset = r0_offset
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         rindex = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_1 = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp2 = tl.load(in_ptr1 + (43008 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp12 = tl.load(in_ptr3 + (176160768 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/2j/c2jjhi4uu44klkhr5p2jsotr2ixgc5f7m654aac4jtevdhmevht5.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   cur_out_6 => mul_73, sum_9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_73 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_16, %unsqueeze_17), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %sum_9 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_73, [1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_10 = async_compile.triton('triton_red_fused_mm_10', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_10', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_10(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_numel = 14336
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rnumel = r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     rbase = r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         roffset = r0_offset
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         rindex = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         r0_1 = r0_index
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp2 = tl.load(in_ptr1 + (86016 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp12 = tl.load(in_ptr3 + (352321536 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/vv/cvvjurpv46iafznus7yhpgxuegye2ukrl4r7m3nkcrzai6zc4wag.py
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   getitem_32 => index_8
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   ordered_outs => cat
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   weighted_ordered_outs => mul_77
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment:
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%mm_2, %convert_element_type_29, %convert_element_type_41, %mm_8, %convert_element_type_71],), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg21_1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] #   %mul_77 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_cat_index_mul_view_11 = async_compile.triton('triton_poi_fused_cat_index_mul_view_11', '''
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     filename=__file__,
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*i64', 'in_ptr6': '*bf16', 'ks0': 'i32', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 9), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_11', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     min_elem_per_thread=0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_cat_index_mul_view_11(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, ks0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xnumel = 32768
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x1 = xindex // 4096
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     x2 = xindex
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp38 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp0 = x1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp1 = tl.full([1], 0, tl.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp2 = tmp0 >= tmp1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp3 = ks0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp4 = tmp0 < tmp3
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp5 = tl.load(in_ptr0 + (x0 + 4096*(x1)), tmp4, other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp6 = tmp0 >= tmp3
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp7 = 1 + ks0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp8 = tmp0 < tmp7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp9 = tmp6 & tmp8
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp10 = tl.load(in_ptr1 + (x0), tmp9, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp11 = tmp10.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp12 = tl.full(tmp11.shape, 0.0, tmp11.dtype)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp13 = tl.where(tmp9, tmp11, tmp12)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp14 = tmp0 >= tmp7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp15 = 2 + ks0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp16 = tmp0 < tmp15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp17 = tmp14 & tmp16
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp18 = tl.load(in_ptr2 + (x0), tmp17, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp19 = tmp18.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp20 = tl.full(tmp19.shape, 0.0, tmp19.dtype)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp21 = tl.where(tmp17, tmp19, tmp20)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp22 = tmp0 >= tmp15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp23 = tl.full([1], 7, tl.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp24 = tmp0 < tmp23
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp25 = tmp22 & tmp24
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp26 = tl.load(in_ptr3 + (x0 + 4096*((-2) + x1 + ((-1)*ks0))), tmp25, other=0.0).to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp27 = tmp0 >= tmp23
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp28 = tl.full([1], 8, tl.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp29 = tmp0 < tmp28
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp30 = tl.load(in_ptr4 + (x0), tmp27, eviction_policy='evict_last', other=0.0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp31 = tmp30.to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp32 = tl.full(tmp31.shape, 0.0, tmp31.dtype)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp33 = tl.where(tmp27, tmp31, tmp32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp34 = tl.where(tmp25, tmp26, tmp33)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp35 = tl.where(tmp17, tmp21, tmp34)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp36 = tl.where(tmp9, tmp13, tmp35)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp37 = tl.where(tmp4, tmp5, tmp36)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp39 = tl.full([XBLOCK], 8, tl.int32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp40 = tmp38 + tmp39
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp41 = tmp38 < 0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp42 = tl.where(tmp41, tmp40, tmp38)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp43 = tl.load(in_ptr6 + (tmp42), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tmp44 = tmp37 * tmp43
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp44, None)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda')
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] async_compile.wait(globals())
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del async_compile
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def call(args):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1 = args
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     args.clear()
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     s0 = arg0_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     s1 = arg3_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     s2 = arg6_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     s4 = arg8_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     s5 = arg10_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     s6 = arg13_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg1_1, (s0, ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg2_1, (4, 4096), (4096, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg5_1, (1, ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg7_1, (1, ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg9_1, (5 + ((-1)*s0), ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg12_1, (1, ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg15_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg16_1, (8, 14336), (14336, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg17_1, (8, 14336), (14336, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg18_1, (8, 4096, 14336), (58720256, 14336, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg19_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg20_1, (4, 2), (2, 1))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg21_1, (8, ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     assert_size_stride(arg22_1, (8, ), (1, ))
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf0 = empty_strided_cuda((s0, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_index_0_xnumel = 4096*s0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_index_0.run(arg1_1, arg2_1, buf0, triton_poi_fused_index_0_xnumel, grid=grid(triton_poi_fused_index_0_xnumel), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg1_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf12 = empty_strided_cuda((5 + ((-1)*s0), 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [cur_x_4], Original ATen: [aten.index]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_index_1_xnumel = 20480 + ((-4096)*s0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_index_1.run(arg9_1, arg2_1, buf12, triton_poi_fused_index_1_xnumel, grid=grid(triton_poi_fused_index_1_xnumel), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg9_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf22 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_index_mul_scatter_add_view_zeros_like_2.run(buf22, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf3 = empty_strided_cuda((s0, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         extern_kernels.mm(buf0, reinterpret_tensor(arg19_1, (4096, 14336), (1, 4096), 0), out=buf3)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf1 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused__to_copy_3.run(arg15_1, buf1, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf2 = empty_strided_cuda((s0, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         extern_kernels.mm(buf0, buf1, out=buf2)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf4 = buf2; del buf2  # reuse
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.view, aten.silu]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_mul_silu_view_4_xnumel = 14336*s0
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_mul_silu_view_4.run(buf4, arg16_1, buf3, triton_poi_fused_mul_silu_view_4_xnumel, grid=grid(triton_poi_fused_mul_silu_view_4_xnumel), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf3
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf5 = buf0; del buf0  # reuse
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear, silu, mul, cur_out], Original ATen: [aten.mul, aten.view, aten.silu, aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         extern_kernels.mm(buf4, reinterpret_tensor(arg18_1, (14336, 4096), (1, 14336), 0), out=buf5)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf15 = empty_strided_cuda((5 + ((-1)*s0), 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_13], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         extern_kernels.mm(buf12, reinterpret_tensor(arg19_1, (4096, 14336), (1, 4096), 234881024), out=buf15)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf13 = buf1; del buf1  # reuse
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_12], Original ATen: [aten._to_copy]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused__to_copy_5.run(arg15_1, buf13, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf14 = empty_strided_cuda((5 + ((-1)*s0), 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_12], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         extern_kernels.mm(buf12, buf13, out=buf14)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf13
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf16 = buf14; del buf14  # reuse
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_12, silu_4, mul_4], Original ATen: [aten.mul, aten.view, aten.silu]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_mul_silu_view_6_xnumel = 71680 + ((-14336)*s0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_mul_silu_view_6.run(buf16, arg16_1, buf15, triton_poi_fused_mul_silu_view_6_xnumel, grid=grid(triton_poi_fused_mul_silu_view_6_xnumel), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf15
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf17 = buf12; del buf12  # reuse
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_12, silu_4, mul_4, cur_out_4], Original ATen: [aten.mul, aten.view, aten.silu, aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         extern_kernels.mm(buf16, reinterpret_tensor(arg18_1, (14336, 4096), (1, 14336), 234881024), out=buf17)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf16
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf9 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf10 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf18 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf19 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [linear_6, linear_7, linear_9, linear_10, linear_18, linear_19], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_red_fused_mm_7.run(arg5_1, arg2_1, arg15_1, arg19_1, arg7_1, arg12_1, buf6, buf7, buf9, buf10, buf18, buf19, 14336, 4096, grid=grid(14336), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg12_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg15_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg19_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg2_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg5_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg7_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_red_fused_mm_8.run(buf6, arg16_1, buf7, arg18_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf6
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf11 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_red_fused_mm_9.run(buf9, arg16_1, buf10, arg18_1, buf11, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf10
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf9
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf20 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_red_fused_mm_10.run(buf18, arg16_1, buf19, arg18_1, buf20, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg16_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg18_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf18
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf19
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf21 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         buf23 = buf21; del buf21  # reuse
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         triton_poi_fused_cat_index_mul_view_11.run(buf23, buf5, buf8, buf11, buf17, buf20, arg21_1, arg20_1, s0, 32768, grid=grid(32768), stream=stream0)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg20_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg21_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf11
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf17
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf20
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf5
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf8
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         aten.scatter_reduce_.two(buf22,0,reinterpret_tensor(arg22_1, (8, 4096), (1, 0), 0),buf23, reduce='sum', include_self=True)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del arg22_1
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]         del buf23
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     return (buf22, )
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg0_1 = 2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg1_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg2_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg3_1 = 2
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg5_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg6_1 = 3
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg7_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg8_1 = 4
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg9_1 = rand_strided((3, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg10_1 = 7
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg11_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg12_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg13_1 = 8
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg14_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg15_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg16_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg17_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg18_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg19_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg20_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg21_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     arg22_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1])
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] if __name__ == "__main__":
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] 
 V0401 02:34:31.726000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/2] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/hs/chsirooun7kydovvjk4oinw35bsqbrcbopw4tqtfnvy6v3ejoj7c.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] Output code: 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # AOT ID: ['8_inference']
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import torch
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import random
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import os
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import tempfile
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from math import inf, nan
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from cmath import nanj
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch import device, empty_strided
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     grid,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     split_scan_grid,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     grid_combo_kernels,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     start_graph,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     end_graph,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] aten = torch.ops.aten
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/fs/cfshdmnxmjiadh6stjyoymktb3rxxlyv6fwu5jxfh3dk5sm6riaz.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_x_5], Original ATen: [aten.index]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   cur_x_5 => index_5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %index_5 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg8_1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     min_elem_per_thread=0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 12288
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x1 = xindex // 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x2 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/pp/cppt43372rvepxmwsrrmtoqxmhn7h57p7lpbofdbz5qy6rfpsuqv.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   final_out => full_default
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   final_out_1 => scatter_add
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   getitem_32 => index_8
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   weighted_ordered_outs => mul_39
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg19_1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_39), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_1 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     min_elem_per_thread=0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_1(out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 16384
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp0 = 0.0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp0, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ni/cnizkrlflmp6ljf6woclsajqj7kqw27zspbpp5ttw66c5ftx2avg.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15], Original ATen: [aten._to_copy]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_15 => convert_element_type_54
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %convert_element_type_54 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_15, torch.bfloat16), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused__to_copy_2 = async_compile.triton('triton_poi_fused__to_copy_2', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     min_elem_per_thread=0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused__to_copy_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 58720256
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp0 = tl.load(in_ptr0 + (293601280 + x0), None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/44/c44yb5mjd3q5nyxs2qd2wdzheklkfzgjouugy7d5wzerdsbdvo75.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15, silu_5, mul_5], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_15 => mul_24
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   mul_5 => mul_26
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   silu_5 => convert_element_type_57, convert_element_type_58, mul_25, sigmoid_5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_6, %select_26), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %convert_element_type_57 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_24, torch.float32), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sigmoid_5 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_57,), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_25 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_57, %sigmoid_5), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %convert_element_type_58 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_25, torch.bfloat16), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_58, %mm_7), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_mul_silu_3 = async_compile.triton('triton_poi_fused_mul_silu_3', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 65536}, 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     min_elem_per_thread=0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_mul_silu_3(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 43008
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x2 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp1 = tl.load(in_ptr0 + (71680 + x0), xmask, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/oz/cozllcw53qa3x32uidkhkhpsdwytxdwkjt76sc6ebejd7rj2ey4k.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear, linear_1, linear_3, linear_4, linear_9, linear_10, linear_18, linear_19, linear_21, linear_22], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear => mul, sum_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_1 => mul_3, sum_2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_10 => mul_18, sum_8
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_18 => mul_27, sum_10
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_19 => mul_30, sum_11
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_21 => mul_33, sum_13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_22 => mul_36, sum_14
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_3 => mul_6, sum_4
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_4 => mul_9, sum_5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   linear_9 => mul_15, sum_7
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_9, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_18, %unsqueeze_19), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_10 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_27, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_20, %unsqueeze_21), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_11 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_24, %unsqueeze_25), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_13 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_33, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_26, %unsqueeze_27), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_14 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_36, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_4 = async_compile.triton('triton_red_fused_mm_4', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 16384, 'r0_': 4096},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     reduction_hint=ReductionHint.DEFAULT,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'in_ptr6': '*i64', 'in_ptr7': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'out_ptr6': '*fp32', 'out_ptr7': '*fp32', 'out_ptr8': '*fp32', 'out_ptr9': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 15, 'num_reduction': 10, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_4(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, out_ptr6, out_ptr7, out_ptr8, out_ptr9, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 14336
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_numel = 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rnumel = r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rbase = r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp0 = tl.load(in_ptr0 + (0))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp20 = tl.load(in_ptr4 + (0))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp39 = tl.load(in_ptr5 + (0))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp40 = tl.broadcast_to(tmp39, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp50 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp56 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp58 = tl.load(in_ptr6 + (0))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp59 = tl.broadcast_to(tmp58, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp69 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp75 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp77 = tl.load(in_ptr7 + (0))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp78 = tl.broadcast_to(tmp77, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp88 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp94 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         roffset = r0_offset
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         rindex = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_1 = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp14 = tl.load(in_ptr3 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp27 = tl.load(in_ptr2 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp33 = tl.load(in_ptr3 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp46 = tl.load(in_ptr2 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp52 = tl.load(in_ptr3 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp65 = tl.load(in_ptr2 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp71 = tl.load(in_ptr3 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp84 = tl.load(in_ptr2 + (411041792 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp90 = tl.load(in_ptr3 + (411041792 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp3 = tmp1 + tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp4 = tmp1 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp5 = tl.where(tmp4, tmp3, tmp1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp13 = _tmp12 + tmp11
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp15 = tmp14.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp16 = tmp7 * tmp15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp19 = _tmp18 + tmp17
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp22 = tmp21 + tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp23 = tmp21 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp24 = tl.where(tmp23, tmp22, tmp21)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp26 = tmp25.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp28 = tmp27.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp29 = tmp26 * tmp28
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp32 = _tmp31 + tmp30
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp34 = tmp33.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp35 = tmp26 * tmp34
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp38 = _tmp37 + tmp36
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp41 = tmp40 + tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp42 = tmp40 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp43 = tl.where(tmp42, tmp41, tmp40)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp44 = tl.load(in_ptr1 + (r0_1 + 4096*tmp43), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp45 = tmp44.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp47 = tmp46.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp48 = tmp45 * tmp47
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp49 = tl.broadcast_to(tmp48, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp51 = _tmp50 + tmp49
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp50 = tl.where(r0_mask & xmask, tmp51, _tmp50)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp53 = tmp52.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp54 = tmp45 * tmp53
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp55 = tl.broadcast_to(tmp54, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp57 = _tmp56 + tmp55
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp56 = tl.where(r0_mask & xmask, tmp57, _tmp56)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp60 = tmp59 + tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp61 = tmp59 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp62 = tl.where(tmp61, tmp60, tmp59)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp63 = tl.load(in_ptr1 + (r0_1 + 4096*tmp62), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp64 = tmp63.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp66 = tmp65.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp67 = tmp64 * tmp66
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp68 = tl.broadcast_to(tmp67, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp70 = _tmp69 + tmp68
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp69 = tl.where(r0_mask & xmask, tmp70, _tmp69)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp72 = tmp71.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp73 = tmp64 * tmp72
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp74 = tl.broadcast_to(tmp73, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp76 = _tmp75 + tmp74
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp75 = tl.where(r0_mask & xmask, tmp76, _tmp75)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp79 = tmp78 + tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp80 = tmp78 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp81 = tl.where(tmp80, tmp79, tmp78)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp82 = tl.load(in_ptr1 + (r0_1 + 4096*tmp81), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp83 = tmp82.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp85 = tmp84.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp86 = tmp83 * tmp85
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp87 = tl.broadcast_to(tmp86, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp89 = _tmp88 + tmp87
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp88 = tl.where(r0_mask & xmask, tmp89, _tmp88)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp91 = tmp90.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp92 = tmp83 * tmp91
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp93 = tl.broadcast_to(tmp92, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp95 = _tmp94 + tmp93
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp94 = tl.where(r0_mask & xmask, tmp95, _tmp94)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp12 = tl.sum(_tmp12, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp18 = tl.sum(_tmp18, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp31 = tl.sum(_tmp31, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp37 = tl.sum(_tmp37, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp50 = tl.sum(_tmp50, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp56 = tl.sum(_tmp56, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp69 = tl.sum(_tmp69, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp75 = tl.sum(_tmp75, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp88 = tl.sum(_tmp88, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp94 = tl.sum(_tmp94, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp12, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr1 + (x0), tmp18, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr2 + (x0), tmp31, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr3 + (x0), tmp37, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr4 + (x0), tmp50, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr5 + (x0), tmp56, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr6 + (x0), tmp69, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr7 + (x0), tmp75, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr8 + (x0), tmp88, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr9 + (x0), tmp94, xmask)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xx/cxx6xntbnboc2g6tdf7vyxn4y73k7jimowg7zyfn5bowklvzwsdg.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   cur_out => mul_5, sum_3
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_5, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_5 = async_compile.triton('triton_red_fused_mm_5', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_numel = 14336
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rnumel = r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rbase = r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         roffset = r0_offset
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         rindex = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_1 = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp2 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp12 = tl.load(in_ptr3 + (r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ce/cceviquawbkgm2efk7w2ykpozllrbcae74m5k44pyzscgiiogapd.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   cur_out_1 => mul_11, sum_6
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_11, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_6 = async_compile.triton('triton_red_fused_mm_6', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_numel = 14336
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rnumel = r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rbase = r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         roffset = r0_offset
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         rindex = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_1 = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp2 = tl.load(in_ptr1 + (14336 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp12 = tl.load(in_ptr3 + (58720256 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/if/ciffkzjjwbz6mpsov6naxwgkjnoztxaxx5mlr5kykxugcuwjhqlr.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   cur_out_3 => mul_20, sum_9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_20 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_16, %unsqueeze_17), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_9 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_20, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_7 = async_compile.triton('triton_red_fused_mm_7', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_numel = 14336
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rnumel = r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rbase = r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         roffset = r0_offset
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         rindex = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_1 = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp2 = tl.load(in_ptr1 + (43008 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp12 = tl.load(in_ptr3 + (176160768 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/6g/c6gqgrsqygxdbqf2zsinu6b25wiico7gtm6kqyie56rcbdbsj6yw.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   cur_out_6 => mul_32, sum_12
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_32 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_22, %unsqueeze_23), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_12 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_32, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_8 = async_compile.triton('triton_red_fused_mm_8', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_8(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_numel = 14336
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rnumel = r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rbase = r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         roffset = r0_offset
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         rindex = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_1 = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp2 = tl.load(in_ptr1 + (86016 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp12 = tl.load(in_ptr3 + (352321536 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xo/cxovjih66bogmdzezh2m2vplwymlvkd4lxpgf4x5lf3unr3wvgxz.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_7], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   cur_out_7 => mul_38, sum_15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_28, %unsqueeze_29), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %sum_15 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_38, [1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_9 = async_compile.triton('triton_red_fused_mm_9', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_numel = 14336
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rnumel = r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     rbase = r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         roffset = r0_offset
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         rindex = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         r0_1 = r0_index
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp2 = tl.load(in_ptr1 + (100352 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp12 = tl.load(in_ptr3 + (411041792 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/p2/cp2s4xmrbzkqwz3syvvx3kjmd4wcxohqk2vqrchxtwqm2tv2vavp.py
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   getitem_32 => index_8
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   ordered_outs => cat
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   weighted_ordered_outs => mul_39
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment:
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_11, %convert_element_type_23, %convert_element_type_44, %mm_8, %convert_element_type_74, %convert_element_type_86],), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg19_1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] #   %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_cat_index_mul_view_10 = async_compile.triton('triton_poi_fused_cat_index_mul_view_10', '''
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     filename=__file__,
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*i64', 'in_ptr7': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     min_elem_per_thread=0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_cat_index_mul_view_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xnumel = 32768
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x1 = xindex // 4096
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     x2 = xindex
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp50 = tl.load(in_ptr6 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp0 = x1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp1 = tl.full([1], 0, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp2 = tmp0 >= tmp1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp3 = tl.full([1], 1, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp4 = tmp0 < tmp3
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp5 = tl.load(in_ptr0 + (x0), tmp4, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp7 = tl.full(tmp6.shape, 0.0, tmp6.dtype)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp8 = tl.where(tmp4, tmp6, tmp7)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp9 = tmp0 >= tmp3
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp10 = tl.full([1], 2, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp11 = tmp0 < tmp10
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp12 = tmp9 & tmp11
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp13 = tl.load(in_ptr1 + (x0), tmp12, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp14 = tmp13.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp15 = tl.full(tmp14.shape, 0.0, tmp14.dtype)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp16 = tl.where(tmp12, tmp14, tmp15)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp17 = tmp0 >= tmp10
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp18 = tl.full([1], 3, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp19 = tmp0 < tmp18
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp20 = tmp17 & tmp19
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp21 = tl.load(in_ptr2 + (x0), tmp20, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp22 = tmp21.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp23 = tl.full(tmp22.shape, 0.0, tmp22.dtype)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp24 = tl.where(tmp20, tmp22, tmp23)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp25 = tmp0 >= tmp18
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp26 = tl.full([1], 6, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp27 = tmp0 < tmp26
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp28 = tmp25 & tmp27
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp29 = tl.load(in_ptr3 + (x0 + 4096*((-3) + x1)), tmp28, other=0.0).to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp30 = tmp0 >= tmp26
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp31 = tl.full([1], 7, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp32 = tmp0 < tmp31
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp33 = tmp30 & tmp32
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp34 = tl.load(in_ptr4 + (x0), tmp33, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp35 = tmp34.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp37 = tl.where(tmp33, tmp35, tmp36)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp38 = tmp0 >= tmp31
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp39 = tl.full([1], 8, tl.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp40 = tmp0 < tmp39
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp41 = tl.load(in_ptr5 + (x0), tmp38, eviction_policy='evict_last', other=0.0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp42 = tmp41.to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp43 = tl.full(tmp42.shape, 0.0, tmp42.dtype)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp44 = tl.where(tmp38, tmp42, tmp43)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp45 = tl.where(tmp33, tmp37, tmp44)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp46 = tl.where(tmp28, tmp29, tmp45)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp47 = tl.where(tmp20, tmp24, tmp46)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp48 = tl.where(tmp12, tmp16, tmp47)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp49 = tl.where(tmp4, tmp8, tmp48)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp51 = tl.full([XBLOCK], 8, tl.int32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp52 = tmp50 + tmp51
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp53 = tmp50 < 0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp54 = tl.where(tmp53, tmp52, tmp50)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp55 = tl.load(in_ptr7 + (tmp54), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tmp56 = tmp49 * tmp55
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp56, None)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda')
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] async_compile.wait(globals())
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del async_compile
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def call(args):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1 = args
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     args.clear()
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     s0 = arg3_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     s1 = arg6_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     s3 = arg9_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     s4 = arg11_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg0_1, (1, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg1_1, (4, 4096), (4096, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg2_1, (1, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg5_1, (1, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg8_1, (3, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg10_1, (1, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg12_1, (1, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg13_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg14_1, (8, 14336), (14336, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg15_1, (8, 14336), (14336, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg16_1, (8, 4096, 14336), (58720256, 14336, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg17_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg18_1, (4, 2), (2, 1))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg19_1, (8, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     assert_size_stride(arg20_1, (8, ), (1, ))
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf9 = empty_strided_cuda((3, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [cur_x_5], Original ATen: [aten.index]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_poi_fused_index_0.run(arg8_1, arg1_1, buf9, 12288, grid=grid(12288), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg8_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf22 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_poi_fused_index_mul_scatter_add_view_zeros_like_1.run(buf22, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf12 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [linear_16], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         extern_kernels.mm(buf9, reinterpret_tensor(arg17_1, (4096, 14336), (1, 4096), 293601280), out=buf12)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf10 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [linear_15], Original ATen: [aten._to_copy]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_poi_fused__to_copy_2.run(arg13_1, buf10, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf11 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [linear_15], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         extern_kernels.mm(buf9, buf10, out=buf11)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf10
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf13 = buf11; del buf11  # reuse
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [linear_15, silu_5, mul_5], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_poi_fused_mul_silu_3.run(buf13, arg14_1, buf12, 43008, grid=grid(43008), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf12
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf14 = buf9; del buf9  # reuse
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [linear_15, silu_5, mul_5, cur_out_5], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         extern_kernels.mm(buf13, reinterpret_tensor(arg16_1, (14336, 4096), (1, 14336), 293601280), out=buf14)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf13
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf0 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf1 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf3 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf4 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf15 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf16 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf18 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf19 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [linear, linear_1, linear_3, linear_4, linear_9, linear_10, linear_18, linear_19, linear_21, linear_22], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_red_fused_mm_4.run(arg0_1, arg1_1, arg13_1, arg17_1, arg2_1, arg5_1, arg10_1, arg12_1, buf0, buf1, buf3, buf4, buf6, buf7, buf15, buf16, buf18, buf19, 14336, 4096, grid=grid(14336), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg0_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg10_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg12_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg13_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg17_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg1_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg2_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg5_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf2 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [cur_out], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_red_fused_mm_5.run(buf0, arg14_1, buf1, arg16_1, buf2, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf0
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf5 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_red_fused_mm_6.run(buf3, arg14_1, buf4, arg16_1, buf5, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf3
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf4
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_red_fused_mm_7.run(buf6, arg14_1, buf7, arg16_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf6
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf7
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf17 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_red_fused_mm_8.run(buf15, arg14_1, buf16, arg16_1, buf17, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf15
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf16
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf20 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_7], Original ATen: [aten.mm]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_red_fused_mm_9.run(buf18, arg14_1, buf19, arg16_1, buf20, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg14_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg16_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf18
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf19
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf21 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         buf23 = buf21; del buf21  # reuse
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         triton_poi_fused_cat_index_mul_view_10.run(buf23, buf2, buf5, buf8, buf14, buf17, buf20, arg19_1, arg18_1, 32768, grid=grid(32768), stream=stream0)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg18_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg19_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf14
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf17
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf20
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf5
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf8
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         aten.scatter_reduce_.two(buf22,0,reinterpret_tensor(arg20_1, (8, 4096), (1, 0), 0),buf23, reduce='sum', include_self=True)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del arg20_1
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]         del buf23
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     return (buf22, )
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg0_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg2_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg3_1 = 2
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg5_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg6_1 = 3
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg7_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg8_1 = rand_strided((3, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg9_1 = 6
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg10_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg11_1 = 7
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg12_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg13_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg14_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg15_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg16_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg17_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg18_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg19_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     arg20_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1])
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] if __name__ == "__main__":
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] 
 V0401 02:34:33.849000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/3] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/nr/cnr4atw5ay43j2qpni2f22cvzrd6lanjrovddcclybuolyunkawq.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] Output code: 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # AOT ID: ['9_inference']
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import torch
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import random
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import os
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import tempfile
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from math import inf, nan
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from cmath import nanj
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch import device, empty_strided
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     grid,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     split_scan_grid,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     grid_combo_kernels,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     start_graph,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     end_graph,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] aten = torch.ops.aten
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/pn/cpnadhcpx7pdda4wy2zq5f4yksvs57uy47uewrx3fzhgcm4mahjg.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_x, cur_x_3, cur_x_6], Original ATen: [aten.index]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   cur_x => index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   cur_x_3 => index_3
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   cur_x_6 => index_6
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %index : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg0_1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %index_3 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg5_1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %index_6 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg11_1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 8192}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i64', 'in_ptr3': '*i64', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 8192
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x1 = xindex // 4096
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x2 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp6 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp11 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp7 = tmp6 + tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp8 = tmp6 < 0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp9 = tl.where(tmp8, tmp7, tmp6)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp10 = tl.load(in_ptr1 + (x0 + 4096*tmp9), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp12 = tmp11 + tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp13 = tmp11 < 0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp14 = tl.where(tmp13, tmp12, tmp11)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp15 = tl.load(in_ptr1 + (x0 + 4096*tmp14), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr1 + (x2), tmp10, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr2 + (x2), tmp15, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/yj/cyj7p7xqlcmuj23z3eb2yyzznvkflquhslgxtbejde24is7cpycm.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear => convert_element_type
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute, torch.bfloat16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_1 = async_compile.triton('triton_poi_fused__to_copy_1', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 58720256
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_ptr0 + (x0), None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nw/cnw5bk2sece6miamzj2agxakkl5ligtllxfirol7rltvneibcdz4.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear => mul
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   mul => mul_2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   silu => convert_element_type_3, convert_element_type_4, mul_1, sigmoid
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %select_1), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_3 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.float32), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_3,), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %sigmoid), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %mm_1), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_2 = async_compile.triton('triton_poi_fused_mul_silu_2', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_mul_silu_2(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 28672
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x2 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/mt/cmt5cljv4hutbqlrk2uagwbmi44fwurqzl2lfowdyjpc5htma74j.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_9 => convert_element_type_27
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_27 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_9, torch.bfloat16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_3 = async_compile.triton('triton_poi_fused__to_copy_3', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused__to_copy_3(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 58720256
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_ptr0 + (176160768 + x0), None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nt/cntmomp6ushnh66gq7yjlozybhgulzvmqqe3dprdzlhbsgt3obhr.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_9 => mul_9
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   mul_3 => mul_11
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   silu_3 => convert_element_type_30, convert_element_type_31, mul_10, sigmoid_3
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_9, %select_16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_30 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_9, torch.float32), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sigmoid_3 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_30,), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_30, %sigmoid_3), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_31 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_10, torch.bfloat16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_31, %mm_10), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_4 = async_compile.triton('triton_poi_fused_mul_silu_4', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_mul_silu_4(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 28672
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x2 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tl.load(in_ptr0 + (43008 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/gt/cgtnl56winvb2ne3rhqswbwokqaqdyux7zrtfohk57cyufcqfack.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_12, linear_13, linear_15, linear_16], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_12 => mul_12, sum_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_13 => mul_15, sum_2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_15 => mul_18, sum_4
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_16 => mul_21, sum_5
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_12, [1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_21, [1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_5 = async_compile.triton('triton_red_fused_mm_5', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 16384, 'r0_': 4096},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     reduction_hint=ReductionHint.DEFAULT,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3, 5, 6, 7, 8, 9, 10), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 4, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_red_fused_mm_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 14336
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     r0_numel = 4096
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     rnumel = r0_numel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     rbase = r0_base
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_ptr0 + (0))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp20 = tl.load(in_ptr4 + (0))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         roffset = r0_offset
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         rindex = r0_index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_1 = r0_index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp8 = tl.load(in_ptr2 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp14 = tl.load(in_ptr3 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp27 = tl.load(in_ptr2 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp33 = tl.load(in_ptr3 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp3 = tmp1 + tmp2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp4 = tmp1 < 0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp5 = tl.where(tmp4, tmp3, tmp1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp13 = _tmp12 + tmp11
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp15 = tmp14.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp16 = tmp7 * tmp15
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp19 = _tmp18 + tmp17
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp22 = tmp21 + tmp2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp23 = tmp21 < 0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp24 = tl.where(tmp23, tmp22, tmp21)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp26 = tmp25.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp28 = tmp27.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp29 = tmp26 * tmp28
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp32 = _tmp31 + tmp30
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp34 = tmp33.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp35 = tmp26 * tmp34
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp38 = _tmp37 + tmp36
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp12 = tl.sum(_tmp12, 1)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp18 = tl.sum(_tmp18, 1)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp31 = tl.sum(_tmp31, 1)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp37 = tl.sum(_tmp37, 1)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp12, xmask)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr1 + (x0), tmp18, xmask)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr2 + (x0), tmp31, xmask)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr3 + (x0), tmp37, xmask)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/vu/cvu76bra7dtrosp7wkj6uqmxw5ruguxxzxrxgyprzkq7swh77jxq.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   cur_out_4 => mul_17, sum_3
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_17, [1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_6 = async_compile.triton('triton_red_fused_mm_6', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_red_fused_mm_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 4096
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     r0_numel = 14336
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     rnumel = r0_numel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     rbase = r0_base
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         roffset = r0_offset
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         rindex = r0_index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_1 = r0_index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp2 = tl.load(in_ptr1 + (57344 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp12 = tl.load(in_ptr3 + (234881024 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/kz/ckzpveqxn3ptfdcdbnp5jn46pozxmhrb2eqsz4bslvr2muyyuc4q.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   cur_out_5 => mul_23, sum_6
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_23, [1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_7 = async_compile.triton('triton_red_fused_mm_7', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 4096, 'r0_': 16384},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     reduction_hint=ReductionHint.INNER,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_red_fused_mm_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 4096
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     r0_numel = 14336
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     rnumel = r0_numel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     rbase = r0_base
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     for r0_offset in range(0, r0_numel, R0_BLOCK):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_index = r0_offset + r0_base
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_mask = r0_index < r0_numel
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         roffset = r0_offset
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         rindex = r0_index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         r0_1 = r0_index
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp2 = tl.load(in_ptr1 + (71680 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp12 = tl.load(in_ptr3 + (293601280 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp3 = tmp1 * tmp2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp4 = tmp3.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp5 = tl.sigmoid(tmp4)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp6 = tmp4 * tmp5
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp7 = tmp6.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp9 = tmp8.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp10 = tmp7 * tmp9
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp11 = tmp10.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp13 = tmp12.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp14 = tmp11 * tmp13
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         tmp17 = _tmp16 + tmp15
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         _tmp16 = tl.where(r0_mask, tmp17, _tmp16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp16 = tl.sum(_tmp16, 1)[:, None]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp16, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/bn/cbntbuf2uowesjcvj7pijik2jtcococuwohpnparrrikin5bb3is.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_18 => convert_element_type_60
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_60 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_18, torch.bfloat16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_8 = async_compile.triton('triton_poi_fused__to_copy_8', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 58720256
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_ptr0 + (352321536 + x0), None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xk/cxkxa4iqtzpndenpxumel4jvuld773zjmzkir4txf2hm5n7vrya4.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   linear_18 => mul_24
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   mul_6 => mul_26
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   silu_6 => convert_element_type_63, convert_element_type_64, mul_25, sigmoid_6
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_12, %select_31), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_63 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_24, torch.float32), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %sigmoid_6 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_63,), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_25 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_63, %sigmoid_6), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %convert_element_type_64 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_25, torch.bfloat16), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_64, %mm_13), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_9 = async_compile.triton('triton_poi_fused_mul_silu_9', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_mul_silu_9(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 28672
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x2 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tl.load(in_ptr0 + (86016 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/cp/ccp3stqbhb5f53fzompeeislgwlwcrpug5xm766de66u4ehpn2ml.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   getitem_32 => index_8
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   ordered_outs => cat
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   weighted_ordered_outs => mul_30
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%mm_2, %mm_11, %convert_element_type_47, %convert_element_type_59, %mm_14],), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg20_1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_cat_index_mul_view_10 = async_compile.triton('triton_poi_fused_cat_index_mul_view_10', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'in_ptr5': '*i64', 'in_ptr6': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_cat_index_mul_view_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 32768
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x1 = xindex // 4096
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x2 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp35 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = x1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp1 = tl.full([1], 0, tl.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp2 = tmp0 >= tmp1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp3 = tl.full([1], 2, tl.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp4 = tmp0 < tmp3
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp5 = tl.load(in_ptr0 + (x0 + 4096*(x1)), tmp4, other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp6 = tmp0 >= tmp3
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp7 = tl.full([1], 4, tl.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp8 = tmp0 < tmp7
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp9 = tmp6 & tmp8
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp10 = tl.load(in_ptr1 + (x0 + 4096*((-2) + x1)), tmp9, other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp11 = tmp0 >= tmp7
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp12 = tl.full([1], 5, tl.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp13 = tmp0 < tmp12
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp14 = tmp11 & tmp13
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp15 = tl.load(in_ptr2 + (x0), tmp14, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp16 = tmp15.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp17 = tl.full(tmp16.shape, 0.0, tmp16.dtype)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp18 = tl.where(tmp14, tmp16, tmp17)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp19 = tmp0 >= tmp12
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp20 = tl.full([1], 6, tl.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp21 = tmp0 < tmp20
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp22 = tmp19 & tmp21
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp23 = tl.load(in_ptr3 + (x0), tmp22, eviction_policy='evict_last', other=0.0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp24 = tmp23.to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp25 = tl.full(tmp24.shape, 0.0, tmp24.dtype)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp26 = tl.where(tmp22, tmp24, tmp25)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp27 = tmp0 >= tmp20
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp28 = tl.full([1], 8, tl.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp29 = tmp0 < tmp28
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp30 = tl.load(in_ptr4 + (x0 + 4096*((-6) + x1)), tmp27, other=0.0).to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp31 = tl.where(tmp22, tmp26, tmp30)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp32 = tl.where(tmp14, tmp18, tmp31)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp33 = tl.where(tmp9, tmp10, tmp32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp34 = tl.where(tmp4, tmp5, tmp33)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp36 = tl.full([XBLOCK], 8, tl.int32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp37 = tmp35 + tmp36
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp38 = tmp35 < 0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp39 = tl.where(tmp38, tmp37, tmp35)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp40 = tl.load(in_ptr6 + (tmp39), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp41 = tmp34 * tmp40
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp41, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/4l/c4lbtol734c43z6bxtljtt4jjq4qsrbkiyp5o525eddf3iovsttp.py
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   final_out => full_default
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   final_out_1 => scatter_add
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   getitem_32 => index_8
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   weighted_ordered_outs => mul_30
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment:
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg20_1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] #   %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_30), kwargs = {})
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_11 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_11', '''
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     filename=__file__,
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_11', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_11(out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xnumel = 16384
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     x0 = xindex
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tmp0 = 0.0
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     tl.store(out_ptr0 + (x0), tmp0, None)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] async_compile.wait(globals())
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del async_compile
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def call(args):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1 = args
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     args.clear()
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     s1 = arg2_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     s3 = arg6_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     s4 = arg8_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     s6 = arg10_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     s7 = arg12_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg0_1, (2, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg1_1, (4, 4096), (4096, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg5_1, (2, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg7_1, (1, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg9_1, (1, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg11_1, (2, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg14_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg15_1, (8, 14336), (14336, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg16_1, (8, 14336), (14336, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg17_1, (8, 4096, 14336), (58720256, 14336, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg18_1, (8, 14336, 4096), (58720256, 4096, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg19_1, (4, 2), (2, 1))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg20_1, (8, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     assert_size_stride(arg21_1, (8, ), (1, ))
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     with torch.cuda._DeviceGuard(0):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         torch.cuda.set_device(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf0 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf6 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf18 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [cur_x, cur_x_3, cur_x_6], Original ATen: [aten.index]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused_index_0.run(arg0_1, arg1_1, arg5_1, arg11_1, buf0, buf6, buf18, 8192, grid=grid(8192), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg0_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg11_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg5_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf1 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused__to_copy_1.run(arg14_1, buf1, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf2 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf0, buf1, out=buf2)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf3 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf0, reinterpret_tensor(arg18_1, (4096, 14336), (1, 4096), 0), out=buf3)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf4 = buf2; del buf2  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused_mul_silu_2.run(buf4, arg15_1, buf3, 28672, grid=grid(28672), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf5 = buf0; del buf0  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear, silu, mul, cur_out], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf4, reinterpret_tensor(arg17_1, (14336, 4096), (1, 14336), 0), out=buf5)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf7 = buf1; del buf1  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused__to_copy_3.run(arg14_1, buf7, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf8 = buf4; del buf4  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf6, buf7, out=buf8)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf9 = buf3; del buf3  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_10], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf6, reinterpret_tensor(arg18_1, (4096, 14336), (1, 4096), 176160768), out=buf9)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf10 = buf8; del buf8  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused_mul_silu_4.run(buf10, arg15_1, buf9, 28672, grid=grid(28672), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf11 = buf6; del buf6  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3, cur_out_3], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf10, reinterpret_tensor(arg17_1, (14336, 4096), (1, 14336), 176160768), out=buf11)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf12 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf13 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf15 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf16 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_12, linear_13, linear_15, linear_16], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_red_fused_mm_5.run(arg7_1, arg1_1, arg14_1, arg18_1, arg9_1, buf12, buf13, buf15, buf16, 14336, 4096, grid=grid(14336), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg1_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg7_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg9_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf14 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_red_fused_mm_6.run(buf12, arg15_1, buf13, arg17_1, buf14, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf12
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf13
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf17 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_red_fused_mm_7.run(buf15, arg15_1, buf16, arg17_1, buf17, 4096, 14336, grid=grid(4096), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf15
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf16
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf19 = buf7; del buf7  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused__to_copy_8.run(arg14_1, buf19, 58720256, grid=grid(58720256), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg14_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf20 = buf10; del buf10  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy, aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf18, buf19, out=buf20)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf19
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf21 = buf9; del buf9  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_19], Original ATen: [aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf18, reinterpret_tensor(arg18_1, (4096, 14336), (1, 4096), 352321536), out=buf21)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg18_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf22 = buf20; del buf20  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused_mul_silu_9.run(buf22, arg15_1, buf21, 28672, grid=grid(28672), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg15_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf21
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf23 = buf18; del buf18  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6, cur_out_6], Original ATen: [aten.mul, aten.silu, aten.mm]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         extern_kernels.mm(buf22, reinterpret_tensor(arg17_1, (14336, 4096), (1, 14336), 352321536), out=buf23)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg17_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf22
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf24 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf26 = buf24; del buf24  # reuse
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused_cat_index_mul_view_10.run(buf26, buf5, buf11, buf14, buf17, buf23, arg20_1, arg19_1, 32768, grid=grid(32768), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg19_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg20_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf11
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf14
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf17
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf23
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf5
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         buf25 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         stream0 = get_raw_stream(0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         triton_poi_fused_index_mul_scatter_add_view_zeros_like_11.run(buf25, 16384, grid=grid(16384), stream=stream0)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         aten.scatter_reduce_.two(buf25,0,reinterpret_tensor(arg21_1, (8, 4096), (1, 0), 0),buf26, reduce='sum', include_self=True)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del arg21_1
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]         del buf26
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     return (buf25, )
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     from torch._dynamo.testing import rand_strided
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     from torch._inductor.utils import print_performance
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg0_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg2_1 = 2
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg3_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg5_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg6_1 = 4
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg7_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg8_1 = 5
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg9_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg10_1 = 6
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg11_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg12_1 = 8
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg13_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg14_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg15_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg16_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg17_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg18_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg19_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg20_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     arg21_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1])
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] if __name__ == "__main__":
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code]     compiled_module_main('None', benchmark_compiled_module)
 V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] 
 V0401 02:34:34.275000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/4] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/vc/cvclk5nwwmijilkr6g36t3fpsaupngalce2feusbejlngivi4n6c.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] Output code: 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # AOT ID: ['10_inference']
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from ctypes import c_void_p, c_long, c_int
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import torch
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import random
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import os
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import tempfile
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from math import inf, nan
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from cmath import nanj
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.utils import maybe_profile
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch import device, empty_strided
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.async_compile import AsyncCompile
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.select_algorithm import extern_kernels
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_heuristics import (
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     grid,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     split_scan_grid,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     grid_combo_kernels,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     start_graph,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     end_graph,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     cooperative_reduction_grid,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] aten = torch.ops.aten
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_ops = torch.ops.inductor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] _quantized = torch.ops._quantized
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] async_compile = AsyncCompile()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/wk/cwkeabswtnly4envvmyy47m2dnvt6tcq2mqvl23aerqffjvcpm5s.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [cur_x_2, cur_x_7], Original ATen: [aten.index]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   cur_x_2 => index_2
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   cur_x_7 => index_7
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %index_2 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg3_1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %index_7 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg12_1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 8192}, 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i64', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 3, 4, 5), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 8192
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x1 = xindex // 4096
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x0 = (xindex % 4096)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x2 = xindex
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp6 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp1 = tl.full([XBLOCK], 4, tl.int32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp2 = tmp0 + tmp1
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp3 = tmp0 < 0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp4 = tl.where(tmp3, tmp2, tmp0)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp7 = tmp6 + tmp1
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp8 = tmp6 < 0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp9 = tl.where(tmp8, tmp7, tmp6)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp10 = tl.load(in_ptr1 + (x0 + 4096*tmp9), None).to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(out_ptr0 + (x2), tmp5, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(out_ptr1 + (x2), tmp10, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/pp/cppt43372rvepxmwsrrmtoqxmhn7h57p7lpbofdbz5qy6rfpsuqv.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   final_out => full_default
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   final_out_1 => scatter_add
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   getitem_32 => index_8
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   ordered_token_activation_weights => view_17
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   weighted_ordered_outs => mul_36
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg19_1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_36), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_1 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 16384}, 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_1(out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 16384
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x0 = xindex
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp0 = 0.0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(out_ptr0 + (x0), tmp0, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/z4/cz4vffmbvb2kxqbqoturm7qcthdzlfhrztfk6mboxfmn2a3jfcnq.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_6], Original ATen: [aten._to_copy]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_6 => convert_element_type_21
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %convert_element_type_21 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_6, torch.bfloat16), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused__to_copy_2 = async_compile.triton('triton_poi_fused__to_copy_2', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused__to_copy_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 58720256
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x0 = xindex
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp0 = tl.load(in_ptr0 + (117440512 + x0), None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ia/cia4jpg2c2vn4yq7jq4nhf7q7scd62gvulaxucajatglemmgbfqq.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_6, silu_2, mul_2], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_6 => mul_9
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   mul_2 => mul_11
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   silu_2 => convert_element_type_24, convert_element_type_25, mul_10, sigmoid_2
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_3, %select_11), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %convert_element_type_24 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_9, torch.float32), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sigmoid_2 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_24,), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_24, %sigmoid_2), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %convert_element_type_25 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_10, torch.bfloat16), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_25, %mm_4), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_mul_silu_3 = async_compile.triton('triton_poi_fused_mul_silu_3', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_mul_silu_3(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 28672
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x2 = xindex
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp1 = tl.load(in_ptr0 + (28672 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/kq/ckqvwkuuoeyn7m7x5ggcraabvfxslymsbl743lrf5eo2ynoebf2u.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_21 => convert_element_type_75
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %convert_element_type_75 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_21, torch.bfloat16), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused__to_copy_4 = async_compile.triton('triton_poi_fused__to_copy_4', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 67108864}, 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 58720256
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x0 = xindex
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp0 = tl.load(in_ptr0 + (411041792 + x0), None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp1 = tmp0.to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(out_ptr0 + (x0), tmp1, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/mq/cmqj7yusny67pmxxacb2tzwiol46m2m7fjrvkyfb6pjle642ghbh.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7], Original ATen: [aten.mul, aten.silu]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_21 => mul_33
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   mul_7 => mul_35
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   silu_7 => convert_element_type_78, convert_element_type_79, mul_34, sigmoid_7
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_9, %select_36), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %convert_element_type_78 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_33, torch.float32), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sigmoid_7 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_78,), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_34 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_78, %sigmoid_7), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %convert_element_type_79 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_34, torch.bfloat16), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_79, %mm_10), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_mul_silu_5 = async_compile.triton('triton_poi_fused_mul_silu_5', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 32768}, 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_5', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     min_elem_per_thread=0
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_mul_silu_5(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 28672
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = tl.full([XBLOCK], True, tl.int1)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x2 = xindex
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     x0 = (xindex % 14336)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp1 = tl.load(in_ptr0 + (100352 + x0), None, eviction_policy='evict_last').to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp2 = tmp0 * tmp1
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp3 = tmp2.to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp4 = tl.sigmoid(tmp3)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp5 = tmp3 * tmp4
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp6 = tmp5.to(tl.float32)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tmp8 = tmp6 * tmp7
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     tl.store(in_out_ptr0 + (x2), tmp8, None)
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda')
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/za/cza5auy5mwzv2rnfug7eub7cff2alhhumyrfxwv43oeoryxqn4ho.py
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_3, linear_4, linear_9, linear_10, linear_12, linear_13, linear_18, linear_19], Original ATen: [aten.mm]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_10 => mul_15, sum_5
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_12 => mul_18, sum_7
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_13 => mul_21, sum_8
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_18 => mul_27, sum_10
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_19 => mul_30, sum_11
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_3 => mul_3, sum_1
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_4 => mul_6, sum_2
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   linear_9 => mul_12, sum_4
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment:
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_12, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_21, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_18, %unsqueeze_19), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_10 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_27, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_20, %unsqueeze_21), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] #   %sum_11 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {})
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_red_fused_mm_6 = async_compile.triton('triton_red_fused_mm_6', '''
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu()
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] 
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.reduction(
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     size_hints={'x': 16384, 'r0_': 4096},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     reduction_hint=ReductionHint.DEFAULT,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     filename=__file__,
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'in_ptr6': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'out_ptr6': '*fp32', 'out_ptr7': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 12, 'num_reduction': 8, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] )
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_red_fused_mm_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, out_ptr6, out_ptr7, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xnumel = 14336
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     r0_numel = 4096
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     rnumel = r0_numel
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     RBLOCK: tl.constexpr = R0_BLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xoffset = tl.program_id(0) * XBLOCK
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     xmask = xindex < xnumel
 V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code]     r0_base = tl.arange(0, R0_BLOCK)[None, :]
 V0401 02:34:34.860000 3240940 site-packages
No results found