Created
April 1, 2025 09:36
-
-
Save HDCharles/91fc0690a4a1f00c654b0b92aa7ee239 to your computer and use it in GitHub Desktop.
transient error
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /home/cdhernandez/.conda/envs/pytorch-3.12/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature. | |
| self.gen = func(*args, **kwds) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] Output code: | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # AOT ID: ['0_inference'] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import torch | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import math | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import random | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import os | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import tempfile | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from math import inf, nan | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from cmath import nanj | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] grid, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] split_scan_grid, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] grid_combo_kernels, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] start_graph, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] end_graph, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] ) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nw/cnw55j7m7wguamk45hegxbqannr3ua2whtxlkam2qoztukgvzhaa.py | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Topologically Sorted Source Nodes: [float_1, mul, mean, add, rsqrt, mul_1, output, mul_2], Original ATen: [aten._to_copy, aten.mul, aten.mean, aten.add, aten.rsqrt] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # add => add | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # float_1 => convert_element_type | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # mean => mean | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # mul => mul | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # mul_1 => mul_1 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # mul_2 => mul_2 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # output => convert_element_type_1 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # rsqrt => rsqrt | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Graph fragment: | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %convert_element_type : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %convert_element_type), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %mean : [num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%mul, [-1], True), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mean, 1e-05), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %rsqrt : [num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %convert_element_type_1 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %arg1_1), kwargs = {}) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] triton_red_fused__to_copy_add_mean_mul_rsqrt_0 = async_compile.triton('triton_red_fused__to_copy_add_mean_mul_rsqrt_0', ''' | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] size_hints={'x': 4, 'r0_': 4096}, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] filename=__file__, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 4), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_add_mean_mul_rsqrt_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] ) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] @triton.jit | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] def triton_red_fused__to_copy_add_mean_mul_rsqrt_0(in_ptr0, in_ptr1, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] xnumel = 4 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_numel = 4096 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] rbase = r0_base | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] x0 = xindex | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] roffset = r0_offset | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] rindex = r0_index | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp2 = tmp1 * tmp1 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp5 = _tmp4 + tmp3 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] _tmp4 = tl.where(r0_mask & xmask, tmp5, _tmp4) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp4 = tl.sum(_tmp4, 1)[:, None] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] roffset = r0_offset | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] rindex = r0_index | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp6 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp15 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp8 = 4096.0 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp9 = tmp4 / tmp8 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp10 = 1e-05 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp11 = tmp9 + tmp10 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp12 = libdevice.rsqrt(tmp11) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp13 = tmp7 * tmp12 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp14 = tmp13.to(tl.float32) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tmp16 = tmp14 * tmp15 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] tl.store(out_ptr1 + (r0_1 + 4096*x0), tmp16, r0_mask & xmask) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] del async_compile | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] def call(args): | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] arg0_1, arg1_1 = args | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] args.clear() | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] assert_size_stride(arg0_1, (4, 1, 4096), (4096, 4096, 1)) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] assert_size_stride(arg1_1, (4096, ), (1, )) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] buf1 = empty_strided_cuda((4, 1, 4096), (4096, 4096, 1), torch.bfloat16) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] # Topologically Sorted Source Nodes: [float_1, mul, mean, add, rsqrt, mul_1, output, mul_2], Original ATen: [aten._to_copy, aten.mul, aten.mean, aten.add, aten.rsqrt] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] triton_red_fused__to_copy_add_mean_mul_rsqrt_0.run(arg0_1, arg1_1, buf1, 4, 4096, grid=grid(4), stream=stream0) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] del arg0_1 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] del arg1_1 | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] return (buf1, ) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] arg0_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] arg1_1 = rand_strided((4096, ), (1, ), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] fn = lambda: call([arg0_1, arg1_1]) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1091] [2/0_1] [__output_code] | |
| V0401 02:34:28.775000 3240940 site-packages/torch/_inductor/codecache.py:1092] [2/0_1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/kv/ckvcpd3fzxrnga3hqj7ehqmfd7ztquytgkda3j6jajm6zzru7noz.py | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] Output code: | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # AOT ID: ['1_inference'] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import torch | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import math | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import random | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import os | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import tempfile | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from math import inf, nan | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from cmath import nanj | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] grid, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] split_scan_grid, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] grid_combo_kernels, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] start_graph, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] end_graph, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/sz/cszgef2zhtwtfb5ukfdazs4v2uocs3t6l54we5tv4pgl32s324yr.py | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [expert_weights], Original ATen: [aten._softmax] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # expert_weights => amax, convert_element_type_2, convert_element_type_3, div, exp, sub, sum_1 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Graph fragment: | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %convert_element_type_2 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mm, torch.float32), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %amax : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%convert_element_type_2, [-1], True), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_2, %amax), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %exp : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub,), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp, [-1], True), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %div : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp, %sum_1), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %convert_element_type_3 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_per_fused__softmax_0 = async_compile.triton('triton_per_fused__softmax_0', ''' | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton_heuristics.persistent_reduction( | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] size_hints={'x': 4, 'r0_': 8}, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] filename=__file__, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0,), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused__softmax_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 2, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton.jit | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def triton_per_fused__softmax_0(in_out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xnumel = 4 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] r0_numel = 8 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] R0_BLOCK: tl.constexpr = 8 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] r0_offset = 0 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] roffset = r0_offset | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] rindex = r0_index | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] x0 = xindex | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp0 = tl.load(in_out_ptr0 + (r0_1 + 8*x0), xmask, other=0.0).to(tl.float32) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp4 = tl.where(xmask, tmp2, float("-inf")) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp5 = triton_helpers.max2(tmp4, 1)[:, None] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp6 = tmp1 - tmp5 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp7 = tl_math.exp(tmp6) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp8 = tl.broadcast_to(tmp7, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp10 = tl.where(xmask, tmp8, 0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp11 = tl.sum(tmp10, 1)[:, None] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp12 = tmp7 / tmp11 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tl.store(in_out_ptr0 + (r0_1 + 8*x0), tmp13, xmask) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/5g/c5gztvklis5blbr72aeuxee7ogqe3urzratlqymlsfl5rml5odfk.py | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [sum_1, expert_weights_2], Original ATen: [aten.sum, aten.div] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # expert_weights_2 => div_1 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # sum_1 => sum_2 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Graph fragment: | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%getitem, [-1], True), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # %div_1 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%getitem, %sum_2), kwargs = {}) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_poi_fused_div_sum_1 = async_compile.triton('triton_poi_fused_div_sum_1', ''' | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] size_hints={'x': 8}, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] filename=__file__, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_div_sum_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] @triton.jit | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def triton_poi_fused_div_sum_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xnumel = 8 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] x2 = xindex | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] x1 = xindex // 2 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp0 = tl.load(in_ptr0 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp1 = tl.load(in_ptr0 + (2*x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp2 = tl.load(in_ptr0 + (1 + 2*x1), xmask, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp3 = tmp1 + tmp2 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tmp4 = tmp0 / tmp3 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] tl.store(out_ptr0 + (x2), tmp4, xmask) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] del async_compile | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def call(args): | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] arg0_1, arg1_1 = args | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] args.clear() | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] assert_size_stride(arg0_1, (4, 1, 4096), (4096, 4096, 1)) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] assert_size_stride(arg1_1, (8, 4096), (4096, 1)) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] buf0 = empty_strided_cuda((4, 8), (8, 1), torch.bfloat16) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [scores], Original ATen: [aten.mm] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] extern_kernels.mm(reinterpret_tensor(arg0_1, (4, 4096), (4096, 1), 0), reinterpret_tensor(arg1_1, (4096, 8), (1, 4096), 0), out=buf0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] del arg1_1 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] buf3 = buf0; del buf0 # reuse | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [expert_weights], Original ATen: [aten._softmax] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_per_fused__softmax_0.run(buf3, 4, 8, grid=grid(4), stream=stream0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [expert_weights, topk], Original ATen: [aten._softmax, aten.topk] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] buf4 = torch.ops.aten.topk.default(buf3, 2) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] del buf3 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] buf5 = buf4[0] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] assert_size_stride(buf5, (4, 2), (2, 1)) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] buf6 = buf4[1] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] assert_size_stride(buf6, (4, 2), (2, 1)) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] del buf4 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] buf7 = empty_strided_cuda((4, 2), (2, 1), torch.bfloat16) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] # Topologically Sorted Source Nodes: [sum_1, expert_weights_2], Original ATen: [aten.sum, aten.div] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] triton_poi_fused_div_sum_1.run(buf5, buf7, 8, grid=grid(8), stream=stream0) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] del buf5 | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] return (reinterpret_tensor(arg0_1, (4, 4096), (4096, 1), 0), buf6, buf7, ) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] arg0_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] arg1_1 = rand_strided((8, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] fn = lambda: call([arg0_1, arg1_1]) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:29.313000 3240940 site-packages/torch/_inductor/codecache.py:1091] [3/0_1] [__output_code] | |
| V0401 02:34:29.314000 3240940 site-packages/torch/_inductor/codecache.py:1092] [3/0_1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/sa/csa7hmt3qvuirshz463bgy4sd3olawgiy7zmlwnkhgzoiuxkvuas.py | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] Output code: | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # AOT ID: ['2_inference'] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import torch | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import math | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import random | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import os | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import tempfile | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from math import inf, nan | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from cmath import nanj | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] grid, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] split_scan_grid, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] grid_combo_kernels, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] start_graph, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] end_graph, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/x5/cx5dcpwwkbcak7ec6f42gznrnizwakgvvysewhahd357r2afaoky.py | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [cumsum], Original ATen: [aten.cumsum] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # cumsum => cumsum | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Graph fragment: | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # %cumsum : [num_users=1] = call_function[target=torch.ops.aten.cumsum.default](args = (%histc, 0), kwargs = {}) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_per_fused_cumsum_0 = async_compile.triton('triton_per_fused_cumsum_0', ''' | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton.jit | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def _triton_helper_fn_add0(arg0_0, arg1_0): | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp0 = arg0_0 + arg1_0 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] return tmp0 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton_heuristics.persistent_reduction( | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] size_hints={'x': 1, 'r0_': 16}, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] filename=__file__, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*i64', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0,), 'tt.equal_to': (1,)}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_cumsum_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton.jit | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def triton_per_fused_cumsum_0(in_out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xnumel = 1 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_numel = 9 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] R0_BLOCK: tl.constexpr = 16 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_offset = 0 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] roffset = r0_offset | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] rindex = r0_index | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_0 = r0_index | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp0 = tl.load(in_out_ptr0 + (r0_0), r0_mask, other=0.0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp1 = tmp0.to(tl.int64) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp3, = tl.associative_scan((tmp2,), 1, _triton_helper_fn_add0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tl.store(in_out_ptr0 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp3, r0_mask) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/2l/c2ly6tanvidk6eyla4mydbbwkltgwzimgveg677mmndzmp3d4dag.py | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [ordered_token_activations, div, floor, ordered_token_indices], Original ATen: [aten.sort, aten.div, aten.floor, aten._to_copy] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # div => div | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # floor => floor | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # ordered_token_activations => getitem_1, sort | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # ordered_token_indices => convert_element_type | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Graph fragment: | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # %sort : [num_users=1] = call_function[target=torch.ops.aten.sort.stable](args = (%view,), kwargs = {stable: True}) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # %getitem_1 : [num_users=2] = call_function[target=operator.getitem](args = (%sort, 1), kwargs = {}) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # %div : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%getitem_1, 2), kwargs = {}) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # %floor : [num_users=1] = call_function[target=torch.ops.aten.floor.default](args = (%div,), kwargs = {}) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%floor, torch.int64), kwargs = {}) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_per_fused__to_copy_div_floor_sort_1 = async_compile.triton('triton_per_fused__to_copy_div_floor_sort_1', ''' | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] import triton.language as tl | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton_heuristics.persistent_reduction( | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] size_hints={'x': 1, 'r0_': 8}, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] filename=__file__, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'out_ptr1': '*i64', 'out_ptr2': '*i64', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': (3,)}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_div_floor_sort_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] @triton.jit | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def triton_per_fused__to_copy_div_floor_sort_1(in_ptr0, out_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xnumel = 1 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_numel = 8 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] R0_BLOCK: tl.constexpr = 8 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_index = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_offset = 0 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] roffset = r0_offset | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] rindex = r0_index | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] r0_0 = r0_index | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_0), None) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp1 = r0_0 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp2 = tmp1.to(tl.int16) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp3 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp4 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp5, tmp6, = triton_helpers.sort_with_index(tmp3, tmp4, None, 1, stable=True, descending=False) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp7 = tmp6.to(tl.int64) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp8 = tmp7.to(tl.float32) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp9 = 0.5 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp10 = tmp8 * tmp9 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp11 = libdevice.floor(tmp10) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tmp12 = tmp11.to(tl.int64) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tl.store(out_ptr1 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp7, None) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] tl.store(out_ptr2 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp12, None) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] del async_compile | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def call(args): | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] arg0_1, = args | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] args.clear() | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] assert_size_stride(arg0_1, (4, 2), (2, 1)) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [num_tokens_per_expert], Original ATen: [aten.histc] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] buf4 = torch.ops.aten.histc.default(arg0_1, 9, -1, 8) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] buf5 = buf4 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] assert_size_stride(buf5, (9, ), (1, )) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] del buf4 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] buf6 = buf5; del buf5 # reuse | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [cumsum], Original ATen: [aten.cumsum] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_per_fused_cumsum_0.run(buf6, 1, 9, grid=grid(1), stream=stream0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] buf2 = empty_strided_cuda((8, ), (1, ), torch.int64) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] buf3 = empty_strided_cuda((8, ), (1, ), torch.int64) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] # Topologically Sorted Source Nodes: [ordered_token_activations, div, floor, ordered_token_indices], Original ATen: [aten.sort, aten.div, aten.floor, aten._to_copy] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] triton_per_fused__to_copy_div_floor_sort_1.run(arg0_1, buf2, buf3, 1, 8, grid=grid(1), stream=stream0) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] del arg0_1 | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] return (buf2, buf3, buf6, ) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] arg0_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] fn = lambda: call([arg0_1]) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:29.364000 3240940 site-packages/torch/_inductor/codecache.py:1091] [4/0_1] [__output_code] | |
| V0401 02:34:29.365000 3240940 site-packages/torch/_inductor/codecache.py:1092] [4/0_1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/iz/cizfitgcjxxo52caxetfsorvxrxg6uvxvkugusw3tk2kha2klabs.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] Output code: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # AOT ID: ['3_inference'] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import torch | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import random | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import os | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import tempfile | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from math import inf, nan | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from cmath import nanj | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] grid, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] split_scan_grid, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] grid_combo_kernels, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] start_graph, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] end_graph, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/o4/co4xnn2io22jfgnxz4ht5jvbf2llatrrh2jxbjcrgdchs4g4oage.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # cur_x => index | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %index : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg0_1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 12288 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x2 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/yj/cyj7p7xqlcmuj23z3eb2yyzznvkflquhslgxtbejde24is7cpycm.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear => convert_element_type | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_1 = async_compile.triton('triton_poi_fused__to_copy_1', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 58720256 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/sl/cslemfnvmno5niab2n5zknfoqfgbyu7zy6qecwrz2iim5mdzojk7.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear => mul | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # mul => mul_2 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # silu => convert_element_type_3, convert_element_type_4, mul_1, sigmoid | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %select_1), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_3 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.float32), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_3,), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %sigmoid), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %mm_1), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_2 = async_compile.triton('triton_poi_fused_mul_silu_2', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 65536}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_mul_silu_2(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 43008 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x2 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, xmask) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/64/c64hgrupdizact3b6llutwalvnx33ksi5ddojikmmstwkklowf3j.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_6, linear_7], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear_6 => mul_6, sum_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear_7 => mul_9, sum_2 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_9, [1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_red_fused_mm_3 = async_compile.triton('triton_red_fused_mm_3', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 16384, 'r0_': 4096}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] reduction_hint=ReductionHint.DEFAULT, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 2, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_red_fused_mm_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 14336 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_numel = 4096 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] rnumel = r0_numel | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] rbase = r0_base | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (0)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] roffset = r0_offset | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] rindex = r0_index | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_1 = r0_index | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tl.load(in_ptr2 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp14 = tl.load(in_ptr3 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp1 + tmp2 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tmp1 < 0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tl.where(tmp4, tmp3, tmp1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp13 = _tmp12 + tmp11 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp15 = tmp14.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp16 = tmp7 * tmp15 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp19 = _tmp18 + tmp17 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp12 = tl.sum(_tmp12, 1)[:, None] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp18 = tl.sum(_tmp18, 1)[:, None] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x0), tmp12, xmask) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr1 + (x0), tmp18, xmask) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/p4/cp44ewu7oadqvpm7kqql2mch4jkm7suwnms7dg3lxm6kymm24pkq.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # cur_out_2 => mul_11, sum_3 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_11, [1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_red_fused_mm_4 = async_compile.triton('triton_red_fused_mm_4', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_red_fused_mm_4(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 4096 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_numel = 14336 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] rnumel = r0_numel | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] rbase = r0_base | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] roffset = r0_offset | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] rindex = r0_index | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] r0_1 = r0_index | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tl.load(in_ptr1 + (28672 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp12 = tl.load(in_ptr3 + (117440512 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/hz/chzpp3oidskhj7g4vx6jgtqvyd4whdkmzx6oypxs2tyfu45hwcqw.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_x_3, cur_x_6], Original ATen: [aten.index] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # cur_x_3 => index_3 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # cur_x_6 => index_6 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %index_3 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg4_1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %index_6 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg7_1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_index_5 = async_compile.triton('triton_poi_fused_index_5', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 8192}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i64', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_index_5(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 8192 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x2 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tmp6 + tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tmp6 < 0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp6) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp10 = tl.load(in_ptr1 + (x0 + 4096*tmp9), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr1 + (x2), tmp10, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/la/clahscggfyxr4yqj5gamqksnyomqxgibxjiqa3ridzteknsvnwjm.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear_9 => convert_element_type_30 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_30 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_9, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_6 = async_compile.triton('triton_poi_fused__to_copy_6', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused__to_copy_6(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 58720256 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (176160768 + x0), None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/us/cusajh75vrnwh42pn5f4rv4knbjqkieyfcs5ogoltegkfmfkol4c.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear_9 => mul_12 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # mul_3 => mul_14 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # silu_3 => convert_element_type_33, convert_element_type_34, mul_13, sigmoid_3 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_6, %select_16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_33 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_12, torch.float32), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %sigmoid_3 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_33,), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_13 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_33, %sigmoid_3), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_34 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_13, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_14 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_34, %mm_7), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_7 = async_compile.triton('triton_poi_fused_mul_silu_7', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_mul_silu_7(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 28672 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x2 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.load(in_ptr0 + (43008 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/bn/cbntbuf2uowesjcvj7pijik2jtcococuwohpnparrrikin5bb3is.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear_18 => convert_element_type_57 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_57 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_18, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_8 = async_compile.triton('triton_poi_fused__to_copy_8', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 58720256 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_ptr0 + (352321536 + x0), None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xk/cxkxa4iqtzpndenpxumel4jvuld773zjmzkir4txf2hm5n7vrya4.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # linear_18 => mul_21 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # mul_6 => mul_23 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # silu_6 => convert_element_type_60, convert_element_type_61, mul_22, sigmoid_6 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_15, %select_31), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_60 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_21, torch.float32), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %sigmoid_6 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_60,), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_22 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_60, %sigmoid_6), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %convert_element_type_61 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_22, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_61, %mm_16), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_9 = async_compile.triton('triton_poi_fused_mul_silu_9', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_mul_silu_9(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 28672 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x2 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.load(in_ptr0 + (86016 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ct/cctreepea7pb6izuwtwnqatvjccnzanpmpyx44n36pl3c3zqxuyk.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # ordered_outs => cat | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # weighted_ordered_outs => mul_27 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%mm_2, %convert_element_type_29, %mm_8, %mm_17],), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg15_1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_cat_index_mul_view_10 = async_compile.triton('triton_poi_fused_cat_index_mul_view_10', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_10', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_cat_index_mul_view_10(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 32768 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x2 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp26 = tl.load(in_ptr4 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = x1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp1 = tl.full([1], 0, tl.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp2 = tmp0 >= tmp1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp3 = tl.full([1], 3, tl.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp4 = tmp0 < tmp3 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp5 = tl.load(in_ptr0 + (x0 + 4096*(x1)), tmp4, other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp6 = tmp0 >= tmp3 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp7 = tl.full([1], 4, tl.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp8 = tmp0 < tmp7 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp9 = tmp6 & tmp8 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp10 = tl.load(in_ptr1 + (x0), tmp9, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp12 = tl.full(tmp11.shape, 0.0, tmp11.dtype) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp13 = tl.where(tmp9, tmp11, tmp12) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp14 = tmp0 >= tmp7 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp15 = tl.full([1], 6, tl.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp16 = tmp0 < tmp15 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp17 = tmp14 & tmp16 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp18 = tl.load(in_ptr2 + (x0 + 4096*((-4) + x1)), tmp17, other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp19 = tmp0 >= tmp15 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp20 = tl.full([1], 8, tl.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp21 = tmp0 < tmp20 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp22 = tl.load(in_ptr3 + (x0 + 4096*((-6) + x1)), tmp19, other=0.0).to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp23 = tl.where(tmp17, tmp18, tmp22) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp24 = tl.where(tmp9, tmp13, tmp23) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp25 = tl.where(tmp4, tmp5, tmp24) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp27 = tl.full([XBLOCK], 8, tl.int32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp28 = tmp26 + tmp27 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp29 = tmp26 < 0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp30 = tl.where(tmp29, tmp28, tmp26) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp31 = tl.load(in_ptr5 + (tmp30), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp32 = tmp25 * tmp31 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x2), tmp32, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/lx/clxfjvsgrm7woaqpniy5p2traw2xugy7umu7g2rnslg6lyqzyhtu.py | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [final_out, final_out_1], Original ATen: [aten.zeros_like, aten.scatter_add] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # final_out => full_default | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # final_out_1 => scatter_add | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_27), kwargs = {}) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_scatter_add_zeros_like_11 = async_compile.triton('triton_poi_fused_scatter_add_zeros_like_11', ''' | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_scatter_add_zeros_like_11', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] @triton.jit | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def triton_poi_fused_scatter_add_zeros_like_11(out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xnumel = 16384 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tmp0 = 0.0 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] tl.store(out_ptr0 + (x0), tmp0, None) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del async_compile | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def call(args): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1 = args | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] args.clear() | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg0_1, (3, ), (1, )) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg1_1, (4, 4096), (4096, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg3_1, (1, ), (1, )) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg4_1, (2, ), (1, )) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg7_1, (2, ), (1, )) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg9_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg10_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg11_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg12_1, (8, 4096, 14336), (58720256, 14336, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg13_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg14_1, (4, 2), (2, 1)) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg15_1, (8, ), (1, )) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] assert_size_stride(arg16_1, (8, ), (1, )) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf0 = empty_strided_cuda((3, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_index_0.run(arg0_1, arg1_1, buf0, 12288, grid=grid(12288), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg0_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf1 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_1.run(arg9_1, buf1, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf2 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf0, buf1, out=buf2) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf3 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf0, reinterpret_tensor(arg13_1, (4096, 14336), (1, 4096), 0), out=buf3) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf4 = buf2; del buf2 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_2.run(buf4, arg10_1, buf3, 43008, grid=grid(43008), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf3 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf5 = buf0; del buf0 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul, cur_out], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf4, reinterpret_tensor(arg12_1, (14336, 4096), (1, 14336), 0), out=buf5) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf4 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_6, linear_7], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_red_fused_mm_3.run(arg3_1, arg1_1, arg9_1, arg13_1, buf6, buf7, 14336, 4096, grid=grid(14336), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg3_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_red_fused_mm_4.run(buf6, arg10_1, buf7, arg12_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf6 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf7 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf9 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf15 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [cur_x_3, cur_x_6], Original ATen: [aten.index] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_index_5.run(arg4_1, arg1_1, arg7_1, buf9, buf15, 8192, grid=grid(8192), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg1_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg4_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg7_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf10 = buf1; del buf1 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_6.run(arg9_1, buf10, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf11 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf9, buf10, out=buf11) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf12 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_10], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf9, reinterpret_tensor(arg13_1, (4096, 14336), (1, 4096), 176160768), out=buf12) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf13 = buf11; del buf11 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_7.run(buf13, arg10_1, buf12, 28672, grid=grid(28672), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf14 = buf9; del buf9 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3, cur_out_3], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf13, reinterpret_tensor(arg12_1, (14336, 4096), (1, 14336), 176160768), out=buf14) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf16 = buf10; del buf10 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused__to_copy_8.run(arg9_1, buf16, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg9_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf17 = buf13; del buf13 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf15, buf16, out=buf17) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf16 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf18 = buf12; del buf12 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf15, reinterpret_tensor(arg13_1, (4096, 14336), (1, 4096), 352321536), out=buf18) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg13_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf19 = buf17; del buf17 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_mul_silu_9.run(buf19, arg10_1, buf18, 28672, grid=grid(28672), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg10_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf18 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf20 = buf15; del buf15 # reuse | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6, cur_out_6], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] extern_kernels.mm(buf19, reinterpret_tensor(arg12_1, (14336, 4096), (1, 14336), 352321536), out=buf20) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg12_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf19 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf21 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_cat_index_mul_view_10.run(buf5, buf8, buf14, buf20, arg15_1, arg14_1, buf21, 32768, grid=grid(32768), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg14_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg15_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf14 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf20 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf5 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf8 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] buf22 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] # Topologically Sorted Source Nodes: [final_out, final_out_1], Original ATen: [aten.zeros_like, aten.scatter_add] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] triton_poi_fused_scatter_add_zeros_like_11.run(buf22, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] aten.scatter_reduce_.two(buf22,0,reinterpret_tensor(arg16_1, (8, 4096), (1, 0), 0),buf21, reduce='sum', include_self=True) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del arg16_1 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] del buf21 | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] return (buf22, ) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg0_1 = rand_strided((3, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg2_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg3_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg4_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg5_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg6_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg7_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg8_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg9_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg10_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg11_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg12_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg13_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg14_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg15_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] arg16_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1]) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:29.615000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/0] [__output_code] | |
| V0401 02:34:29.616000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/0] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/27/c27evdctcr5rhwr4lru3xk34rbtzfmvgxoznmbsawgwtxx5f6wwm.py | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] Output code: | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] # AOT ID: ['4_inference'] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import torch | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import math | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import random | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import os | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] import tempfile | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from math import inf, nan | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from cmath import nanj | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] del async_compile | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] def call(args): | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] arg0_1, = args | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] args.clear() | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] assert_size_stride(arg0_1, (4, 4096), (4096, 1)) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] return (reinterpret_tensor(arg0_1, (4, 1, 4096), (4096, 4096, 1), 0), ) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] arg0_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] fn = lambda: call([arg0_1]) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1091] [8/0] [__output_code] | |
| V0401 02:34:29.854000 3240940 site-packages/torch/_inductor/codecache.py:1092] [8/0] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/m5/cm5thqm6z4ynaqklrs7ic6cv4pgvjmeyyf3i3jrqp3u7zfrxvkbp.py | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] Output code: | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # AOT ID: ['5_inference'] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import torch | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import math | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import random | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import os | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import tempfile | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from math import inf, nan | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from cmath import nanj | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] grid, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] split_scan_grid, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] grid_combo_kernels, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] start_graph, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] end_graph, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] ) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ri/crizb6pffeptnetkgqxd7sozjkjnq6ne5zh2c2mhrig66r6wdi7p.py | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Topologically Sorted Source Nodes: [out], Original ATen: [aten.add] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # out => add | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Graph fragment: | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg0_1, %arg1_1), kwargs = {}) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] triton_poi_fused_add_0 = async_compile.triton('triton_poi_fused_add_0', ''' | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] import triton.language as tl | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] filename=__file__, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] ) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] @triton.jit | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] def triton_poi_fused_add_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] xnumel = 16384 | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] x0 = xindex | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] tmp1 = tl.load(in_ptr1 + (x0), None).to(tl.float32) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] tl.store(out_ptr0 + (x0), tmp2, None) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] del async_compile | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] def call(args): | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] arg0_1, arg1_1 = args | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] args.clear() | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] assert_size_stride(arg0_1, (4, 1, 4096), (4096, 4096, 1)) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] assert_size_stride(arg1_1, (4, 1, 4096), (4096, 4096, 1)) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] buf0 = empty_strided_cuda((4, 1, 4096), (4096, 4096, 1), torch.bfloat16) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] # Topologically Sorted Source Nodes: [out], Original ATen: [aten.add] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] triton_poi_fused_add_0.run(arg0_1, arg1_1, buf0, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] del arg0_1 | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] del arg1_1 | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] return (buf0, ) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] arg0_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] arg1_1 = rand_strided((4, 1, 4096), (4096, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] fn = lambda: call([arg0_1, arg1_1]) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1091] [9/0] [__output_code] | |
| V0401 02:34:29.897000 3240940 site-packages/torch/_inductor/codecache.py:1092] [9/0] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/pr/cprgnlfeu2ugvkfc3eeslbdmpypbt4grnfduu3t5ayen3dpmlt23.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] Output code: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # AOT ID: ['6_inference'] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import torch | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import random | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import os | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import tempfile | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from math import inf, nan | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from cmath import nanj | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] grid, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] split_scan_grid, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] grid_combo_kernels, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] start_graph, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] end_graph, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xu/cxubyjb4f3dfq4lxjugwcaa5ojabp22ovyl6ijcs5lrkzaqqadka.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_3, linear_4, linear_12, linear_13, linear_15, linear_16, linear_18, linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_12 => mul_15, sum_4 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_13 => mul_18, sum_5 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_15 => mul_21, sum_7 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_16 => mul_24, sum_8 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_18 => mul_27, sum_10 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_19 => mul_30, sum_11 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_3 => mul_3, sum_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_4 => mul_6, sum_2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_21, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_24, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_18, %unsqueeze_19), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_10 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_27, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_20, %unsqueeze_21), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_11 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_0 = async_compile.triton('triton_red_fused_mm_0', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 16384, 'r0_': 4096}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reduction_hint=ReductionHint.DEFAULT, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'in_ptr6': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'out_ptr6': '*fp32', 'out_ptr7': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 12, 'num_reduction': 8, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, out_ptr6, out_ptr7, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 14336 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_numel = 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rbase = r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (0)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp20 = tl.load(in_ptr4 + (0)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp39 = tl.load(in_ptr5 + (0)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp40 = tl.broadcast_to(tmp39, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp50 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp56 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp58 = tl.load(in_ptr6 + (0)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp59 = tl.broadcast_to(tmp58, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp69 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp75 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] roffset = r0_offset | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rindex = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tl.load(in_ptr2 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp14 = tl.load(in_ptr3 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp27 = tl.load(in_ptr2 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp33 = tl.load(in_ptr3 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp46 = tl.load(in_ptr2 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp52 = tl.load(in_ptr3 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp65 = tl.load(in_ptr2 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp71 = tl.load(in_ptr3 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp1 + tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tmp1 < 0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.where(tmp4, tmp3, tmp1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp13 = _tmp12 + tmp11 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp15 = tmp14.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp16 = tmp7 * tmp15 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp19 = _tmp18 + tmp17 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp22 = tmp21 + tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp23 = tmp21 < 0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp24 = tl.where(tmp23, tmp22, tmp21) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp26 = tmp25.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp28 = tmp27.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp29 = tmp26 * tmp28 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp32 = _tmp31 + tmp30 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp34 = tmp33.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp35 = tmp26 * tmp34 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp38 = _tmp37 + tmp36 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp41 = tmp40 + tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp42 = tmp40 < 0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp43 = tl.where(tmp42, tmp41, tmp40) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp44 = tl.load(in_ptr1 + (r0_1 + 4096*tmp43), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp45 = tmp44.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp47 = tmp46.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp48 = tmp45 * tmp47 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp49 = tl.broadcast_to(tmp48, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp51 = _tmp50 + tmp49 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp50 = tl.where(r0_mask & xmask, tmp51, _tmp50) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp53 = tmp52.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp54 = tmp45 * tmp53 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp55 = tl.broadcast_to(tmp54, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp57 = _tmp56 + tmp55 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp56 = tl.where(r0_mask & xmask, tmp57, _tmp56) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp60 = tmp59 + tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp61 = tmp59 < 0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp62 = tl.where(tmp61, tmp60, tmp59) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp63 = tl.load(in_ptr1 + (r0_1 + 4096*tmp62), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp64 = tmp63.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp66 = tmp65.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp67 = tmp64 * tmp66 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp68 = tl.broadcast_to(tmp67, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp70 = _tmp69 + tmp68 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp69 = tl.where(r0_mask & xmask, tmp70, _tmp69) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp72 = tmp71.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp73 = tmp64 * tmp72 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp74 = tl.broadcast_to(tmp73, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp76 = _tmp75 + tmp74 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp75 = tl.where(r0_mask & xmask, tmp76, _tmp75) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp12 = tl.sum(_tmp12, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp18 = tl.sum(_tmp18, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp31 = tl.sum(_tmp31, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp37 = tl.sum(_tmp37, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp50 = tl.sum(_tmp50, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp56 = tl.sum(_tmp56, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp69 = tl.sum(_tmp69, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp75 = tl.sum(_tmp75, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp12, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr1 + (x0), tmp18, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr2 + (x0), tmp31, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr3 + (x0), tmp37, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr4 + (x0), tmp50, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr5 + (x0), tmp56, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr6 + (x0), tmp69, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr7 + (x0), tmp75, xmask) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ur/curt75b3i4q2ncyfteskzogwmeozmkcorskoajpt6t5cxsox3shf.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # cur_out_1 => mul_8, sum_3 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_8 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_8, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_1 = async_compile.triton('triton_red_fused_mm_1', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_1(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_numel = 14336 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rbase = r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] roffset = r0_offset | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rindex = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tl.load(in_ptr1 + (14336 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp12 = tl.load(in_ptr3 + (58720256 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nv/cnvigyarqkoqkrzao6xsn6e2aq3m2k6tu3qmtpxrwxa4r5jseolx.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # cur_out_4 => mul_20, sum_6 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_20 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_20, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_2 = async_compile.triton('triton_red_fused_mm_2', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_numel = 14336 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rbase = r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] roffset = r0_offset | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rindex = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tl.load(in_ptr1 + (57344 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp12 = tl.load(in_ptr3 + (234881024 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/df/cdf5pfpbj4fujw6qckkl2cwjemrdxy2tlk77lleu3lglvjxmqoxu.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # cur_out_5 => mul_26, sum_9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_16, %unsqueeze_17), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_9 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_26, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_3 = async_compile.triton('triton_red_fused_mm_3', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_numel = 14336 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rbase = r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] roffset = r0_offset | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rindex = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tl.load(in_ptr1 + (71680 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp12 = tl.load(in_ptr3 + (293601280 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/4e/c4ezrplevfovmql56fqmbjyg3ukwfiepbujmfc366pbac235zmy2.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # cur_out_6 => mul_32, sum_12 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_32 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_22, %unsqueeze_23), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sum_12 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_32, [1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_4 = async_compile.triton('triton_red_fused_mm_4', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_red_fused_mm_4(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_numel = 14336 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rnumel = r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rbase = r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] roffset = r0_offset | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] rindex = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] r0_1 = r0_index | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tl.load(in_ptr1 + (86016 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp12 = tl.load(in_ptr3 + (352321536 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/iw/ciwidkcd3ljl5a5zq2hjsxygmbf64oghthcmaew3lnibva6zev7h.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_x_7], Original ATen: [aten.index] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # cur_x_7 => index_7 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %index_7 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg11_1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_index_5 = async_compile.triton('triton_poi_fused_index_5', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_index_5(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 16384 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x2 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ui/cuircasukcqg6f73z26qb6qfe5gfxff4efagdgqou4pdcrmpi7nj.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_21 => convert_element_type_75 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %convert_element_type_75 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_21, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused__to_copy_6 = async_compile.triton('triton_poi_fused__to_copy_6', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused__to_copy_6(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 58720256 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_ptr0 + (411041792 + x0), None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/bk/cbknq2eijpxdvrpijconf66fgurvapv6anth5r24fs4ukfzsebkj.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # linear_21 => mul_33 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # mul_7 => mul_35 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # silu_7 => convert_element_type_78, convert_element_type_79, mul_34, sigmoid_7 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_9, %select_36), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %convert_element_type_78 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_33, torch.float32), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %sigmoid_7 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_78,), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_34 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_78, %sigmoid_7), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %convert_element_type_79 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_34, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_79, %mm_10), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_mul_silu_7 = async_compile.triton('triton_poi_fused_mul_silu_7', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 65536}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_mul_silu_7(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 57344 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x2 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tl.load(in_ptr0 + (100352 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ll/clljbokytk2lektwzfmhr7qpaxzbo5jvh6ocbzqat67rzsxp2hzg.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # ordered_outs => cat | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # weighted_ordered_outs => mul_36 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_20, %convert_element_type_50, %convert_element_type_62, %convert_element_type_74, %mm_11],), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg18_1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_cat_index_mul_view_8 = async_compile.triton('triton_poi_fused_cat_index_mul_view_8', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'in_ptr5': '*i64', 'in_ptr6': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_8', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_cat_index_mul_view_8(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 32768 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x2 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp41 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = x1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp1 = tl.full([1], 0, tl.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp2 = tmp0 >= tmp1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp3 = tl.full([1], 1, tl.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp4 = tmp0 < tmp3 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp5 = tl.load(in_ptr0 + (x0), tmp4, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp7 = tl.full(tmp6.shape, 0.0, tmp6.dtype) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp8 = tl.where(tmp4, tmp6, tmp7) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp9 = tmp0 >= tmp3 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp10 = tl.full([1], 2, tl.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp11 = tmp0 < tmp10 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp12 = tmp9 & tmp11 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp13 = tl.load(in_ptr1 + (x0), tmp12, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp14 = tmp13.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp15 = tl.full(tmp14.shape, 0.0, tmp14.dtype) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp16 = tl.where(tmp12, tmp14, tmp15) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp17 = tmp0 >= tmp10 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp18 = tl.full([1], 3, tl.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp19 = tmp0 < tmp18 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp20 = tmp17 & tmp19 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp21 = tl.load(in_ptr2 + (x0), tmp20, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp22 = tmp21.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp23 = tl.full(tmp22.shape, 0.0, tmp22.dtype) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp24 = tl.where(tmp20, tmp22, tmp23) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp25 = tmp0 >= tmp18 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp26 = tl.full([1], 4, tl.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp27 = tmp0 < tmp26 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp28 = tmp25 & tmp27 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp29 = tl.load(in_ptr3 + (x0), tmp28, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp30 = tmp29.to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp31 = tl.full(tmp30.shape, 0.0, tmp30.dtype) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp32 = tl.where(tmp28, tmp30, tmp31) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp33 = tmp0 >= tmp26 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp34 = tl.full([1], 8, tl.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp35 = tmp0 < tmp34 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp36 = tl.load(in_ptr4 + (x0 + 4096*((-4) + x1)), tmp33, other=0.0).to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp37 = tl.where(tmp28, tmp32, tmp36) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp38 = tl.where(tmp20, tmp24, tmp37) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp39 = tl.where(tmp12, tmp16, tmp38) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp40 = tl.where(tmp4, tmp8, tmp39) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp42 = tl.full([XBLOCK], 8, tl.int32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp43 = tmp41 + tmp42 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp44 = tmp41 < 0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp45 = tl.where(tmp44, tmp43, tmp41) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp46 = tl.load(in_ptr6 + (tmp45), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp47 = tmp40 * tmp46 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(in_out_ptr0 + (x2), tmp47, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/5v/c5vf3qxpfhadxjjk2iaok6tatmp3ma3zbvp3i55o72txamnxemhp.py | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # final_out => full_default | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # final_out_1 => scatter_add | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # weighted_ordered_outs => mul_36 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Graph fragment: | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg18_1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_36), kwargs = {}) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_9 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_9', ''' | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] import triton.language as tl | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] filename=__file__, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] @triton.jit | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_9(out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xnumel = 16384 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] x0 = xindex | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tmp0 = 0.0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] tl.store(out_ptr0 + (x0), tmp0, None) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del async_compile | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def call(args): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1 = args | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] args.clear() | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] s0 = arg6_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] s1 = arg8_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] s3 = arg10_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg1_1, (4, 4096), (4096, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg2_1, (1, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg5_1, (1, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg7_1, (1, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg9_1, (1, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg11_1, (4, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg12_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg13_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg14_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg15_1, (8, 4096, 14336), (58720256, 14336, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg16_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg17_1, (4, 2), (2, 1)) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg18_1, (8, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] assert_size_stride(arg19_1, (8, ), (1, )) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf0 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf1 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf3 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf4 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf9 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf10 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_3, linear_4, linear_12, linear_13, linear_15, linear_16, linear_18, linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_0.run(arg2_1, arg1_1, arg12_1, arg16_1, arg5_1, arg7_1, arg9_1, buf0, buf1, buf3, buf4, buf6, buf7, buf9, buf10, 14336, 4096, grid=grid(14336), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg2_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg5_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg7_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg9_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf2 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_1.run(buf0, arg13_1, buf1, arg15_1, buf2, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf0 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf5 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_2.run(buf3, arg13_1, buf4, arg15_1, buf5, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf3 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf4 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_3.run(buf6, arg13_1, buf7, arg15_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf6 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf7 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf11 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_red_fused_mm_4.run(buf9, arg13_1, buf10, arg15_1, buf11, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf10 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf9 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf12 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [cur_x_7], Original ATen: [aten.index] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_index_5.run(arg11_1, arg1_1, buf12, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg11_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg1_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf13 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused__to_copy_6.run(arg12_1, buf13, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg12_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf14 = empty_strided_cuda((4, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] extern_kernels.mm(buf12, buf13, out=buf14) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf13 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf15 = empty_strided_cuda((4, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_22], Original ATen: [aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] extern_kernels.mm(buf12, reinterpret_tensor(arg16_1, (4096, 14336), (1, 4096), 411041792), out=buf15) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg16_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf16 = buf14; del buf14 # reuse | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_mul_silu_7.run(buf16, arg13_1, buf15, 57344, grid=grid(57344), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg13_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf15 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf17 = buf12; del buf12 # reuse | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7, cur_out_7], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] extern_kernels.mm(buf16, reinterpret_tensor(arg15_1, (14336, 4096), (1, 14336), 411041792), out=buf17) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg15_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf16 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf18 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf20 = buf18; del buf18 # reuse | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_cat_index_mul_view_8.run(buf20, buf2, buf5, buf8, buf11, buf17, arg18_1, arg17_1, 32768, grid=grid(32768), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg17_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg18_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf11 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf5 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf8 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] buf19 = buf17; del buf17 # reuse | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_9.run(buf19, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] aten.scatter_reduce_.two(buf19,0,reinterpret_tensor(arg19_1, (8, 4096), (1, 0), 0),buf20, reduce='sum', include_self=True) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del arg19_1 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] del buf20 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] return (buf19, ) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg0_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg2_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg3_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg5_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg6_1 = 2 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg7_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg8_1 = 3 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg9_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg10_1 = 4 | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg11_1 = rand_strided((4, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg12_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg13_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg14_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg15_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg16_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg17_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg18_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] arg19_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1]) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:30.147000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/1] [__output_code] | |
| V0401 02:34:30.148000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/1] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/pf/cpfssnwtpri3hothbev44nsvvpmfegzizjh2thez24irbonmltkz.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] Output code: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # AOT ID: ['7_inference'] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import torch | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import random | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import os | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import tempfile | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from math import inf, nan | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from cmath import nanj | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] grid, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] split_scan_grid, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] grid_combo_kernels, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] start_graph, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] end_graph, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/47/c47usox3ylfenecsd3rpl5hb7bu7kaksuvekxjnzdtgvc77etdkx.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # cur_x => index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %index : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg2_1, [%arg1_1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 8192}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x2 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/oa/coajgi5zdctbara3th3yr7xwipnynkbqk3jhttvbn3efmnkmfqyf.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_x_4], Original ATen: [aten.index] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # cur_x_4 => index_4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %index_4 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg2_1, [%arg9_1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_1 = async_compile.triton('triton_poi_fused_index_1', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_index_1(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x2 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xt/cxtfkt422w6p7oijexohdeo6ybueo56ffm6ki2mwmywpjyghzgiu.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # final_out => full_default | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # final_out_1 => scatter_add | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # weighted_ordered_outs => mul_77 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg21_1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_77 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_77), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_2 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_2', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_2(out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 16384 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = 0.0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp0, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ga/cgabqhaao3hxvgbvdsfjezkeg3qz23gm44ggolarwouffgf3nkvw.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear => convert_element_type | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused__to_copy_3 = async_compile.triton('triton_poi_fused__to_copy_3', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused__to_copy_3(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 58720256 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/n7/cn7or4u2r2ljcbjwejxdis6zjw7vfoadf2pemj3q4ntgk6cxrpwy.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.view, aten.silu] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear => mul_10, view_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # mul => mul_22 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # silu => convert_element_type_3, convert_element_type_4, mul_17, sigmoid | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %select_1), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %view_1 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%mul_10, [%arg0_1, 14336]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %convert_element_type_3 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_3,), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %sigmoid), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_17, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_22 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %mm_1), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_4 = async_compile.triton('triton_poi_fused_mul_silu_view_4', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_view_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_mul_silu_view_4(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x2 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/3m/c3m7zthtl55aow3uylw7vzdvgth7ef6kt5ltrw7mdrr2etnqxobu.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12], Original ATen: [aten._to_copy] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_12 => convert_element_type_42 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %convert_element_type_42 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_12, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused__to_copy_5 = async_compile.triton('triton_poi_fused__to_copy_5', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused__to_copy_5(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 58720256 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (234881024 + x0), None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/24/c24pf2rswgtq45keb26c7ezzzrfmmnckx2dvvhk6h4hnrp57vy5k.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12, silu_4, mul_4], Original ATen: [aten.mul, aten.view, aten.silu] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_12 => mul_48, view_9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # mul_4 => mul_60 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # silu_4 => convert_element_type_45, convert_element_type_46, mul_55, sigmoid_4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_48 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_6, %select_21), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %view_9 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%mul_48, [%sym_size_int_1, 14336]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %convert_element_type_45 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_9, torch.float32), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sigmoid_4 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_45,), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_55 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_45, %sigmoid_4), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %convert_element_type_46 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_55, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_60 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_46, %mm_7), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_6 = async_compile.triton('triton_poi_fused_mul_silu_view_6', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 65536}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_view_6', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_mul_silu_view_6(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x2 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tl.load(in_ptr0 + (57344 + x0), xmask, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/yo/cyozbv757sudj3555go56uooa4ozemydbyqiiau3krglmf7unlnn.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_6, linear_7, linear_9, linear_10, linear_18, linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_10 => mul_39, sum_5 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_18 => mul_68, sum_7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_19 => mul_71, sum_8 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_6 => mul_30, sum_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_7 => mul_33, sum_2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # linear_9 => mul_36, sum_4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_33, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_36, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_39, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_68 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_68, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_71 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_71, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_7 = async_compile.triton('triton_red_fused_mm_7', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 16384, 'r0_': 4096}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] reduction_hint=ReductionHint.DEFAULT, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 9, 'num_reduction': 6, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 14336 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_numel = 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rnumel = r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rbase = r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (0)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp20 = tl.load(in_ptr4 + (0)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp39 = tl.load(in_ptr5 + (0)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp40 = tl.broadcast_to(tmp39, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp50 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp56 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] roffset = r0_offset | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rindex = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_1 = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tl.load(in_ptr2 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp14 = tl.load(in_ptr3 + (117440512 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp27 = tl.load(in_ptr2 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp33 = tl.load(in_ptr3 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp46 = tl.load(in_ptr2 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp52 = tl.load(in_ptr3 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp1 + tmp2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tmp1 < 0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.where(tmp4, tmp3, tmp1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp13 = _tmp12 + tmp11 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp15 = tmp14.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp16 = tmp7 * tmp15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp19 = _tmp18 + tmp17 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp22 = tmp21 + tmp2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp23 = tmp21 < 0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp24 = tl.where(tmp23, tmp22, tmp21) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp26 = tmp25.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp28 = tmp27.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp29 = tmp26 * tmp28 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp32 = _tmp31 + tmp30 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp34 = tmp33.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp35 = tmp26 * tmp34 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp38 = _tmp37 + tmp36 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp41 = tmp40 + tmp2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp42 = tmp40 < 0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp43 = tl.where(tmp42, tmp41, tmp40) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp44 = tl.load(in_ptr1 + (r0_1 + 4096*tmp43), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp45 = tmp44.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp47 = tmp46.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp48 = tmp45 * tmp47 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp49 = tl.broadcast_to(tmp48, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp51 = _tmp50 + tmp49 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp50 = tl.where(r0_mask & xmask, tmp51, _tmp50) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp53 = tmp52.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp54 = tmp45 * tmp53 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp55 = tl.broadcast_to(tmp54, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp57 = _tmp56 + tmp55 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp56 = tl.where(r0_mask & xmask, tmp57, _tmp56) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp12 = tl.sum(_tmp12, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp18 = tl.sum(_tmp18, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp31 = tl.sum(_tmp31, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp37 = tl.sum(_tmp37, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp50 = tl.sum(_tmp50, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp56 = tl.sum(_tmp56, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp12, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr1 + (x0), tmp18, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr2 + (x0), tmp31, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr3 + (x0), tmp37, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr4 + (x0), tmp50, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr5 + (x0), tmp56, xmask) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/py/cpyklkc22mkxjlh53mqbh3dd2w37zcfownerro63nvx6ixbipxmt.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # cur_out_2 => mul_35, sum_3 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_35, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_8 = async_compile.triton('triton_red_fused_mm_8', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_8(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_numel = 14336 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rnumel = r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rbase = r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] roffset = r0_offset | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rindex = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_1 = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tl.load(in_ptr1 + (28672 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp12 = tl.load(in_ptr3 + (117440512 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/4j/c4jlady6omslbrb4jox42a2c5mvgk3ipozvtusxlzasl57vvucsk.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # cur_out_3 => mul_41, sum_6 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_41 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_41, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_9 = async_compile.triton('triton_red_fused_mm_9', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_numel = 14336 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rnumel = r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rbase = r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] roffset = r0_offset | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rindex = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_1 = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tl.load(in_ptr1 + (43008 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp12 = tl.load(in_ptr3 + (176160768 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/2j/c2jjhi4uu44klkhr5p2jsotr2ixgc5f7m654aac4jtevdhmevht5.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # cur_out_6 => mul_73, sum_9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_73 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_16, %unsqueeze_17), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %sum_9 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_73, [1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_10 = async_compile.triton('triton_red_fused_mm_10', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_10', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_red_fused_mm_10(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_numel = 14336 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rnumel = r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rbase = r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] roffset = r0_offset | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] rindex = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] r0_1 = r0_index | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tl.load(in_ptr1 + (86016 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp12 = tl.load(in_ptr3 + (352321536 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/vv/cvvjurpv46iafznus7yhpgxuegye2ukrl4r7m3nkcrzai6zc4wag.py | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # ordered_outs => cat | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # weighted_ordered_outs => mul_77 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Graph fragment: | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%mm_2, %convert_element_type_29, %convert_element_type_41, %mm_8, %convert_element_type_71],), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg21_1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # %mul_77 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_cat_index_mul_view_11 = async_compile.triton('triton_poi_fused_cat_index_mul_view_11', ''' | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] import triton.language as tl | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] filename=__file__, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*i64', 'in_ptr6': '*bf16', 'ks0': 'i32', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 9), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_11', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] @triton.jit | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def triton_poi_fused_cat_index_mul_view_11(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, ks0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xnumel = 32768 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] x2 = xindex | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp38 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp0 = x1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp1 = tl.full([1], 0, tl.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp2 = tmp0 >= tmp1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp3 = ks0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp4 = tmp0 < tmp3 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp5 = tl.load(in_ptr0 + (x0 + 4096*(x1)), tmp4, other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp6 = tmp0 >= tmp3 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp7 = 1 + ks0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp8 = tmp0 < tmp7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp9 = tmp6 & tmp8 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp10 = tl.load(in_ptr1 + (x0), tmp9, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp12 = tl.full(tmp11.shape, 0.0, tmp11.dtype) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp13 = tl.where(tmp9, tmp11, tmp12) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp14 = tmp0 >= tmp7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp15 = 2 + ks0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp16 = tmp0 < tmp15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp17 = tmp14 & tmp16 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp18 = tl.load(in_ptr2 + (x0), tmp17, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp19 = tmp18.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp20 = tl.full(tmp19.shape, 0.0, tmp19.dtype) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp21 = tl.where(tmp17, tmp19, tmp20) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp22 = tmp0 >= tmp15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp23 = tl.full([1], 7, tl.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp24 = tmp0 < tmp23 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp25 = tmp22 & tmp24 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp26 = tl.load(in_ptr3 + (x0 + 4096*((-2) + x1 + ((-1)*ks0))), tmp25, other=0.0).to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp27 = tmp0 >= tmp23 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp28 = tl.full([1], 8, tl.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp29 = tmp0 < tmp28 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp30 = tl.load(in_ptr4 + (x0), tmp27, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp31 = tmp30.to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp32 = tl.full(tmp31.shape, 0.0, tmp31.dtype) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp33 = tl.where(tmp27, tmp31, tmp32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp34 = tl.where(tmp25, tmp26, tmp33) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp35 = tl.where(tmp17, tmp21, tmp34) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp36 = tl.where(tmp9, tmp13, tmp35) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp37 = tl.where(tmp4, tmp5, tmp36) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp39 = tl.full([XBLOCK], 8, tl.int32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp40 = tmp38 + tmp39 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp41 = tmp38 < 0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp42 = tl.where(tmp41, tmp40, tmp38) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp43 = tl.load(in_ptr6 + (tmp42), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tmp44 = tmp37 * tmp43 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] tl.store(in_out_ptr0 + (x2), tmp44, None) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del async_compile | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def call(args): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1 = args | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] args.clear() | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] s0 = arg0_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] s1 = arg3_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] s2 = arg6_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] s4 = arg8_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] s5 = arg10_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] s6 = arg13_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg1_1, (s0, ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg2_1, (4, 4096), (4096, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg5_1, (1, ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg7_1, (1, ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg9_1, (5 + ((-1)*s0), ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg12_1, (1, ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg15_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg16_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg17_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg18_1, (8, 4096, 14336), (58720256, 14336, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg19_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg20_1, (4, 2), (2, 1)) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg21_1, (8, ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] assert_size_stride(arg22_1, (8, ), (1, )) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf0 = empty_strided_cuda((s0, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_x], Original ATen: [aten.index] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_0_xnumel = 4096*s0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_0.run(arg1_1, arg2_1, buf0, triton_poi_fused_index_0_xnumel, grid=grid(triton_poi_fused_index_0_xnumel), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg1_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf12 = empty_strided_cuda((5 + ((-1)*s0), 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_x_4], Original ATen: [aten.index] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_1_xnumel = 20480 + ((-4096)*s0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_1.run(arg9_1, arg2_1, buf12, triton_poi_fused_index_1_xnumel, grid=grid(triton_poi_fused_index_1_xnumel), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg9_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf22 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_2.run(buf22, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf3 = empty_strided_cuda((s0, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] extern_kernels.mm(buf0, reinterpret_tensor(arg19_1, (4096, 14336), (1, 4096), 0), out=buf3) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf1 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused__to_copy_3.run(arg15_1, buf1, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf2 = empty_strided_cuda((s0, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] extern_kernels.mm(buf0, buf1, out=buf2) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf4 = buf2; del buf2 # reuse | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.view, aten.silu] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_4_xnumel = 14336*s0 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_4.run(buf4, arg16_1, buf3, triton_poi_fused_mul_silu_view_4_xnumel, grid=grid(triton_poi_fused_mul_silu_view_4_xnumel), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf3 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf5 = buf0; del buf0 # reuse | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul, cur_out], Original ATen: [aten.mul, aten.view, aten.silu, aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] extern_kernels.mm(buf4, reinterpret_tensor(arg18_1, (14336, 4096), (1, 14336), 0), out=buf5) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf15 = empty_strided_cuda((5 + ((-1)*s0), 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_13], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] extern_kernels.mm(buf12, reinterpret_tensor(arg19_1, (4096, 14336), (1, 4096), 234881024), out=buf15) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf13 = buf1; del buf1 # reuse | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12], Original ATen: [aten._to_copy] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused__to_copy_5.run(arg15_1, buf13, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf14 = empty_strided_cuda((5 + ((-1)*s0), 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] extern_kernels.mm(buf12, buf13, out=buf14) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf13 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf16 = buf14; del buf14 # reuse | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12, silu_4, mul_4], Original ATen: [aten.mul, aten.view, aten.silu] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_6_xnumel = 71680 + ((-14336)*s0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_mul_silu_view_6.run(buf16, arg16_1, buf15, triton_poi_fused_mul_silu_view_6_xnumel, grid=grid(triton_poi_fused_mul_silu_view_6_xnumel), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf15 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf17 = buf12; del buf12 # reuse | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_12, silu_4, mul_4, cur_out_4], Original ATen: [aten.mul, aten.view, aten.silu, aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] extern_kernels.mm(buf16, reinterpret_tensor(arg18_1, (14336, 4096), (1, 14336), 234881024), out=buf17) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf16 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf9 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf10 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf18 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf19 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [linear_6, linear_7, linear_9, linear_10, linear_18, linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_7.run(arg5_1, arg2_1, arg15_1, arg19_1, arg7_1, arg12_1, buf6, buf7, buf9, buf10, buf18, buf19, 14336, 4096, grid=grid(14336), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg12_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg15_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg19_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg2_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg5_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg7_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_2], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_8.run(buf6, arg16_1, buf7, arg18_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf6 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf11 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_9.run(buf9, arg16_1, buf10, arg18_1, buf11, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf10 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf9 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf20 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_red_fused_mm_10.run(buf18, arg16_1, buf19, arg18_1, buf20, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg16_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg18_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf18 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf19 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf21 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] buf23 = buf21; del buf21 # reuse | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] triton_poi_fused_cat_index_mul_view_11.run(buf23, buf5, buf8, buf11, buf17, buf20, arg21_1, arg20_1, s0, 32768, grid=grid(32768), stream=stream0) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg20_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg21_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf11 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf17 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf20 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf5 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf8 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] aten.scatter_reduce_.two(buf22,0,reinterpret_tensor(arg22_1, (8, 4096), (1, 0), 0),buf23, reduce='sum', include_self=True) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del arg22_1 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] del buf23 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] return (buf22, ) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg0_1 = 2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg1_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg2_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg3_1 = 2 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg5_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg6_1 = 3 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg7_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg8_1 = 4 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg9_1 = rand_strided((3, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg10_1 = 7 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg11_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg12_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg13_1 = 8 | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg14_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg15_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg16_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg17_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg18_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg19_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg20_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg21_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] arg22_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1]) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:31.725000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/2] [__output_code] | |
| V0401 02:34:31.726000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/2] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/hs/chsirooun7kydovvjk4oinw35bsqbrcbopw4tqtfnvy6v3ejoj7c.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] Output code: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # AOT ID: ['8_inference'] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import torch | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import random | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import os | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import tempfile | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from math import inf, nan | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from cmath import nanj | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] grid, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] split_scan_grid, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] grid_combo_kernels, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] start_graph, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] end_graph, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/fs/cfshdmnxmjiadh6stjyoymktb3rxxlyv6fwu5jxfh3dk5sm6riaz.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_x_5], Original ATen: [aten.index] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # cur_x_5 => index_5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %index_5 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg8_1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 12288 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x2 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/pp/cppt43372rvepxmwsrrmtoqxmhn7h57p7lpbofdbz5qy6rfpsuqv.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # final_out => full_default | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # final_out_1 => scatter_add | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # weighted_ordered_outs => mul_39 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg19_1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_39), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_1 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_1(out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 16384 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = 0.0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp0, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ni/cnizkrlflmp6ljf6woclsajqj7kqw27zspbpp5ttw66c5ftx2avg.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15], Original ATen: [aten._to_copy] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_15 => convert_element_type_54 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %convert_element_type_54 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_15, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused__to_copy_2 = async_compile.triton('triton_poi_fused__to_copy_2', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused__to_copy_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 58720256 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (293601280 + x0), None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/44/c44yb5mjd3q5nyxs2qd2wdzheklkfzgjouugy7d5wzerdsbdvo75.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15, silu_5, mul_5], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_15 => mul_24 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # mul_5 => mul_26 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # silu_5 => convert_element_type_57, convert_element_type_58, mul_25, sigmoid_5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_6, %select_26), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %convert_element_type_57 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_24, torch.float32), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sigmoid_5 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_57,), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_25 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_57, %sigmoid_5), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %convert_element_type_58 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_25, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_58, %mm_7), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_mul_silu_3 = async_compile.triton('triton_poi_fused_mul_silu_3', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 65536}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_mul_silu_3(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 43008 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x2 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tl.load(in_ptr0 + (71680 + x0), xmask, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), xmask).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/oz/cozllcw53qa3x32uidkhkhpsdwytxdwkjt76sc6ebejd7rj2ey4k.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear, linear_1, linear_3, linear_4, linear_9, linear_10, linear_18, linear_19, linear_21, linear_22], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear => mul, sum_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_1 => mul_3, sum_2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_10 => mul_18, sum_8 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_18 => mul_27, sum_10 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_19 => mul_30, sum_11 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_21 => mul_33, sum_13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_22 => mul_36, sum_14 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_3 => mul_6, sum_4 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_4 => mul_9, sum_5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # linear_9 => mul_15, sum_7 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_9, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_18, %unsqueeze_19), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_10 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_27, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_20, %unsqueeze_21), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_11 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_24, %unsqueeze_25), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_13 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_33, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_26, %unsqueeze_27), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_14 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_36, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_4 = async_compile.triton('triton_red_fused_mm_4', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 16384, 'r0_': 4096}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reduction_hint=ReductionHint.DEFAULT, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'in_ptr6': '*i64', 'in_ptr7': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'out_ptr6': '*fp32', 'out_ptr7': '*fp32', 'out_ptr8': '*fp32', 'out_ptr9': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 15, 'num_reduction': 10, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_4(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, out_ptr6, out_ptr7, out_ptr8, out_ptr9, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 14336 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_numel = 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rnumel = r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rbase = r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (0)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp20 = tl.load(in_ptr4 + (0)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp39 = tl.load(in_ptr5 + (0)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp40 = tl.broadcast_to(tmp39, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp50 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp56 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp58 = tl.load(in_ptr6 + (0)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp59 = tl.broadcast_to(tmp58, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp69 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp75 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp77 = tl.load(in_ptr7 + (0)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp78 = tl.broadcast_to(tmp77, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp88 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp94 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] roffset = r0_offset | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rindex = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_1 = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tl.load(in_ptr3 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp27 = tl.load(in_ptr2 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp33 = tl.load(in_ptr3 + (58720256 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp46 = tl.load(in_ptr2 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp52 = tl.load(in_ptr3 + (176160768 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp65 = tl.load(in_ptr2 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp71 = tl.load(in_ptr3 + (352321536 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp84 = tl.load(in_ptr2 + (411041792 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp90 = tl.load(in_ptr3 + (411041792 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp1 + tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp1 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.where(tmp4, tmp3, tmp1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = _tmp12 + tmp11 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tmp14.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tmp7 * tmp15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp19 = _tmp18 + tmp17 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp22 = tmp21 + tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp23 = tmp21 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp24 = tl.where(tmp23, tmp22, tmp21) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp26 = tmp25.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp28 = tmp27.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp29 = tmp26 * tmp28 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp32 = _tmp31 + tmp30 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp34 = tmp33.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp35 = tmp26 * tmp34 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp38 = _tmp37 + tmp36 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp41 = tmp40 + tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp42 = tmp40 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp43 = tl.where(tmp42, tmp41, tmp40) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp44 = tl.load(in_ptr1 + (r0_1 + 4096*tmp43), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp45 = tmp44.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp47 = tmp46.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp48 = tmp45 * tmp47 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp49 = tl.broadcast_to(tmp48, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp51 = _tmp50 + tmp49 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp50 = tl.where(r0_mask & xmask, tmp51, _tmp50) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp53 = tmp52.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp54 = tmp45 * tmp53 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp55 = tl.broadcast_to(tmp54, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp57 = _tmp56 + tmp55 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp56 = tl.where(r0_mask & xmask, tmp57, _tmp56) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp60 = tmp59 + tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp61 = tmp59 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp62 = tl.where(tmp61, tmp60, tmp59) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp63 = tl.load(in_ptr1 + (r0_1 + 4096*tmp62), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp64 = tmp63.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp66 = tmp65.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp67 = tmp64 * tmp66 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp68 = tl.broadcast_to(tmp67, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp70 = _tmp69 + tmp68 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp69 = tl.where(r0_mask & xmask, tmp70, _tmp69) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp72 = tmp71.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp73 = tmp64 * tmp72 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp74 = tl.broadcast_to(tmp73, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp76 = _tmp75 + tmp74 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp75 = tl.where(r0_mask & xmask, tmp76, _tmp75) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp79 = tmp78 + tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp80 = tmp78 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp81 = tl.where(tmp80, tmp79, tmp78) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp82 = tl.load(in_ptr1 + (r0_1 + 4096*tmp81), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp83 = tmp82.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp85 = tmp84.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp86 = tmp83 * tmp85 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp87 = tl.broadcast_to(tmp86, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp89 = _tmp88 + tmp87 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp88 = tl.where(r0_mask & xmask, tmp89, _tmp88) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp91 = tmp90.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp92 = tmp83 * tmp91 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp93 = tl.broadcast_to(tmp92, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp95 = _tmp94 + tmp93 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp94 = tl.where(r0_mask & xmask, tmp95, _tmp94) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tl.sum(_tmp12, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp18 = tl.sum(_tmp18, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp31 = tl.sum(_tmp31, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp37 = tl.sum(_tmp37, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp50 = tl.sum(_tmp50, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp56 = tl.sum(_tmp56, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp69 = tl.sum(_tmp69, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp75 = tl.sum(_tmp75, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp88 = tl.sum(_tmp88, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp94 = tl.sum(_tmp94, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp12, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr1 + (x0), tmp18, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr2 + (x0), tmp31, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr3 + (x0), tmp37, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr4 + (x0), tmp50, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr5 + (x0), tmp56, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr6 + (x0), tmp69, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr7 + (x0), tmp75, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr8 + (x0), tmp88, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr9 + (x0), tmp94, xmask) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xx/cxx6xntbnboc2g6tdf7vyxn4y73k7jimowg7zyfn5bowklvzwsdg.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # cur_out => mul_5, sum_3 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_5, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_5 = async_compile.triton('triton_red_fused_mm_5', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_numel = 14336 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rnumel = r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rbase = r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] roffset = r0_offset | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rindex = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_1 = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tl.load(in_ptr3 + (r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ce/cceviquawbkgm2efk7w2ykpozllrbcae74m5k44pyzscgiiogapd.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # cur_out_1 => mul_11, sum_6 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_11, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_6 = async_compile.triton('triton_red_fused_mm_6', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_numel = 14336 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rnumel = r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rbase = r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] roffset = r0_offset | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rindex = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_1 = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tl.load(in_ptr1 + (14336 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tl.load(in_ptr3 + (58720256 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/if/ciffkzjjwbz6mpsov6naxwgkjnoztxaxx5mlr5kykxugcuwjhqlr.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # cur_out_3 => mul_20, sum_9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_20 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_16, %unsqueeze_17), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_9 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_20, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_7 = async_compile.triton('triton_red_fused_mm_7', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_numel = 14336 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rnumel = r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rbase = r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] roffset = r0_offset | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rindex = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_1 = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tl.load(in_ptr1 + (43008 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tl.load(in_ptr3 + (176160768 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/6g/c6gqgrsqygxdbqf2zsinu6b25wiico7gtm6kqyie56rcbdbsj6yw.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # cur_out_6 => mul_32, sum_12 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_32 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_22, %unsqueeze_23), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_12 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_32, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_8 = async_compile.triton('triton_red_fused_mm_8', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_8(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_numel = 14336 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rnumel = r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rbase = r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] roffset = r0_offset | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rindex = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_1 = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tl.load(in_ptr1 + (86016 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tl.load(in_ptr3 + (352321536 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xo/cxovjih66bogmdzezh2m2vplwymlvkd4lxpgf4x5lf3unr3wvgxz.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_7], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # cur_out_7 => mul_38, sum_15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_28, %unsqueeze_29), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %sum_15 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_38, [1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_9 = async_compile.triton('triton_red_fused_mm_9', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_9', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_red_fused_mm_9(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_numel = 14336 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rnumel = r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rbase = r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] roffset = r0_offset | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] rindex = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] r0_1 = r0_index | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tl.load(in_ptr1 + (100352 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tl.load(in_ptr3 + (411041792 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/p2/cp2s4xmrbzkqwz3syvvx3kjmd4wcxohqk2vqrchxtwqm2tv2vavp.py | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # ordered_outs => cat | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # weighted_ordered_outs => mul_39 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Graph fragment: | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_11, %convert_element_type_23, %convert_element_type_44, %mm_8, %convert_element_type_74, %convert_element_type_86],), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg19_1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_cat_index_mul_view_10 = async_compile.triton('triton_poi_fused_cat_index_mul_view_10', ''' | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] import triton.language as tl | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] filename=__file__, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*i64', 'in_ptr7': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] @triton.jit | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def triton_poi_fused_cat_index_mul_view_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xnumel = 32768 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] x2 = xindex | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp50 = tl.load(in_ptr6 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp0 = x1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp1 = tl.full([1], 0, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp2 = tmp0 >= tmp1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp3 = tl.full([1], 1, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp4 = tmp0 < tmp3 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp5 = tl.load(in_ptr0 + (x0), tmp4, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp7 = tl.full(tmp6.shape, 0.0, tmp6.dtype) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp8 = tl.where(tmp4, tmp6, tmp7) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp9 = tmp0 >= tmp3 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp10 = tl.full([1], 2, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp11 = tmp0 < tmp10 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp12 = tmp9 & tmp11 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp13 = tl.load(in_ptr1 + (x0), tmp12, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp14 = tmp13.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp15 = tl.full(tmp14.shape, 0.0, tmp14.dtype) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp16 = tl.where(tmp12, tmp14, tmp15) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp17 = tmp0 >= tmp10 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp18 = tl.full([1], 3, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp19 = tmp0 < tmp18 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp20 = tmp17 & tmp19 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp21 = tl.load(in_ptr2 + (x0), tmp20, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp22 = tmp21.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp23 = tl.full(tmp22.shape, 0.0, tmp22.dtype) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp24 = tl.where(tmp20, tmp22, tmp23) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp25 = tmp0 >= tmp18 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp26 = tl.full([1], 6, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp27 = tmp0 < tmp26 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp28 = tmp25 & tmp27 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp29 = tl.load(in_ptr3 + (x0 + 4096*((-3) + x1)), tmp28, other=0.0).to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp30 = tmp0 >= tmp26 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp31 = tl.full([1], 7, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp32 = tmp0 < tmp31 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp33 = tmp30 & tmp32 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp34 = tl.load(in_ptr4 + (x0), tmp33, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp35 = tmp34.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp37 = tl.where(tmp33, tmp35, tmp36) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp38 = tmp0 >= tmp31 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp39 = tl.full([1], 8, tl.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp40 = tmp0 < tmp39 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp41 = tl.load(in_ptr5 + (x0), tmp38, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp42 = tmp41.to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp43 = tl.full(tmp42.shape, 0.0, tmp42.dtype) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp44 = tl.where(tmp38, tmp42, tmp43) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp45 = tl.where(tmp33, tmp37, tmp44) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp46 = tl.where(tmp28, tmp29, tmp45) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp47 = tl.where(tmp20, tmp24, tmp46) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp48 = tl.where(tmp12, tmp16, tmp47) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp49 = tl.where(tmp4, tmp8, tmp48) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp51 = tl.full([XBLOCK], 8, tl.int32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp52 = tmp50 + tmp51 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp53 = tmp50 < 0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp54 = tl.where(tmp53, tmp52, tmp50) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp55 = tl.load(in_ptr7 + (tmp54), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tmp56 = tmp49 * tmp55 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] tl.store(in_out_ptr0 + (x2), tmp56, None) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del async_compile | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def call(args): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1 = args | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] args.clear() | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] s0 = arg3_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] s1 = arg6_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] s3 = arg9_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] s4 = arg11_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg0_1, (1, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg1_1, (4, 4096), (4096, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg2_1, (1, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg5_1, (1, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg8_1, (3, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg10_1, (1, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg12_1, (1, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg13_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg14_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg15_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg16_1, (8, 4096, 14336), (58720256, 14336, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg17_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg18_1, (4, 2), (2, 1)) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg19_1, (8, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] assert_size_stride(arg20_1, (8, ), (1, )) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf9 = empty_strided_cuda((3, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_x_5], Original ATen: [aten.index] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_index_0.run(arg8_1, arg1_1, buf9, 12288, grid=grid(12288), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg8_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf22 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_1.run(buf22, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf12 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_16], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] extern_kernels.mm(buf9, reinterpret_tensor(arg17_1, (4096, 14336), (1, 4096), 293601280), out=buf12) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf10 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15], Original ATen: [aten._to_copy] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused__to_copy_2.run(arg13_1, buf10, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf11 = empty_strided_cuda((3, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] extern_kernels.mm(buf9, buf10, out=buf11) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf10 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf13 = buf11; del buf11 # reuse | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15, silu_5, mul_5], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_mul_silu_3.run(buf13, arg14_1, buf12, 43008, grid=grid(43008), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf12 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf14 = buf9; del buf9 # reuse | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear_15, silu_5, mul_5, cur_out_5], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] extern_kernels.mm(buf13, reinterpret_tensor(arg16_1, (14336, 4096), (1, 14336), 293601280), out=buf14) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf13 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf0 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf1 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf3 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf4 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf6 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf7 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf15 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf16 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf18 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf19 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [linear, linear_1, linear_3, linear_4, linear_9, linear_10, linear_18, linear_19, linear_21, linear_22], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_4.run(arg0_1, arg1_1, arg13_1, arg17_1, arg2_1, arg5_1, arg10_1, arg12_1, buf0, buf1, buf3, buf4, buf6, buf7, buf15, buf16, buf18, buf19, 14336, 4096, grid=grid(14336), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg0_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg10_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg12_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg13_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg17_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg1_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg2_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg5_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf2 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_5.run(buf0, arg14_1, buf1, arg16_1, buf2, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf0 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf5 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_1], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_6.run(buf3, arg14_1, buf4, arg16_1, buf5, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf3 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf4 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf8 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_3], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_7.run(buf6, arg14_1, buf7, arg16_1, buf8, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf6 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf7 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf17 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_6], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_8.run(buf15, arg14_1, buf16, arg16_1, buf17, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf15 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf16 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf20 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [cur_out_7], Original ATen: [aten.mm] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_red_fused_mm_9.run(buf18, arg14_1, buf19, arg16_1, buf20, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg14_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg16_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf18 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf19 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf21 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] buf23 = buf21; del buf21 # reuse | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] triton_poi_fused_cat_index_mul_view_10.run(buf23, buf2, buf5, buf8, buf14, buf17, buf20, arg19_1, arg18_1, 32768, grid=grid(32768), stream=stream0) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg18_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg19_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf14 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf17 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf20 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf5 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf8 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] aten.scatter_reduce_.two(buf22,0,reinterpret_tensor(arg20_1, (8, 4096), (1, 0), 0),buf23, reduce='sum', include_self=True) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del arg20_1 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] del buf23 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] return (buf22, ) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg0_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg2_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg3_1 = 2 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg5_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg6_1 = 3 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg7_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg8_1 = rand_strided((3, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg9_1 = 6 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg10_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg11_1 = 7 | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg12_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg13_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg14_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg15_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg16_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg17_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg18_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg19_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] arg20_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1]) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:33.848000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/3] [__output_code] | |
| V0401 02:34:33.849000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/3] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/nr/cnr4atw5ay43j2qpni2f22cvzrd6lanjrovddcclybuolyunkawq.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] Output code: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # AOT ID: ['9_inference'] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import torch | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import random | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import os | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import tempfile | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from math import inf, nan | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from cmath import nanj | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] grid, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] split_scan_grid, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] grid_combo_kernels, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] start_graph, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] end_graph, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/pn/cpnadhcpx7pdda4wy2zq5f4yksvs57uy47uewrx3fzhgcm4mahjg.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_x, cur_x_3, cur_x_6], Original ATen: [aten.index] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # cur_x => index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # cur_x_3 => index_3 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # cur_x_6 => index_6 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %index : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg0_1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %index_3 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg5_1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %index_6 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg11_1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 8192}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i64', 'in_ptr3': '*i64', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 8192 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x2 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp11 = tl.load(in_ptr3 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tmp6 + tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tmp6 < 0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp6) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp10 = tl.load(in_ptr1 + (x0 + 4096*tmp9), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp12 = tmp11 + tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp13 = tmp11 < 0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp14 = tl.where(tmp13, tmp12, tmp11) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp15 = tl.load(in_ptr1 + (x0 + 4096*tmp14), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr1 + (x2), tmp10, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr2 + (x2), tmp15, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/yj/cyj7p7xqlcmuj23z3eb2yyzznvkflquhslgxtbejde24is7cpycm.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear => convert_element_type | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_1 = async_compile.triton('triton_poi_fused__to_copy_1', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused__to_copy_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 58720256 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (x0), None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nw/cnw5bk2sece6miamzj2agxakkl5ligtllxfirol7rltvneibcdz4.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear => mul | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # mul => mul_2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # silu => convert_element_type_3, convert_element_type_4, mul_1, sigmoid | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm, %select_1), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_3 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.float32), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_3,), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %sigmoid), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_4 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %mm_1), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_2 = async_compile.triton('triton_poi_fused_mul_silu_2', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_mul_silu_2(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 28672 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x2 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/mt/cmt5cljv4hutbqlrk2uagwbmi44fwurqzl2lfowdyjpc5htma74j.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_9 => convert_element_type_27 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_27 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_9, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_3 = async_compile.triton('triton_poi_fused__to_copy_3', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused__to_copy_3(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 58720256 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (176160768 + x0), None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/nt/cntmomp6ushnh66gq7yjlozybhgulzvmqqe3dprdzlhbsgt3obhr.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_9 => mul_9 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # mul_3 => mul_11 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # silu_3 => convert_element_type_30, convert_element_type_31, mul_10, sigmoid_3 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_9, %select_16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_30 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_9, torch.float32), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sigmoid_3 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_30,), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_30, %sigmoid_3), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_31 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_10, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_31, %mm_10), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_4 = async_compile.triton('triton_poi_fused_mul_silu_4', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_mul_silu_4(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 28672 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x2 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tl.load(in_ptr0 + (43008 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/gt/cgtnl56winvb2ne3rhqswbwokqaqdyux7zrtfohk57cyufcqfack.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_12, linear_13, linear_15, linear_16], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_12 => mul_12, sum_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_13 => mul_15, sum_2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_15 => mul_18, sum_4 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_16 => mul_21, sum_5 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_12, [1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_21, [1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_5 = async_compile.triton('triton_red_fused_mm_5', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 16384, 'r0_': 4096}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] reduction_hint=ReductionHint.DEFAULT, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 2, 3, 5, 6, 7, 8, 9, 10), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_5', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 4, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_red_fused_mm_5(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 14336 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_numel = 4096 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rnumel = r0_numel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rbase = r0_base | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (0)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp12 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp18 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp20 = tl.load(in_ptr4 + (0)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp21 = tl.broadcast_to(tmp20, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp31 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp37 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] roffset = r0_offset | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rindex = r0_index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_1 = r0_index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tl.load(in_ptr2 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp14 = tl.load(in_ptr3 + (234881024 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp27 = tl.load(in_ptr2 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp33 = tl.load(in_ptr3 + (293601280 + r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tl.full([XBLOCK, R0_BLOCK], 4, tl.int32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp1 + tmp2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tmp1 < 0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tl.where(tmp4, tmp3, tmp1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tl.load(in_ptr1 + (r0_1 + 4096*tmp5), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp11 = tl.broadcast_to(tmp10, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp13 = _tmp12 + tmp11 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp12 = tl.where(r0_mask & xmask, tmp13, _tmp12) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp15 = tmp14.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp16 = tmp7 * tmp15 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp17 = tl.broadcast_to(tmp16, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp19 = _tmp18 + tmp17 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp18 = tl.where(r0_mask & xmask, tmp19, _tmp18) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp22 = tmp21 + tmp2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp23 = tmp21 < 0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp24 = tl.where(tmp23, tmp22, tmp21) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp25 = tl.load(in_ptr1 + (r0_1 + 4096*tmp24), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp26 = tmp25.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp28 = tmp27.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp29 = tmp26 * tmp28 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp30 = tl.broadcast_to(tmp29, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp32 = _tmp31 + tmp30 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp31 = tl.where(r0_mask & xmask, tmp32, _tmp31) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp34 = tmp33.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp35 = tmp26 * tmp34 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp36 = tl.broadcast_to(tmp35, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp38 = _tmp37 + tmp36 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp37 = tl.where(r0_mask & xmask, tmp38, _tmp37) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp12 = tl.sum(_tmp12, 1)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp18 = tl.sum(_tmp18, 1)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp31 = tl.sum(_tmp31, 1)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp37 = tl.sum(_tmp37, 1)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp12, xmask) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr1 + (x0), tmp18, xmask) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr2 + (x0), tmp31, xmask) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr3 + (x0), tmp37, xmask) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/vu/cvu76bra7dtrosp7wkj6uqmxw5ruguxxzxrxgyprzkq7swh77jxq.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # cur_out_4 => mul_17, sum_3 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_4, %unsqueeze_5), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_17, [1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_6 = async_compile.triton('triton_red_fused_mm_6', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_red_fused_mm_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 4096 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_numel = 14336 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rnumel = r0_numel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rbase = r0_base | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] roffset = r0_offset | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rindex = r0_index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_1 = r0_index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tl.load(in_ptr1 + (57344 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp12 = tl.load(in_ptr3 + (234881024 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/kz/ckzpveqxn3ptfdcdbnp5jn46pozxmhrb2eqsz4bslvr2muyyuc4q.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # cur_out_5 => mul_23, sum_6 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_10, %unsqueeze_11), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sum_6 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_23, [1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_7 = async_compile.triton('triton_red_fused_mm_7', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 4096, 'r0_': 16384}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] reduction_hint=ReductionHint.INNER, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_red_fused_mm_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 4096 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_numel = 14336 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rnumel = r0_numel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rbase = r0_base | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp16 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] for r0_offset in range(0, r0_numel, R0_BLOCK): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_index = r0_offset + r0_base | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_mask = r0_index < r0_numel | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] roffset = r0_offset | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] rindex = r0_index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] r0_1 = r0_index | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tl.load(in_ptr1 + (71680 + r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp12 = tl.load(in_ptr3 + (293601280 + r0_1 + 14336*x0), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp1 * tmp2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tmp3.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tl.sigmoid(tmp4) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tmp4 * tmp5 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tmp6.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp9 = tmp8.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp10 = tmp7 * tmp9 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp11 = tmp10.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp13 = tmp12.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp14 = tmp11 * tmp13 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp15 = tl.broadcast_to(tmp14, [XBLOCK, R0_BLOCK]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp17 = _tmp16 + tmp15 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] _tmp16 = tl.where(r0_mask, tmp17, _tmp16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp16 = tl.sum(_tmp16, 1)[:, None] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp16, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/bn/cbntbuf2uowesjcvj7pijik2jtcococuwohpnparrrikin5bb3is.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_18 => convert_element_type_60 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_60 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_18, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_8 = async_compile.triton('triton_poi_fused__to_copy_8', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_8', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused__to_copy_8(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 58720256 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_ptr0 + (352321536 + x0), None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/xk/cxkxa4iqtzpndenpxumel4jvuld773zjmzkir4txf2hm5n7vrya4.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # linear_18 => mul_24 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # mul_6 => mul_26 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # silu_6 => convert_element_type_63, convert_element_type_64, mul_25, sigmoid_6 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_24 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_12, %select_31), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_63 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_24, torch.float32), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %sigmoid_6 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_63,), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_25 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_63, %sigmoid_6), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %convert_element_type_64 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_25, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_64, %mm_13), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_9 = async_compile.triton('triton_poi_fused_mul_silu_9', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_mul_silu_9(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 28672 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x2 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tl.load(in_ptr0 + (86016 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/cp/ccp3stqbhb5f53fzompeeislgwlwcrpug5xm766de66u4ehpn2ml.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # ordered_outs => cat | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # weighted_ordered_outs => mul_30 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%mm_2, %mm_11, %convert_element_type_47, %convert_element_type_59, %mm_14],), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg20_1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_cat_index_mul_view_10 = async_compile.triton('triton_poi_fused_cat_index_mul_view_10', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'in_ptr5': '*i64', 'in_ptr6': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_index_mul_view_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_cat_index_mul_view_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 32768 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x2 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp35 = tl.load(in_ptr5 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = x1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp1 = tl.full([1], 0, tl.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp2 = tmp0 >= tmp1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp3 = tl.full([1], 2, tl.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp4 = tmp0 < tmp3 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp5 = tl.load(in_ptr0 + (x0 + 4096*(x1)), tmp4, other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp6 = tmp0 >= tmp3 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp7 = tl.full([1], 4, tl.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp8 = tmp0 < tmp7 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp9 = tmp6 & tmp8 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp10 = tl.load(in_ptr1 + (x0 + 4096*((-2) + x1)), tmp9, other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp11 = tmp0 >= tmp7 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp12 = tl.full([1], 5, tl.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp13 = tmp0 < tmp12 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp14 = tmp11 & tmp13 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp15 = tl.load(in_ptr2 + (x0), tmp14, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp16 = tmp15.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp17 = tl.full(tmp16.shape, 0.0, tmp16.dtype) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp18 = tl.where(tmp14, tmp16, tmp17) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp19 = tmp0 >= tmp12 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp20 = tl.full([1], 6, tl.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp21 = tmp0 < tmp20 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp22 = tmp19 & tmp21 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp23 = tl.load(in_ptr3 + (x0), tmp22, eviction_policy='evict_last', other=0.0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp24 = tmp23.to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp25 = tl.full(tmp24.shape, 0.0, tmp24.dtype) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp26 = tl.where(tmp22, tmp24, tmp25) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp27 = tmp0 >= tmp20 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp28 = tl.full([1], 8, tl.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp29 = tmp0 < tmp28 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp30 = tl.load(in_ptr4 + (x0 + 4096*((-6) + x1)), tmp27, other=0.0).to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp31 = tl.where(tmp22, tmp26, tmp30) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp32 = tl.where(tmp14, tmp18, tmp31) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp33 = tl.where(tmp9, tmp10, tmp32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp34 = tl.where(tmp4, tmp5, tmp33) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp36 = tl.full([XBLOCK], 8, tl.int32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp37 = tmp35 + tmp36 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp38 = tmp35 < 0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp39 = tl.where(tmp38, tmp37, tmp35) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp40 = tl.load(in_ptr6 + (tmp39), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp41 = tmp34 * tmp40 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(in_out_ptr0 + (x2), tmp41, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/4l/c4lbtol734c43z6bxtljtt4jjq4qsrbkiyp5o525eddf3iovsttp.py | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # final_out => full_default | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # final_out_1 => scatter_add | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # weighted_ordered_outs => mul_30 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Graph fragment: | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg20_1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_30), kwargs = {}) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_11 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_11', ''' | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] import triton.language as tl | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] filename=__file__, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_11', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] @triton.jit | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_11(out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xnumel = 16384 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] x0 = xindex | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tmp0 = 0.0 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] tl.store(out_ptr0 + (x0), tmp0, None) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] async_compile.wait(globals()) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del async_compile | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def call(args): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1 = args | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] args.clear() | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] s1 = arg2_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] s3 = arg6_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] s4 = arg8_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] s6 = arg10_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] s7 = arg12_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg0_1, (2, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg1_1, (4, 4096), (4096, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg5_1, (2, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg7_1, (1, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg9_1, (1, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg11_1, (2, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg14_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg15_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg16_1, (8, 14336), (14336, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg17_1, (8, 4096, 14336), (58720256, 14336, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg18_1, (8, 14336, 4096), (58720256, 4096, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg19_1, (4, 2), (2, 1)) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg20_1, (8, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] assert_size_stride(arg21_1, (8, ), (1, )) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] with torch.cuda._DeviceGuard(0): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] torch.cuda.set_device(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf0 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf6 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf18 = empty_strided_cuda((2, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_x, cur_x_3, cur_x_6], Original ATen: [aten.index] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_index_0.run(arg0_1, arg1_1, arg5_1, arg11_1, buf0, buf6, buf18, 8192, grid=grid(8192), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg0_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg11_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg5_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf1 = empty_strided_cuda((4096, 14336), (1, 4096), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_1.run(arg14_1, buf1, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf2 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf0, buf1, out=buf2) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf3 = empty_strided_cuda((2, 14336), (14336, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf0, reinterpret_tensor(arg18_1, (4096, 14336), (1, 4096), 0), out=buf3) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf4 = buf2; del buf2 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_2.run(buf4, arg15_1, buf3, 28672, grid=grid(28672), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf5 = buf0; del buf0 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear, silu, mul, cur_out], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf4, reinterpret_tensor(arg17_1, (14336, 4096), (1, 14336), 0), out=buf5) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf7 = buf1; del buf1 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_3.run(arg14_1, buf7, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf8 = buf4; del buf4 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf6, buf7, out=buf8) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf9 = buf3; del buf3 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_10], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf6, reinterpret_tensor(arg18_1, (4096, 14336), (1, 4096), 176160768), out=buf9) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf10 = buf8; del buf8 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_4.run(buf10, arg15_1, buf9, 28672, grid=grid(28672), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf11 = buf6; del buf6 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_9, silu_3, mul_3, cur_out_3], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf10, reinterpret_tensor(arg17_1, (14336, 4096), (1, 14336), 176160768), out=buf11) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf12 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf13 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf15 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf16 = empty_strided_cuda((1, 14336), (14336, 1), torch.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_12, linear_13, linear_15, linear_16], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_5.run(arg7_1, arg1_1, arg14_1, arg18_1, arg9_1, buf12, buf13, buf15, buf16, 14336, 4096, grid=grid(14336), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg1_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg7_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg9_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf14 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_out_4], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_6.run(buf12, arg15_1, buf13, arg17_1, buf14, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf12 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf13 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf17 = empty_strided_cuda((1, 4096), (4096, 1), torch.float32) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [cur_out_5], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_red_fused_mm_7.run(buf15, arg15_1, buf16, arg17_1, buf17, 4096, 14336, grid=grid(4096), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf15 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf16 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf19 = buf7; del buf7 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused__to_copy_8.run(arg14_1, buf19, 58720256, grid=grid(58720256), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg14_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf20 = buf10; del buf10 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18], Original ATen: [aten._to_copy, aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf18, buf19, out=buf20) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf19 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf21 = buf9; del buf9 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf18, reinterpret_tensor(arg18_1, (4096, 14336), (1, 4096), 352321536), out=buf21) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg18_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf22 = buf20; del buf20 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_mul_silu_9.run(buf22, arg15_1, buf21, 28672, grid=grid(28672), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg15_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf21 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf23 = buf18; del buf18 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [linear_18, silu_6, mul_6, cur_out_6], Original ATen: [aten.mul, aten.silu, aten.mm] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] extern_kernels.mm(buf22, reinterpret_tensor(arg17_1, (14336, 4096), (1, 14336), 352321536), out=buf23) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg17_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf22 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf24 = empty_strided_cuda((8, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf26 = buf24; del buf24 # reuse | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [ordered_outs, getitem_32, ordered_token_activation_weights, weighted_ordered_outs], Original ATen: [aten.cat, aten.index, aten.view, aten.mul] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_cat_index_mul_view_10.run(buf26, buf5, buf11, buf14, buf17, buf23, arg20_1, arg19_1, 32768, grid=grid(32768), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg19_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg20_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf11 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf14 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf17 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf23 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf5 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] buf25 = empty_strided_cuda((4, 4096), (4096, 1), torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] stream0 = get_raw_stream(0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_11.run(buf25, 16384, grid=grid(16384), stream=stream0) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] aten.scatter_reduce_.two(buf25,0,reinterpret_tensor(arg21_1, (8, 4096), (1, 0), 0),buf26, reduce='sum', include_self=True) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del arg21_1 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] del buf26 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] return (buf25, ) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] def benchmark_compiled_module(times=10, repeat=10): | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._dynamo.testing import rand_strided | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.utils import print_performance | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg0_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg1_1 = rand_strided((4, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg2_1 = 2 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg3_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg4_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg5_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg6_1 = 4 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg7_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg8_1 = 5 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg9_1 = rand_strided((1, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg10_1 = 6 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg11_1 = rand_strided((2, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg12_1 = 8 | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg13_1 = rand_strided((0, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg14_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.int8) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg15_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg16_1 = rand_strided((8, 14336), (14336, 1), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg17_1 = rand_strided((8, 4096, 14336), (58720256, 14336, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg18_1 = rand_strided((8, 14336, 4096), (58720256, 4096, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg19_1 = rand_strided((4, 2), (2, 1), device='cuda:0', dtype=torch.bfloat16) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg20_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] arg21_1 = rand_strided((8, ), (1, ), device='cuda:0', dtype=torch.int64) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1]) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] return print_performance(fn, times=times, repeat=repeat) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] if __name__ == "__main__": | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] compiled_module_main('None', benchmark_compiled_module) | |
| V0401 02:34:34.274000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/4] [__output_code] | |
| V0401 02:34:34.275000 3240940 site-packages/torch/_inductor/codecache.py:1092] [7/4] [__output_code] Output code written to: /tmp/torchinductor_cdhernandez/vc/cvclk5nwwmijilkr6g36t3fpsaupngalce2feusbejlngivi4n6c.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] Output code: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # AOT ID: ['10_inference'] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from ctypes import c_void_p, c_long, c_int | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import torch | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import random | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import os | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import tempfile | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from math import inf, nan | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from cmath import nanj | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.hooks import run_intermediate_hooks | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.utils import maybe_profile | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.codegen.memory_planning import _align as align | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch import device, empty_strided | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.async_compile import AsyncCompile | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.select_algorithm import extern_kernels | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_heuristics import ( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] grid, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] split_scan_grid, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] grid_combo_kernels, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] start_graph, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] end_graph, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] cooperative_reduction_grid, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._C import _cuda_getCurrentRawStream as get_raw_stream | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] aten = torch.ops.aten | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_ops = torch.ops.inductor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] _quantized = torch.ops._quantized | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] async_compile = AsyncCompile() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/wk/cwkeabswtnly4envvmyy47m2dnvt6tcq2mqvl23aerqffjvcpm5s.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [cur_x_2, cur_x_7], Original ATen: [aten.index] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # cur_x_2 => index_2 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # cur_x_7 => index_7 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %index_2 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg3_1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %index_7 : [num_users=2] = call_function[target=torch.ops.aten.index.Tensor](args = (%arg1_1, [%arg12_1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_index_0 = async_compile.triton('triton_poi_fused_index_0', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 8192}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i64', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (1, 3, 4, 5), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_index_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 8192 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x1 = xindex // 4096 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x0 = (xindex % 4096) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x2 = xindex | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp0 = tl.load(in_ptr0 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp6 = tl.load(in_ptr2 + (x1), None, eviction_policy='evict_last') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp1 = tl.full([XBLOCK], 4, tl.int32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp2 = tmp0 + tmp1 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp3 = tmp0 < 0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp4 = tl.where(tmp3, tmp2, tmp0) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp5 = tl.load(in_ptr1 + (x0 + 4096*tmp4), None).to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp7 = tmp6 + tmp1 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp8 = tmp6 < 0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp9 = tl.where(tmp8, tmp7, tmp6) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp10 = tl.load(in_ptr1 + (x0 + 4096*tmp9), None).to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(out_ptr0 + (x2), tmp5, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(out_ptr1 + (x2), tmp10, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/pp/cppt43372rvepxmwsrrmtoqxmhn7h57p7lpbofdbz5qy6rfpsuqv.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [final_out, getitem_32, ordered_token_activation_weights, weighted_ordered_outs, final_out_1], Original ATen: [aten.zeros_like, aten.index, aten.view, aten.mul, aten.scatter_add] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # final_out => full_default | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # final_out_1 => scatter_add | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # getitem_32 => index_8 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # ordered_token_activation_weights => view_17 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # weighted_ordered_outs => mul_36 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([4, 4096], 0), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %index_8 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_16, [%arg19_1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_8, [-1, 1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_36 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %view_17), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %scatter_add : [num_users=1] = call_function[target=torch.ops.aten.scatter_add.default](args = (%full_default, 0, %expand, %mul_36), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_index_mul_scatter_add_view_zeros_like_1 = async_compile.triton('triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 16384}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_index_mul_scatter_add_view_zeros_like_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_index_mul_scatter_add_view_zeros_like_1(out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 16384 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x0 = xindex | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp0 = 0.0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(out_ptr0 + (x0), tmp0, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/z4/cz4vffmbvb2kxqbqoturm7qcthdzlfhrztfk6mboxfmn2a3jfcnq.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_6], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_6 => convert_element_type_21 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %convert_element_type_21 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_6, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused__to_copy_2 = async_compile.triton('triton_poi_fused__to_copy_2', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused__to_copy_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 58720256 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x0 = xindex | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp0 = tl.load(in_ptr0 + (117440512 + x0), None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/ia/cia4jpg2c2vn4yq7jq4nhf7q7scd62gvulaxucajatglemmgbfqq.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_6, silu_2, mul_2], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_6 => mul_9 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # mul_2 => mul_11 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # silu_2 => convert_element_type_24, convert_element_type_25, mul_10, sigmoid_2 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_3, %select_11), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %convert_element_type_24 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_9, torch.float32), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sigmoid_2 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_24,), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_24, %sigmoid_2), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %convert_element_type_25 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_10, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_25, %mm_4), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_mul_silu_3 = async_compile.triton('triton_poi_fused_mul_silu_3', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_mul_silu_3(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 28672 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x2 = xindex | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp1 = tl.load(in_ptr0 + (28672 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/kq/ckqvwkuuoeyn7m7x5ggcraabvfxslymsbl743lrf5eo2ynoebf2u.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_21], Original ATen: [aten._to_copy] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_21 => convert_element_type_75 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %convert_element_type_75 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%permute_21, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused__to_copy_4 = async_compile.triton('triton_poi_fused__to_copy_4', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 67108864}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'in_ptr0': '*i8', 'out_ptr0': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused__to_copy_4(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 58720256 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x0 = xindex | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp0 = tl.load(in_ptr0 + (411041792 + x0), None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp1 = tmp0.to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(out_ptr0 + (x0), tmp1, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/mq/cmqj7yusny67pmxxacb2tzwiol46m2m7fjrvkyfb6pjle642ghbh.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_21, silu_7, mul_7], Original ATen: [aten.mul, aten.silu] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_21 => mul_33 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # mul_7 => mul_35 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # silu_7 => convert_element_type_78, convert_element_type_79, mul_34, sigmoid_7 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_33 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mm_9, %select_36), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %convert_element_type_78 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_33, torch.float32), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sigmoid_7 : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_78,), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_34 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_78, %sigmoid_7), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %convert_element_type_79 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_34, torch.bfloat16), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_35 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_79, %mm_10), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_poi_fused_mul_silu_5 = async_compile.triton('triton_poi_fused_mul_silu_5', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.pointwise( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 32768}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'xnumel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_5', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] min_elem_per_thread=0 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_poi_fused_mul_silu_5(in_out_ptr0, in_ptr0, in_ptr1, xnumel, XBLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 28672 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = tl.full([XBLOCK], True, tl.int1) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x2 = xindex | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] x0 = (xindex % 14336) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp1 = tl.load(in_ptr0 + (100352 + x0), None, eviction_policy='evict_last').to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp7 = tl.load(in_ptr1 + (x2), None).to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp2 = tmp0 * tmp1 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp3 = tmp2.to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp4 = tl.sigmoid(tmp3) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp5 = tmp3 * tmp4 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp6 = tmp5.to(tl.float32) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tmp8 = tmp6 * tmp7 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] tl.store(in_out_ptr0 + (x2), tmp8, None) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ''', device_str='cuda') | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # kernel path: /tmp/torchinductor_cdhernandez/za/cza5auy5mwzv2rnfug7eub7cff2alhhumyrfxwv43oeoryxqn4ho.py | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Topologically Sorted Source Nodes: [linear_3, linear_4, linear_9, linear_10, linear_12, linear_13, linear_18, linear_19], Original ATen: [aten.mm] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Source node to ATen node mapping: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_10 => mul_15, sum_5 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_12 => mul_18, sum_7 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_13 => mul_21, sum_8 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_18 => mul_27, sum_10 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_19 => mul_30, sum_11 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_3 => mul_3, sum_1 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_4 => mul_6, sum_2 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # linear_9 => mul_12, sum_4 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # Graph fragment: | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze, %unsqueeze_1), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_3, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_2, %unsqueeze_3), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_6, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_6, %unsqueeze_7), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_12, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_8, %unsqueeze_9), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_15, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_18 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_12, %unsqueeze_13), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_7 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_18, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_21 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_14, %unsqueeze_15), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_8 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_21, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_27 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_18, %unsqueeze_19), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_10 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_27, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %mul_30 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%unsqueeze_20, %unsqueeze_21), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] # %sum_11 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_30, [1]), kwargs = {}) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_red_fused_mm_6 = async_compile.triton('triton_red_fused_mm_6', ''' | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] import triton.language as tl | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from triton.compiler.compiler import AttrsDescriptor | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime import triton_helpers, triton_heuristics | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_helpers.set_driver_to_gpu() | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton_heuristics.reduction( | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] size_hints={'x': 16384, 'r0_': 4096}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] reduction_hint=ReductionHint.DEFAULT, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] filename=__file__, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*bf16', 'in_ptr2': '*i8', 'in_ptr3': '*bf16', 'in_ptr4': '*i64', 'in_ptr5': '*i64', 'in_ptr6': '*i64', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'out_ptr2': '*fp32', 'out_ptr3': '*fp32', 'out_ptr4': '*fp32', 'out_ptr5': '*fp32', 'out_ptr6': '*fp32', 'out_ptr7': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]}, | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused_mm_6', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 12, 'num_reduction': 8, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': False, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False} | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] ) | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] @triton.jit | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] def triton_red_fused_mm_6(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, out_ptr5, out_ptr6, out_ptr7, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xnumel = 14336 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] r0_numel = 4096 | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] rnumel = r0_numel | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] RBLOCK: tl.constexpr = R0_BLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xoffset = tl.program_id(0) * XBLOCK | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xindex = xoffset + tl.arange(0, XBLOCK)[:, None] | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] xmask = xindex < xnumel | |
| V0401 02:34:34.860000 3240940 site-packages/torch/_inductor/codecache.py:1091] [7/5] [__output_code] r0_base = tl.arange(0, R0_BLOCK)[None, :] | |
| V0401 02:34:34.860000 3240940 site-packages |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment