Skip to content

Instantly share code, notes, and snippets.

@leegao
Created January 8, 2026 04:32
Show Gist options
  • Select an option

  • Save leegao/c237a207fa61f73859fe8282470f3d56 to your computer and use it in GitHub Desktop.

Select an option

Save leegao/c237a207fa61f73859fe8282470f3d56 to your computer and use it in GitHub Desktop.
VLIW dump of mini_attention (softmax(x @ w1) @ w2)
!rm -rf compiler_dump
!rm compiler_dump.zip
import os
# Root folder for the compiler dumps; HLO and LLO each get their own
# sub-folder, created up front.
DUMP_ROOT = "compiler_dump/"
HLO_DUMP_PATH = os.path.join(DUMP_ROOT, "hlo")
LLO_DUMP_PATH = os.path.join(DUMP_ROOT, "llo")
for _dump_dir in (HLO_DUMP_PATH, LLO_DUMP_PATH):
    os.makedirs(_dump_dir, exist_ok=True)

# Disabled alternative: XLA-level HLO dump into HLO_DUMP_PATH.
# os.environ["XLA_FLAGS"] = (
#     f"--xla_dump_hlo_as_text "
#     f"--xla_dump_to={HLO_DUMP_PATH} "
#     f"--xla_dump_hlo_pass_re=.* "
# )

# Flags handed to libtpu through LIBTPU_INIT_ARGS. They are joined with single
# spaces into one string; this has to happen before JAX is imported.
_LIBTPU_FLAGS = (
    f"--xla_jf_dump_to={LLO_DUMP_PATH}",
    "--xla_jf_dump_hlo_text=true",
    "--xla_jf_dump_llo_text=true",
    "--xla_jf_dump_llo_html=false",
    "--xla_jf_dump_llo_static_gaps=true",
    "--xla_jf_emit_annotations=true",
    "--xla_jf_debug_level=2",
    "--xla_jf_dump_debug_info=true",
    "--xla_jf_dump_fusion_computations=false",
)
os.environ["LIBTPU_INIT_ARGS"] = " ".join(_LIBTPU_FLAGS)
# Import JAX after setting env vars
import jax
import jax.numpy as jnp
@jax.named_call
def softmax(h):
    """Return the row-wise softmax of ``h`` over its last axis.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this does not change the mathematical result.

    Args:
        h: Array of scores; normalization runs along ``axis=-1``.

    Returns:
        Array with the same shape as ``h`` whose last-axis slices sum to 1.
    """
    # keepdims=True keeps the reduced axis so the results broadcast against h.
    h_max = jnp.max(h, axis=-1, keepdims=True)
    exp_h = jnp.exp(h - h_max)
    return exp_h / jnp.sum(exp_h, axis=-1, keepdims=True)
def mini_attention(x, w1, w2):
    """Compute ``softmax(x @ w1) @ w2``, a minimal attention-like block.

    The pipeline is matmul -> row-wise softmax -> matmul.

    Args:
        x: Left operand of the first matmul.
        w1: Right operand of the first matmul; ``x @ w1`` must be defined.
        w2: Right operand of the second matmul, applied to the softmax output.

    Returns:
        The result of ``softmax(x @ w1) @ w2``.
    """
    h = x @ w1
    h = softmax(h)
    out = h @ w2
    return out
# Problem sizes: x is (batch, d_in), w1 is (d_in, d_mid), w2 is (d_mid, d_out).
batch, d_in, d_mid, d_out = 64, 32, 64, 32

# One fixed seed split into three independent keys, one per input array.
key = jax.random.PRNGKey(42)
k1, k2, k3 = jax.random.split(key, 3)

x = jax.random.normal(k1, (batch, d_in))
w1 = jax.random.normal(k2, (d_in, d_mid)) * 0.02
w2 = jax.random.normal(k3, (d_mid, d_out)) * 0.02

# Ahead-of-time path: jit for the TPU backend, lower with the concrete inputs,
# compile, then print the compiled module's text form.
jitted_attention = jax.jit(mini_attention, backend='tpu')
compiled_attention = jitted_attention.lower(x, w1, w2).compile()
data = compiled_attention.as_text()
print(data)
==> compiler_dump/llo/1767846219999249198-TLP-79-final_bundles.txt <==
= control target key start
LH: loop header
LB: loop body
LE: loop exit
PB: predicated region body
PF: predicated region fallthrough
CT: control target
= control target key end
0 : { %s237_s0 = sld [smem:[#allocation22]] } /* Start region 0 :: Start region 281 */
0x1 : { %s238_s1 = sand.u32 134217727, %s237_s0 }
0x2 : { %s239_s2 = sor.u32 4026531840, %s238_s1 }
0x3 : { %240 = vtrace %s239_s2 }
0x4 : { %s231_s3 = sld [smem:[#allocation19]] }
0x5 : { %232 = vtrace %s231_s3 }
0x6 : { %s233_s4 = sld [smem:[#allocation20]] }
0x7 : { %234 = vtrace %s233_s4 }
0x8 : { %s235_s5 = sld [smem:[#allocation21]] }
0x9 : { %236 = vtrace %s235_s5 }
0xa : { %v218_v0 = vlaneseq } /* Start region 43 :: Start region 44 :: Start region 45 */
0xb : { %v310_v1 = vshrl.u32 %v218_v0, 7 }
0xc : { %v220_v2 = vshrl.u32 %v310_v1, 1 ;; %v221_v3 = vand.u32 1, %v310_v1 }
0xd : { %v222_v4 = vshll.u32 %v221_v3, 2 ;; %v229_v6 = vsub.s32 %v220_v2, %v310_v1 }
0xe : { %v223_v5 = vadd.s32 %v222_v4, %v220_v2 }
0xf : { %v224_v7 = vsub.s32 %v223_v5, %v310_v1 }
0x10 : { %225 = vsetiar.raw.iar0 %v224_v7 /* EvenOdd Store IAR initialization */ }
0x11 : { %230 = vsetiar.raw.iar1 %v229_v6 /* EvenOdd Load IAR initialization */ }
0x12 : { %s98_s6 = sld [smem:[#allocation16]] }
0x13 : { %p241_p0 = scmp.eq.s32.totalorder %s98_s6, 0 } /* End region 43 */
0x14 : { %s115_s7 = sxor.u32 (!%p241_p0), 2925155241, %s98_s6 }
0x15 : { %102 = sbr.rel (%p241_p0) target bundleno = 160 (0xa0), region = 46 }
0x16 : { %s116_s8 = smul.u32 (!%p241_p0), 2223506493, %s115_s7 }
0x17 : {}
0x18 : { %s117_s9 = sshrl.u32 (!%p241_p0), %s116_s8, 16 }
0x19 : { %s118_s10 = sxor.u32 (!%p241_p0), %s117_s9, %s116_s8 } /* End region 44 */
0x1a : { %v107_v8 = vand.u32 127, %v218_v0 ;; %s123_s11 = smul.u32 3389127133, %s118_s10 ;; %vm242_vm0 = vcmp.eq.s32.totalorder %v310_v1, 1 ;; %vm243_vm2 = vcmp.eq.s32.totalorder %v310_v1, 2 ;; %vm244_vm3 = vcmp.eq.s32.totalorder %v310_v1, 3 }
0x1b : { %v111_v9 = vxor.u32 1135663077, %v107_v8 ;; %v125_v17 = vstv %s123_s11 }
0x1c : { %v112_v10 = vmul.u32 2925155241, %v111_v9 }
0x1d : { %v113_v11 = vshrl.u32 %v112_v10, 16 }
0x1e : { %v114_v12 = vxor.u32 %v113_v11, %v112_v10 }
0x1f : { %v119_v13 = vxor.u32 2223506493, %v114_v12 ;; %v133_v26 = vmul.u32 3389127133, %v114_v12 }
0x20 : { %v120_v14 = vmul.u32 1519409121, %v119_v13 }
0x21 : { %v121_v15 = vshrl.u32 %v120_v14, 16 }
0x22 : { %v122_v16 = vxor.u32 %v121_v15, %v120_v14 }
0x23 : { %v124_v18 = vmul.u32 1232336661, %v122_v16 }
0x24 : { %v126_v19 = vsub.s32 %v125_v17, %v124_v18 }
0x25 : { %v127_v20 = vshrl.u32 %v126_v19, 16 }
0x26 : { %v128_v21 = vxor.u32 %v127_v20, %v126_v19 }
0x27 : { %v129_v22 = vxor.u32 1519409121, %v128_v21 ;; %v142_v31 = vxor.u32 2925155241, %v128_v21 }
0x28 : { %v130_v23 = vmul.u32 2449846741, %v129_v22 ;; %v143_v34 = vmul.u32 2223506493, %v142_v31 }
0x29 : { %v131_v24 = vshrl.u32 %v130_v23, 16 ;; %v144_v37 = vshrl.u32 %v143_v34, 16 }
0x2a : { %v132_v25 = vxor.u32 %v131_v24, %v130_v23 ;; %v145_v39 = vxor.u32 %v144_v37, %v143_v34 }
0x2b : { %v134_v27 = vmul.u32 1232336661, %v132_v25 ;; %v150_v43 = vmul.u32 3389127133, %v145_v39 }
0x2c : { %v135_v28 = vsub.s32 %v133_v26, %v134_v27 }
0x2d : { %v136_v29 = vshrl.u32 %v135_v28, 16 }
0x2e : { %v137_v30 = vxor.u32 %v136_v29, %v135_v28 }
0x2f : { %v138_v32 = vxor.u32 1135663077, %v137_v30 }
0x30 : { %v139_v33 = vmul.u32 2925155241, %v138_v32 }
0x31 : { %v140_v35 = vshrl.u32 %v139_v33, 16 }
0x32 : { %v141_v36 = vxor.u32 %v140_v35, %v139_v33 }
0x33 : { %v146_v38 = vxor.u32 2223506493, %v141_v36 ;; %v159_v52 = vmul.u32 3389127133, %v141_v36 }
0x34 : { %v147_v40 = vmul.u32 1519409121, %v146_v38 }
0x35 : { %v148_v41 = vshrl.u32 %v147_v40, 16 }
0x36 : { %v149_v42 = vxor.u32 %v148_v41, %v147_v40 }
0x37 : { %v151_v44 = vmul.u32 1232336661, %v149_v42 }
0x38 : { %v152_v45 = vsub.s32 %v150_v43, %v151_v44 }
0x39 : { %v153_v46 = vshrl.u32 %v152_v45, 16 }
0x3a : { %v154_v47 = vxor.u32 %v153_v46, %v152_v45 }
0x3b : { %v155_v48 = vxor.u32 1519409121, %v154_v47 ;; %v172_v53 = vxor.u32 1179257497, %v154_v47 ;; %v188_v57 = vxor.u32 3546938817, %v154_v47 }
0x3c : { %v176_v60 = vxor.u32 461070425, %v154_v47 ;; %v192_v63 = vxor.u32 728804945, %v154_v47 }
0x3d : { %v156_v49 = vmul.u32 2449846741, %v155_v48 ;; %v173_v56 = vmul.u32 2174555301, %v172_v53 ;; %v189_v62 = vmul.u32 1343633581, %v188_v57 }
0x3e : { %v177_v6 = vmul.u32 702470093, %v176_v60 ;; %v193_v10 = vmul.u32 1920080165, %v192_v63 }
0x3f : { %v157_v50 = vshrl.u32 %v156_v49, 16 ;; %v174_v61 = vshrl.u32 %v173_v56, 16 ;; %v190_v14 = vshrl.u32 %v189_v62, 16 }
0x40 : { %v178_v18 = vshrl.u32 %v177_v6, 16 ;; %v194_v20 = vshrl.u32 %v193_v10, 16 }
0x41 : { %v158_v51 = vxor.u32 %v157_v50, %v156_v49 ;; %v175_v12 = vxor.u32 %v174_v61, %v173_v56 ;; %v191_v21 = vxor.u32 %v190_v14, %v189_v62 }
0x42 : { %v179_v25 = vxor.u32 %v178_v18, %v177_v6 ;; %v195_v27 = vxor.u32 %v194_v20, %v193_v10 }
0x43 : { %v160_v54 = vmul.u32 1232336661, %v158_v51 }
0x44 : { %v161_v55 = vsub.s32 %v159_v52, %v160_v54 }
0x45 : { %v162_v58 = vshrl.u32 %v161_v55, 16 }
0x46 : { %v163_v59 = vxor.u32 %v162_v58, %v161_v55 }
0x47 : { %v164_v0 = vxor.u32 2337405405, %v163_v59 ;; %v168_v2 = vxor.u32 747796405, %v163_v59 ;; %v180_v3 = vxor.u32 2174555301, %v163_v59 }
0x48 : { %v184_v4 = vxor.u32 702470093, %v163_v59 }
0x49 : { %v165_v5 = vmul.u32 1179257497, %v164_v0 ;; %v181_v7 = vmul.u32 3546938817, %v180_v3 ;; %v169_v8 = vmul.u32 461070425, %v168_v2 }
0x4a : { %v185_v9 = vmul.u32 728804945, %v184_v4 }
0x4b : { %v166_v11 = vshrl.u32 %v165_v5, 16 ;; %v182_v13 = vshrl.u32 %v181_v7, 16 ;; %v170_v17 = vshrl.u32 %v169_v8, 16 }
0x4c : { %v186_v19 = vshrl.u32 %v185_v9, 16 }
0x4d : { %v167_v15 = vxor.u32 %v166_v11, %v165_v5 ;; %v183_v16 = vxor.u32 %v182_v13, %v181_v7 ;; %v171_v24 = vxor.u32 %v170_v17, %v169_v8 }
0x4e : { %v187_v26 = vxor.u32 %v186_v19, %v185_v9 }
0x4f : { %v196_v22 = vor.u32 %v175_v12, %v167_v15 }
0x50 : { %v197_v23 = vor.u32 %v196_v22, %v183_v16 }
0x51 : { %v198_v28 = vor.u32 %v197_v23, %v191_v21 }
0x52 : { %vm199_vm1 = vcmp.eq.s32.totalorder %v198_v28, 0 }
0x53 : { %v209_v29 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v171_v24, /*on_false_vx=*/%v167_v15 ;; %v210_v30 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v179_v25, /*on_false_vx=*/%v175_v12 ;; %v212_v31 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v187_v26, /*on_false_vx=*/%v183_v16 ;; %v214_v32 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v195_v27, /*on_false_vx=*/%v191_v21 }
0x54 : { %v211_v33 = vsel /*vm=*/%vm242_vm0, /*on_true_vy=*/%v210_v30, /*on_false_vx=*/%v209_v29 }
0x55 : { %v213_v34 = vsel /*vm=*/%vm243_vm2, /*on_true_vy=*/%v212_v31, /*on_false_vx=*/%v211_v33 }
0x56 : { %v215_v35 = vsel /*vm=*/%vm244_vm3, /*on_true_vy=*/%v214_v32, /*on_false_vx=*/%v213_v34 }
0x57 : { %216 = setrngseed %v215_v35 /* Rng seed initialization */ }
0x58 : { %v217_v36 = vrng /* Rng seed initialization */ } /* End region 45 */
0x59 PF: { %93 = vsettm 1 } /* Start/End empty region 46 */
0x5a : { %s295_s12 = smov 2147483646 /* materialized constant */ }
0x5b : { %92 = vsettm %s295_s12 }
0x5c : { %90 = vtrace 2415919103 }
0x5d : { %0 = vtrace 2952790016 }
0x5e : { %1 = vtrace 3221225472 }
0x5f : { %s2_s13 = sld [smem:[#allocation0]] } /* End region 281 :: Start region 1 :: Start region 2 */
0x60 : { %p245_p1 = scmp.ne.s32.totalorder %s2_s13, 1 } /* End region 1 */
0x61 : { %6 = sbr.rel (%p245_p1) target bundleno = 244 (0xf4), region = 4 }
0x62 : {}
0x63 : {}
0x64 : {}
0x65 : {} /* End region 2 */
0x66 : { %s319_s4 = sld [smem:[#allocation2]] } /* Start region 3 :: Start region 282 :: Start/End empty region 6 */
0x67 : { %337 = sst [smem:[#allocation23_spill]] %s319_s4 }
0x68 : { %s9_s14 = scalar_parameter_address 0 } /* Start/End empty region 7 */
0x69 : { %s10_s1 = scalar_parameter_address 1 } /* Start/End empty region 8 */
0x6a : { %33 = vtrace 2147483648 ;; %12 = compiler-scheduling-barrier ;; %14 = compiler-scheduling-barrier } /* Start/End empty region 9 :: Start/End empty region 10 :: Start/End empty region 11 :: Start region 12 */
0x6b : { %17 = vsyncpa [#allocation8], 0 ;; %s18_s15 = sld [smem:[#allocation9]] ;; %15 = compiler-scheduling-barrier }
0x6c : { %p19_p2 = scmp.ne.s32.totalorder %s18_s15, 0 }
0x6d : { %s20_s16 = scalar_select /*predicate=*/%p19_p2, /*on_true=*/32, /*on_false=*/0 }
0x6e : { %s21_s17 = sshll.u32 %s20_s16, 4 }
0x6f : { %22 = vsyncadd [#allocation8], %s21_s17 ;; %s23_s18 = scalar_select /*predicate=*/%p19_p2, /*on_true=*/0, /*on_false=*/32 }
0x70 : { %s25_s19 = sshll.u32 %s9_s14, 4 ;; %s296_s21 = smov [#allocation7] /* materialized constant */ ;; %s26_s19 = int_to_ptr.hbm [resolvable:$true] %s25_s19 }
0x71 : { %s24_s20 = sshll.u32 %s23_s18, 4 ;; %s27_s22 = sshll.u32 %s296_s21, 4 ;; %s28_s22 = int_to_ptr.vmem [resolvable:$true] %s27_s22 }
0x72 : { %s261_s23 = sshra.s32 %s28_s22, 4 ;; %s263_s24 = sshrl.u32 %s24_s20, 4 ;; %s262_s23 = int_to_ptr.vmem [resolvable:$true] %s261_s23 }
0x73 : { %s268_s25 = scalar_lea.vmem %s262_s23, %s263_s24 ;; %s270_s26 = scalar_lea.vmem %s296_s21, 32 }
0x74 : { %p269_p3 = scmp.ne.s32.totalorder %s262_s23, %s268_s25 ;; %p272_p4 = scmp.lt.s32.totalorder %s270_s26, %s268_s25 }
0x75 : { %p274_p5 = pnand %p272_p4, %p269_p3 }
0x76 : { %277 = shalt.err (!%p274_p5) /* BoundsCheck 0 [deref of %s28] for %30 = dma.hbm_to_vmem [thread:$1] /*hbm=*/%s26, /*size_in_granules=*/%s24, /*vmem=*/%s28, /*dst_syncflagno=*/[#allocation8]
hlo: copy-start
*/ }
0x77 : { %30 = dma.hbm_to_vmem [thread:$1] /*hbm=*/%s26_s19, /*size_in_granules=*/%s24_s20, /*vmem=*/%s28_s22, /*dst_syncflagno=*/[#allocation8] }
0x78 : { %34 = vtrace 2415919104 ;; %32 = compiler-scheduling-barrier } /* End region 12 */
0x79 : { %46 = vtrace 2147483649 ;; %35 = compiler-scheduling-barrier ;; %37 = compiler-scheduling-barrier ;; %38 = compiler-scheduling-barrier ;; %40 = compiler-scheduling-barrier } /* Start/End empty region 15 :: Start/End empty region 16 :: Start region 17 */
0x7a : { %291 = dma.done.wait [#allocation8], 512 /* local-dma-wait */ ;; %41 = compiler-scheduling-barrier }
0x7b : { %292 = vsyncadd [#allocation8], 4294966784 }
0x7c : { %44 = vsyncpa [#allocation8], 1 }
0x7d : { %47 = vtrace 2415919105 ;; %45 = compiler-scheduling-barrier } /* End region 282 :: End region 17 */
0x7e : { %53 = vtrace 2147483650 } /* Start region 20 :: Start region 283 */
0x7f : { %s297_s0 = smov [#allocation7] /* materialized constant */ ;; %s298_s2 = smov [#allocation10] /* materialized constant */ ;; %48 = compiler-scheduling-barrier }
0x80 : { %s299_s3 = smov [#allocation11] /* materialized constant */ } /* End region 283 */
0x81 : { %50 = inlined_call %s297_s0, %s10_s1, %s298_s2, %s299_s3 /* %fusion.5 = fusion(%copy-done, %Arg_1.2) */ }
0x82 : { %54 = vtrace 2415919106 ;; %52 = compiler-scheduling-barrier } /* End region 20 :: Start region 284 :: End region 284 */
0x83 : { %71 = vtrace 2147483651 ;; %55 = compiler-scheduling-barrier ;; %56 = compiler-scheduling-barrier ;; %57 = compiler-scheduling-barrier ;; %58 = compiler-scheduling-barrier } /* Start region 285 :: Start/End empty region 23 :: Start/End empty region 24 :: Start region 25 */
0x84 : { %61 = vsyncpa [#allocation13], 0 ;; %s338_s27 = scalar_parameter_address 2 ;; %s300_s29 = smov [#allocation12] /* materialized constant */ ;; %59 = compiler-scheduling-barrier }
0x85 : { %s63_s28 = sshll.u32 %s338_s27, 4 ;; %s65_s30 = sshll.u32 %s300_s29, 4 ;; %s64_s28 = int_to_ptr.hbm [resolvable:$true] %s63_s28 ;; %s66_s30 = int_to_ptr.vmem [resolvable:$true] %s65_s30 }
0x86 : { %68 = dma.hbm_to_vmem [thread:$1] /*hbm=*/%s64_s28, /*size_in_granules=*/512, /*vmem=*/%s66_s30, /*dst_syncflagno=*/[#allocation13] }
0x87 : { %72 = vtrace 2415919107 ;; %70 = compiler-scheduling-barrier } /* End region 285 :: End region 25 */
0x88 : { %76 = vtrace 2147483652 } /* Start region 28 :: Start region 286 */
0x89 : { %s301_s0 = smov [#allocation11] /* materialized constant */ ;; %s302_s1 = smov [#allocation10] /* materialized constant */ ;; %73 = compiler-scheduling-barrier }
0x8a : { %s303_s2 = smov [#allocation14] /* materialized constant */ } /* End region 286 */
0x8b : { %74 = inlined_call %s301_s0, %s302_s1, %s303_s2 /* %fusion.2 = fusion(%get-tuple-element.1, %get-tuple-element) */ }
0x8c : { %77 = vtrace 2415919108 ;; %75 = compiler-scheduling-barrier } /* End region 28 :: Start region 287 :: End region 287 */
0x8d : { %83 = vtrace 2147483653 } /* Start region 288 :: Start region 31 */
0x8e : { %293 = dma.done.wait [#allocation13], 512 /* local-dma-wait */ ;; %78 = compiler-scheduling-barrier }
0x8f : { %294 = vsyncadd [#allocation13], 4294966784 }
0x90 : { %81 = vsyncpa [#allocation13], 1 }
0x91 : { %84 = vtrace 2415919109 ;; %82 = compiler-scheduling-barrier } /* End region 288 :: End region 31 */
0x92 : { %88 = vtrace 2147483654 } /* Start region 34 :: Start region 289 */
0x93 : { %s304_s0 = smov [#allocation12] /* materialized constant */ ;; %s339_s4 = sld [smem:[#allocation23_spill]] ;; %85 = compiler-scheduling-barrier ;; %s340_s4 = int_to_ptr.hbm [resolvable:$false] %s339_s4 }
0x94 : { %s305_s1 = smov [#allocation11] /* materialized constant */ ;; %s306_s2 = smov [#allocation14] /* materialized constant */ }
0x95 : { %s307_s3 = smov [#allocation10] /* materialized constant */ } /* End region 289 */
0x96 : { %86 = inlined_call %s304_s0, %s305_s1, %s306_s2, %s307_s3, %s340_s4 /* %fusion = fusion(%copy-done.1, %get-tuple-element.1, %fusion.2, %get-tuple-element) */ }
0x97 : { %89 = vtrace 2415919110 ;; %87 = compiler-scheduling-barrier } /* End region 3 :: End region 34 :: Start region 290 :: End region 290 */
0x98 PF: { %91 = vtrace 2684354559 } /* Start/End empty region 291 :: Start/End empty region 4 :: Start region 292 */
0x99 : { %s308_s5 = smov 2147483647 /* materialized constant */ }
0x9a : { %94 = vsettm %s308_s5 }
0x9b : { %95 = vdelay 1 }
0x9c : { %96 = sfence }
0x9d : { %s309_s6 = smov 0 /* materialized constant */ }
0x9e : { %97 = sst [smem:[#allocation15]] %s309_s6 } /* End region 0 :: End region 292 */
==> compiler_dump/llo/1767846220022669355-fusion.2-74-final_bundles.txt <==
= control target key start
LH: loop header
LB: loop body
LE: loop exit
PB: predicated region body
PF: predicated region fallthrough
CT: control target
= control target key end
0 : { %v182_v0 = vmov 0.0 /* materialized constant */ ;; %v15_v5 = vlaneseq ;; %s236_s0 = inlined_call_operand.vmem [shape: f32[64,64], index: 0, kind: input, shape index: {}] /* operand 0 */ ;; %s237_s1 = inlined_call_operand.vmem [shape: f32[64], index: 1, kind: input, shape index: {}] /* operand 1 */ ;; %s238_s2 = inlined_call_operand.vmem [shape: f32[64], index: 2, kind: output, shape index: {}] /* operand 2 */ } /* entry bundle: %fusion.2 = fusion(%get-tuple-element.1, %get-tuple-element) */
0x1 : { %4 = vst [vmem:[#allocation0] sm:$0xff] /*vst_source=*/%v182_v0 ;; %v7_v1 = vld [vmem:[%s236_s0] sm:$0xff] ;; %v158_v3 = vld [vmem:[%s236_s0 + $0x8] sm:$0xff] ;; %v159_v7 = vld [vmem:[%s236_s0 + $0x10] sm:$0xff] }
0x2 : { %v8_v2 = vld [vmem:[%s237_s1] ss:$0 sm:$0xff] ;; %v160_v8 = vld [vmem:[%s236_s0 + $0x18] sm:$0xff] ;; %v162_v13 = vld [vmem:[%s236_s0 + $0x28] sm:$0xff] ;; %v16_v22 = vand.u32 127, %v15_v5 }
0x3 : { %v11_v4 = vsub.f32 %v7_v1, %v8_v2 ;; %v23_v6 = vsub.f32 %v158_v3, %v8_v2 ;; %v161_v9 = vld [vmem:[%s236_s0 + $0x20] sm:$0xff] ;; %v39_v10 = vsub.f32 %v159_v7, %v8_v2 ;; %v55_v11 = vsub.f32 %v160_v8, %v8_v2 ;; %v163_v14 = vld [vmem:[%s236_s0 + $0x30] sm:$0xff] ;; %v164_v15 = vld [vmem:[%s236_s0 + $0x38] sm:$0xff] }
0x4 : { %v71_v12 = vsub.f32 %v161_v9, %v8_v2 ;; %v87_v18 = vsub.f32 %v162_v13, %v8_v2 ;; %v103_v21 = vsub.f32 %v163_v14, %v8_v2 ;; %v119_v24 = vsub.f32 %v164_v15, %v8_v2 }
0x5 : { %v13_v16 = vmul.f32 1.442695, %v11_v4 ;; %v25_v17 = vmul.f32 1.442695, %v23_v6 ;; %v41_v19 = vmul.f32 1.442695, %v39_v10 }
0x6 : { %v57_v20 = vmul.f32 1.442695, %v55_v11 ;; %v73_v23 = vmul.f32 1.442695, %v71_v12 ;; %v89_v25 = vmul.f32 1.442695, %v87_v18 }
0x7 : { %166 = vpow2.f32 %v13_v16 ;; %v105_v26 = vmul.f32 1.442695, %v103_v21 ;; %vm17_vm0 = vcmp.lt.s32.totalorder %v16_v22, 64 ;; %v121_v27 = vmul.f32 1.442695, %v119_v24 }
0x8 : { %168 = vpow2.f32 %v25_v17 ;; %v131_v56 = vld [vmem:[#allocation0] ss:$0 sm:$0xff] }
0x9 : { %170 = vpow2.f32 %v41_v19 }
0xa : { %172 = vpow2.f32 %v57_v20 }
0xb : { %174 = vpow2.f32 %v73_v23 }
0xc : { %176 = vpow2.f32 %v89_v25 }
0xd : { %v167_v28 = vpop.eup %166 ;; %178 = vpow2.f32 %v105_v26 }
0xe : { %v169_v29 = vpop.eup %168 ;; %v18_v30 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v167_v28, /*on_false_vx=*/0.0 ;; %180 = vpow2.f32 %v121_v27 }
0xf : { %v171_v31 = vpop.eup %170 ;; %v30_v32 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v169_v29, /*on_false_vx=*/0.0 }
0x10 : { %v173_v33 = vpop.eup %172 ;; %v33_v34 = vadd.f32 %v30_v32, %v18_v30 ;; %v46_v35 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v171_v31, /*on_false_vx=*/0.0 }
0x11 : { %v175_v36 = vpop.eup %174 ;; %v62_v37 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v173_v33, /*on_false_vx=*/0.0 }
0x12 : { %v177_v38 = vpop.eup %176 ;; %v49_v39 = vadd.f32 %v46_v35, %v33_v34 ;; %v78_v40 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v175_v36, /*on_false_vx=*/0.0 }
0x13 : { %v179_v41 = vpop.eup %178 ;; %v94_v43 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v177_v38, /*on_false_vx=*/0.0 }
0x14 : { %v65_v42 = vadd.f32 %v62_v37, %v49_v39 ;; %v181_v44 = vpop.eup %180 ;; %v110_v46 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v179_v41, /*on_false_vx=*/0.0 }
0x15 : { %v126_v48 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v181_v44, /*on_false_vx=*/0.0 }
0x16 : { %v81_v45 = vadd.f32 %v78_v40, %v65_v42 }
0x17 : { %v97_v47 = vadd.f32 %v94_v43, %v81_v45 }
0x18 : { %v113_v49 = vadd.f32 %v110_v46, %v97_v47 }
0x19 : { %v129_v50 = vadd.f32 %v126_v48, %v113_v49 }
0x1a : { %v132_v51 = vrot.slane %v129_v50, 4 }
0x1b : { %v135_v52 = vadd.f32 %v132_v51, %v129_v50 }
0x1c : { %v137_v53 = vrot.slane %v135_v52, 2 }
0x1d : { %v140_v54 = vadd.f32 %v137_v53, %v135_v52 }
0x1e : { %v142_v55 = vrot.slane %v140_v54, 1 }
0x1f : { %v145_v57 = vadd.f32 %v142_v55, %v140_v54 }
0x20 : { %v149_v58 = vadd.f32 %v145_v57, %v131_v56 }
0x21 : { %151 = vst [vmem:[#allocation0] sm:$0x1] /*vst_source=*/%v149_v58 }
0x22 : { %v154_v59 = vld [vmem:[#allocation0] sm:$0x1] }
0x23 : { %157 = vst [vmem:[%s238_s2] sm:$0x1] /*vst_source=*/%v154_v59 } /* exit bundle: %fusion.2 = fusion(%get-tuple-element.1, %get-tuple-element) */
==> compiler_dump/llo/1767846220022805106-fusion.5-78-final_bundles.txt <==
= control target key start
LH: loop header
LB: loop body
LE: loop exit
PB: predicated region body
PF: predicated region fallthrough
CT: control target
= control target key end
0 : { %v292_v0 = vmov -inf /* materialized constant */ ;; %s364_s0 = inlined_call_operand.vmem [shape: f32[64,32], index: 0, kind: input, shape index: {}] /* operand 0 */ ;; %s365_s1 = inlined_call_operand.hbm [shape: f32[32,64], index: 1, kind: input, shape index: {}] /* operand 1 */ ;; %s366_s2 = inlined_call_operand.vmem [shape: f32[64], index: 2, kind: output, shape index: {0}] /* operand 2 */ ;; %s367_s3 = inlined_call_operand.vmem [shape: f32[64,64], index: 3, kind: output, shape index: {1}] /* operand 3 */ } /* entry bundle: %fusion.5 = fusion(%copy-done, %Arg_1.2) */
0x1 : { %6 = vst [vmem:[#allocation1] sm:$0xff] /*vst_source=*/%v292_v0 }
0x2 : { %7 = vsyncpa [#allocation3], 0 ;; %s15_s14 = sshll.u32 %s365_s1, 4 ;; %s293_s15 = smov [#allocation2] /* materialized constant */ ;; %s16_s14 = int_to_ptr.hbm [resolvable:$true] %s15_s14 } /* Start region 2 */
0x3 : { %s17_s16 = sshll.u32 %s293_s15, 4 ;; %s18_s16 = int_to_ptr.vmem [resolvable:$true] %s17_s16 }
0x4 : { %20 = dma.hbm_to_vmem [thread:$0] /*hbm=*/%s16_s14, /*size_in_granules=*/512, /*vmem=*/%s18_s16, /*dst_syncflagno=*/[#allocation3] /*
base_bounds: (4, 1)
dynamic_base_bounds: (4, 1)
window_bounds: (4, 1)
iteration_bounds: (1, 1, 1)
strides: (4, 1)
pad_low: (0, 0)
pad_high: (0, 0)
element_size_in_bytes: 4096 */ }
0x5 : { %290 = dma.done.wait [#allocation3], 512 /* pipeline-emitter-dma-wait */ }
0x6 : { %291 = vsyncadd [#allocation3], 4294966784 ;; %v43_v1 = vld [vmem:[#allocation2] sm:$0xff] ;; %v242_v2 = vld [vmem:[%s364_s0 + $0x18] sm:$0xff] ;; %v106_v22 = vlaneseq }
0x7 : { %61 = vxpose.xlu0.b32.start [1/4] (short) (narrow) /*vx=*/%v43_v1, /*width=*/64 ;; %v243_v3 = vld [vmem:[%s364_s0 + $0x10] sm:$0xff] ;; %86 = vmatpush.msra.mxu0 %v242_v2 ;; %v244_v4 = vld [vmem:[%s364_s0 + $0x8] sm:$0xff] ;; %v53_v6 = vld [vmem:[#allocation2 + $0x10] sm:$0xff] }
0x8 : { %252 = vmatpush.msra.mxu1 %v242_v2 ;; %253 = vmatpush.msra.mxu2 %v242_v2 ;; %v48_v5 = vld [vmem:[#allocation2 + $0x8] sm:$0xff] ;; %v58_v7 = vld [vmem:[#allocation2 + $0x18] sm:$0xff] ;; %v107_v23 = vand.u32 127, %v106_v22 }
0x9 : { %254 = vmatpush.msra.mxu3 %v242_v2 ;; %92 = vmatpush.msra.mxu0 %v243_v3 ;; %v99_v8 = vld [vmem:[%s364_s0] sm:$0xff] }
0xa : { %255 = vmatpush.msra.mxu1 %v243_v3 ;; %256 = vmatpush.msra.mxu2 %v243_v3 ;; %vm108_vm0 = vcmp.lt.s32.totalorder %v107_v23, 64 ;; %v210_v47 = vld [vmem:[#allocation1] ss:$0 sm:$0xff] }
0xb : { %257 = vmatpush.msra.mxu3 %v243_v3 ;; %98 = vmatpush.msra.mxu0 %v244_v4 }
0xc : { %258 = vmatpush.msra.mxu1 %v244_v4 ;; %259 = vmatpush.msra.mxu2 %v244_v4 }
0xd : { %260 = vmatpush.msra.mxu3 %v244_v4 ;; %102 = vmatpush.msra.mxu0 %v99_v8 }
0xe : { %261 = vmatpush.msra.mxu1 %v99_v8 ;; %262 = vmatpush.msra.mxu2 %v99_v8 }
0xf : { %62 = vxpose.xlu0.b32.cont [2/4] (short) (narrow) /*vx=*/%v48_v5, /*width=*/64 ;; %263 = vmatpush.msra.mxu3 %v99_v8 }
0x10 : { %63 = vxpose.xlu0.b32.cont [3/4] (short) (narrow) /*vx=*/%v53_v6, /*width=*/64 }
0x11 : { %64 = vxpose.xlu0.b32.end [4/4] (short) (narrow) /*vx=*/%v58_v7, /*width=*/64 }
0x12 : { %v65_v9 = vpop.trf.xlu0 }
0x13 : { %103 = vmatmul.f32.vlgmr.msra.gmra.mxu0 %v65_v9 }
0x14 : { %v66_v10 = vpop.trf.xlu0 }
0x15 : { %111 = vmatmul.f32.gmra.mxu0 %v66_v10 }
0x16 : { %v67_v11 = vpop.trf.xlu0 }
0x17 : { %125 = vmatmul.f32.vlgmr.msra.gmra.mxu1 %v67_v11 }
0x18 : { %v68_v12 = vpop.trf.xlu0 }
0x19 : { %139 = vmatmul.f32.gmra.mxu1 %v68_v12 }
0x1a : { %v69_v13 = vpop.trf.xlu0 }
0x1b : { %153 = vmatmul.f32.vlgmr.msra.gmra.mxu2 %v69_v13 }
0x1c : { %v70_v14 = vpop.trf.xlu0 }
0x1d : { %167 = vmatmul.f32.gmra.mxu2 %v70_v14 }
0x1e : { %v71_v15 = vpop.trf.xlu0 }
0x1f : { %181 = vmatmul.f32.vlgmr.msra.gmra.mxu3 %v71_v15 }
0x20 : { %v72_v16 = vpop.trf.xlu0 }
0x21 : { %195 = vmatmul.f32.gmra.mxu3 %v72_v16 }
0x22 : { %v104_v17 = vpop.f32.mrf.mxu0 }
0x23 : { %110 = vst [vmem:[%s367_s3] sm:$0xff] /*vst_source=*/%v104_v17 ;; %v109_v26 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v104_v17, /*on_false_vx=*/-inf }
0x24 : { %v112_v18 = vpop.f32.mrf.mxu0 }
0x25 : { %245 = vst [vmem:[%s367_s3 + $0x8] sm:$0xff] /*vst_source=*/%v112_v18 ;; %v118_v25 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v112_v18, /*on_false_vx=*/-inf }
0x26 : { %v121_v28 = vmax.f32 %v109_v26, %v118_v25 }
0x27 : { %v126_v19 = vpop.f32.mrf.mxu1 }
0x28 : { %246 = vst [vmem:[%s367_s3 + $0x10] sm:$0xff] /*vst_source=*/%v126_v19 ;; %v132_v27 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v126_v19, /*on_false_vx=*/-inf }
0x29 : { %v135_v30 = vmax.f32 %v121_v28, %v132_v27 }
0x2a : { %v140_v20 = vpop.f32.mrf.mxu1 }
0x2b : { %247 = vst [vmem:[%s367_s3 + $0x18] sm:$0xff] /*vst_source=*/%v140_v20 ;; %v146_v29 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v140_v20, /*on_false_vx=*/-inf }
0x2c : { %v149_v33 = vmax.f32 %v135_v30, %v146_v29 }
0x2d : { %v154_v21 = vpop.f32.mrf.mxu2 }
0x2e : { %248 = vst [vmem:[%s367_s3 + $0x20] sm:$0xff] /*vst_source=*/%v154_v21 ;; %v160_v32 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v154_v21, /*on_false_vx=*/-inf }
0x2f : { %v163_v35 = vmax.f32 %v149_v33, %v160_v32 }
0x30 : { %v168_v24 = vpop.f32.mrf.mxu2 }
0x31 : { %249 = vst [vmem:[%s367_s3 + $0x28] sm:$0xff] /*vst_source=*/%v168_v24 ;; %v174_v34 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v168_v24, /*on_false_vx=*/-inf }
0x32 : { %v177_v37 = vmax.f32 %v163_v35, %v174_v34 }
0x33 : { %v182_v31 = vpop.f32.mrf.mxu3 }
0x34 : { %250 = vst [vmem:[%s367_s3 + $0x30] sm:$0xff] /*vst_source=*/%v182_v31 ;; %v188_v36 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v182_v31, /*on_false_vx=*/-inf }
0x35 : { %v191_v38 = vmax.f32 %v177_v37, %v188_v36 }
0x36 : { %v196_v39 = vpop.f32.mrf.mxu3 }
0x37 : { %v202_v40 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v196_v39, /*on_false_vx=*/-inf ;; %251 = vst [vmem:[%s367_s3 + $0x38] sm:$0xff] /*vst_source=*/%v196_v39 }
0x38 : { %v205_v41 = vmax.f32 %v191_v38, %v202_v40 }
0x39 : { %v211_v42 = vrot.slane %v205_v41, 4 }
0x3a : { %v214_v43 = vmax.f32 %v205_v41, %v211_v42 }
0x3b : { %v216_v44 = vrot.slane %v214_v43, 2 }
0x3c : { %v219_v45 = vmax.f32 %v214_v43, %v216_v44 }
0x3d : { %v221_v46 = vrot.slane %v219_v45, 1 }
0x3e : { %v224_v48 = vmax.f32 %v219_v45, %v221_v46 }
0x3f : { %v228_v49 = vmax.f32 %v210_v47, %v224_v48 }
0x40 : { %230 = vst [vmem:[#allocation1] sm:$0x1] /*vst_source=*/%v228_v49 }
0x41 : { %235 = vsyncpa [#allocation3], 1 } /* End region 2 */
0x42 : { %v238_v50 = vld [vmem:[#allocation1] sm:$0x1] }
0x43 : { %241 = vst [vmem:[%s366_s2] sm:$0x1] /*vst_source=*/%v238_v50 } /* exit bundle: %fusion.5 = fusion(%copy-done, %Arg_1.2) */
==> compiler_dump/llo/1767846220024102775-fusion-78-final_bundles.txt <==
= control target key start
LH: loop header
LB: loop body
LE: loop exit
PB: predicated region body
PF: predicated region fallthrough
CT: control target
= control target key end
0 : { %v14_v9 = vlaneseq ;; %s471_s0 = inlined_call_operand.vmem [shape: f32[64,32], index: 0, kind: input, shape index: {}] /* operand 0 */ ;; %s472_s1 = inlined_call_operand.vmem [shape: f32[64,64], index: 1, kind: input, shape index: {}] /* operand 1 */ ;; %s473_s2 = inlined_call_operand.vmem [shape: f32[64], index: 2, kind: input, shape index: {}] /* operand 2 */ ;; %s474_s3 = inlined_call_operand.vmem [shape: f32[64], index: 3, kind: input, shape index: {}] /* operand 3 */ ;; %s475_s4 = inlined_call_operand.hbm [shape: f32[64,32], index: 4, kind: output, shape index: {}] /* operand 4 */ } /* entry bundle: %fusion = fusion(%copy-done.1, %get-tuple-element.1, %fusion.2, %get-tuple-element) */
0x1 : { %v312_v0 = vld [vmem:[%s472_s1 + $0x38] sm:$0xff] ;; %v61_v1 = vld [vmem:[%s473_s2] ss:$0 sm:$0xff] ;; %v313_v4 = vld [vmem:[%s472_s1 + $0x30] sm:$0xff] }
0x2 : { %v62_v2 = vld [vmem:[%s474_s3] ss:$0 sm:$0xff] ;; %353 = vrcp.f32 %v61_v1 ;; %v314_v5 = vld [vmem:[%s472_s1 + $0x28] sm:$0xff] ;; %v316_v11 = vld [vmem:[%s472_s1 + $0x18] sm:$0xff] }
0x3 : { %v65_v3 = vsub.f32 %v312_v0, %v62_v2 ;; %v92_v6 = vsub.f32 %v313_v4, %v62_v2 ;; %v315_v7 = vld [vmem:[%s472_s1 + $0x20] sm:$0xff] ;; %v119_v8 = vsub.f32 %v314_v5, %v62_v2 ;; %v317_v14 = vld [vmem:[%s472_s1 + $0x10] sm:$0xff] }
0x4 : { %v146_v13 = vsub.f32 %v315_v7, %v62_v2 }
0x5 : { %v67_v10 = vmul.f32 1.442695, %v65_v3 ;; %v94_v12 = vmul.f32 1.442695, %v92_v6 }
0x6 : { %5 = vsyncpa [#allocation3], 0 ;; %v121_v16 = vmul.f32 1.442695, %v119_v8 ;; %v173_v17 = vsub.f32 %v316_v11, %v62_v2 ;; %v447_v18 = vand.u32 127, %v14_v9 ;; %v80_v20 = vand.u32 2147483648, %v61_v1 }
0x7 : { %355 = vpow2.f32 %v67_v10 ;; %v318_v21 = vld [vmem:[%s472_s1 + $0x8] sm:$0xff] ;; %v148_v22 = vmul.f32 1.442695, %v146_v13 ;; %v200_v23 = vsub.f32 %v317_v14, %v62_v2 ;; %vm74_vm0 = vweird.f32 %v61_v1 ;; %v249_v26 = vld [vmem:[%s472_s1] sm:$0xff] ;; %v310_v54 = vld [vmem:[%s471_s0 + $0x10] sm:$0xff] ;; %s397_s14 = smov [#allocation1] /* materialized constant */ }
0x8 : { %v354_v15 = vpop.eup %353 ;; %357 = vpow2.f32 %v94_v12 ;; %v78_v25 = vand.u32 2147483647, %v61_v1 ;; %v175_v27 = vmul.f32 1.442695, %v173_v17 ;; %v227_v28 = vsub.f32 %v318_v21, %v62_v2 ;; %v311_v55 = vld [vmem:[%s471_s0 + $0x18] sm:$0xff] ;; %v32_v56 = vld [vmem:[%s471_s0] sm:$0xff] }
0x9 : { %v70_v19 = vmul.f32 %v354_v15, %v61_v1 ;; %vm75_vm1 = vweird.f32 %v354_v15 ;; %359 = vpow2.f32 %v121_v16 ;; %v81_v30 = vor.u32 1.1754944e-38, %v80_v20 ;; %v309_v57 = vld [vmem:[%s471_s0 + $0x8] sm:$0xff] ;; %s298_s15 = sshll.u32 %s397_s14, 4 ;; %s300_s18 = sshll.u32 %s475_s4, 4 ;; %s299_s15 = int_to_ptr.vmem [resolvable:$true] %s298_s15 ;; %s301_s18 = int_to_ptr.hbm [resolvable:$true] %s300_s18 }
0xa : { %vm76_vm2 = vmor %vm74_vm0, %vm75_vm1 ;; %361 = vpow2.f32 %v148_v22 ;; %v202_v31 = vmul.f32 1.442695, %v200_v23 ;; %v252_v32 = vsub.f32 %v249_v26, %v62_v2 ;; %vm79_vm3 = vcmp.eq.f32.partialorder %v78_v25, 8.507059e+37 }
0xb : { %v71_v24 = vsub.f32 1.0, %v70_v19 ;; %363 = vpow2.f32 %v175_v27 ;; %v229_v36 = vmul.f32 1.442695, %v227_v28 ;; %vm351_vm4 = vcmp.lt.s32.totalorder %v447_v18, 64 }
0xc : { %365 = vpow2.f32 %v202_v31 ;; %v254_v40 = vmul.f32 1.442695, %v252_v32 }
0xd : { %v72_v29 = vmul.f32 %v354_v15, %v71_v24 ;; %v356_v33 = vpop.eup %355 ;; %367 = vpow2.f32 %v229_v36 }
0xe : { %v358_v35 = vpop.eup %357 ;; %369 = vpow2.f32 %v254_v40 }
0xf : { %v73_v34 = vadd.f32 %v354_v15, %v72_v29 ;; %v360_v38 = vpop.eup %359 }
0x10 : { %v362_v43 = vpop.eup %361 }
0x11 : { %v77_v37 = vsel /*vm=*/%vm76_vm2, /*on_true_vy=*/%v354_v15, /*on_false_vx=*/%v73_v34 ;; %v364_v45 = vpop.eup %363 }
0x12 : { %v82_v39 = vsel /*vm=*/%vm79_vm3, /*on_true_vy=*/%v81_v30, /*on_false_vx=*/%v77_v37 ;; %v366_v47 = vpop.eup %365 }
0x13 : { %v83_v41 = vmul.f32 %v356_v33, %v82_v39 ;; %v110_v42 = vmul.f32 %v358_v35, %v82_v39 ;; %v137_v44 = vmul.f32 %v360_v38, %v82_v39 ;; %v164_v46 = vmul.f32 %v362_v43, %v82_v39 ;; %v368_v49 = vpop.eup %367 }
0x14 : { %v191_v48 = vmul.f32 %v364_v45, %v82_v39 ;; %v218_v50 = vmul.f32 %v366_v47, %v82_v39 ;; %v370_v51 = vpop.eup %369 ;; %v245_v52 = vmul.f32 %v368_v49, %v82_v39 }
0x15 : { %328 = vmatpush.msra.mxu2 %v83_v41 ;; %329 = vmatpush.msra.mxu3 %v83_v41 ;; %v270_v53 = vmul.f32 %v370_v51, %v82_v39 }
0x16 : { %86 = vmatpush.msra.mxu0 %v83_v41 ;; %327 = vmatpush.msra.mxu1 %v83_v41 }
0x17 : { %331 = vmatpush.msra.mxu2 %v110_v42 ;; %332 = vmatpush.msra.mxu3 %v110_v42 }
0x18 : { %113 = vmatpush.msra.mxu0 %v110_v42 ;; %330 = vmatpush.msra.mxu1 %v110_v42 }
0x19 : { %334 = vmatpush.msra.mxu2 %v137_v44 ;; %335 = vmatpush.msra.mxu3 %v137_v44 }
0x1a : { %140 = vmatpush.msra.mxu0 %v137_v44 ;; %333 = vmatpush.msra.mxu1 %v137_v44 }
0x1b : { %337 = vmatpush.msra.mxu2 %v164_v46 ;; %338 = vmatpush.msra.mxu3 %v164_v46 }
0x1c : { %167 = vmatpush.msra.mxu0 %v164_v46 ;; %336 = vmatpush.msra.mxu1 %v164_v46 }
0x1d : { %340 = vmatpush.msra.mxu2 %v191_v48 ;; %341 = vmatpush.msra.mxu3 %v191_v48 }
0x1e : { %194 = vmatpush.msra.mxu0 %v191_v48 ;; %339 = vmatpush.msra.mxu1 %v191_v48 }
0x1f : { %343 = vmatpush.msra.mxu2 %v218_v50 ;; %344 = vmatpush.msra.mxu3 %v218_v50 }
0x20 : { %221 = vmatpush.msra.mxu0 %v218_v50 ;; %342 = vmatpush.msra.mxu1 %v218_v50 }
0x21 : { %346 = vmatpush.msra.mxu2 %v245_v52 ;; %347 = vmatpush.msra.mxu3 %v245_v52 }
0x22 : { %248 = vmatpush.msra.mxu0 %v245_v52 ;; %345 = vmatpush.msra.mxu1 %v245_v52 }
0x23 : { %349 = vmatpush.msra.mxu2 %v270_v53 ;; %350 = vmatpush.msra.mxu3 %v270_v53 }
0x24 : { %324 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu2 %vm351_vm4, %v310_v54 ;; %326 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu3 %vm351_vm4, %v311_v55 }
0x25 : { %273 = vmatpush.msra.mxu0 %v270_v53 ;; %348 = vmatpush.msra.mxu1 %v270_v53 }
0x26 : { %320 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu0 %vm351_vm4, %v32_v56 ;; %322 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu1 %vm351_vm4, %v309_v57 }
0x27 : { %v275_v58 = vpop.f32.mrf.mxu0 ;; %v278_v59 = vpop.f32.mrf.mxu1 }
0x28 : { %276 = vst [vmem:[#allocation1] sm:$0xff] /*vst_source=*/%v275_v58 }
0x29 : { %281 = vst [vmem:[#allocation1 + $0x8] sm:$0xff] /*vst_source=*/%v278_v59 }
0x2a : { %v283_v60 = vpop.f32.mrf.mxu2 ;; %v288_v61 = vpop.f32.mrf.mxu3 }
0x2b : { %286 = vst [vmem:[#allocation1 + $0x10] sm:$0xff] /*vst_source=*/%v283_v60 }
0x2c : { %291 = vst [vmem:[#allocation1 + $0x18] sm:$0xff] /*vst_source=*/%v288_v61 }
0x2d : { %303 = dma.vmem_to_hbm [thread:$0] /*vmem=*/%s299_s15, /*size_in_granules=*/512, /*hbm=*/%s301_s18, /*dst_syncflagno=*/[#allocation3] /*
base_bounds: (4, 1)
dynamic_base_bounds: (4, 1)
window_bounds: (4, 1)
iteration_bounds: (1, 1, 1)
strides: (4, 1)
pad_low: (0, 0)
pad_high: (0, 0)
element_size_in_bytes: 4096 */ }
0x2e : { %395 = dma.done.wait [#allocation3], 512 /* pipeline-emitter-dma-wait */ }
0x2f : { %396 = vsyncadd [#allocation3], 4294966784 }
0x30 : { %308 = vsyncpa [#allocation3], 1 } /* exit bundle: %fusion = fusion(%copy-done.1, %get-tuple-element.1, %fusion.2, %get-tuple-element) */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment