Created
January 8, 2026 04:32
-
-
Save leegao/c237a207fa61f73859fe8282470f3d56 to your computer and use it in GitHub Desktop.
VLIW dump of mini_attention (softmax(x @ w1) @ w2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| !rm -rf compiler_dump | |
| !rm compiler_dump.zip | |
| import os | |
# Dump layout: HLO and LLO output each get their own directory under DUMP_ROOT.
DUMP_ROOT = "compiler_dump/"

HLO_DUMP_PATH = os.path.join(DUMP_ROOT, "hlo")
os.makedirs(HLO_DUMP_PATH, exist_ok=True)

LLO_DUMP_PATH = os.path.join(DUMP_ROOT, "llo")
os.makedirs(LLO_DUMP_PATH, exist_ok=True)

# Disabled: XLA-level HLO dump flags, kept for reference.
# os.environ["XLA_FLAGS"] = (
#     f"--xla_dump_hlo_as_text "
#     f"--xla_dump_to={HLO_DUMP_PATH} "
#     f"--xla_dump_hlo_pass_re=.* "
# )

# Flags handed to libtpu through the environment. They are joined with single
# spaces into one string and must be in place before JAX is imported below.
os.environ["LIBTPU_INIT_ARGS"] = " ".join(
    (
        f"--xla_jf_dump_to={LLO_DUMP_PATH}",
        "--xla_jf_dump_hlo_text=true",
        "--xla_jf_dump_llo_text=true",
        "--xla_jf_dump_llo_html=false",
        "--xla_jf_dump_llo_static_gaps=true",
        "--xla_jf_emit_annotations=true",
        "--xla_jf_debug_level=2",
        "--xla_jf_dump_debug_info=true",
        "--xla_jf_dump_fusion_computations=false",
    )
)
| # Import JAX after setting env vars | |
| import jax | |
| import jax.numpy as jnp | |
@jax.named_call
def softmax(h, axis=-1):
    """Compute a numerically stable softmax of ``h`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so the
    largest exponent is exactly zero. This avoids overflow in ``exp`` without
    changing the mathematical result.

    Args:
        h: Array of logits.
        axis: Axis to normalize over. Defaults to ``-1`` (row-wise for a 2-D
            input), which matches the previous hard-coded behavior.

    Returns:
        An array with the same shape as ``h``; for finite inputs the entries
        along ``axis`` sum to one.
    """
    # keepdims=True keeps the reduced axis so the results broadcast against h.
    h_max = jnp.max(h, axis=axis, keepdims=True)
    exp_h = jnp.exp(h - h_max)
    return exp_h / jnp.sum(exp_h, axis=axis, keepdims=True)
def mini_attention(x, w1, w2):
    """Tiny attention-style block: matmul, then softmax, then matmul.

    Computes ``softmax(x @ w1) @ w2`` and returns the result.
    """
    weights = softmax(x @ w1)
    return weights @ w2
# Problem sizes: x is (batch, d_in), w1 is (d_in, d_mid), w2 is (d_mid, d_out).
batch, d_in, d_mid, d_out = 64, 32, 64, 32

# One seed, split into a separate subkey for each of the three tensors.
key = jax.random.PRNGKey(42)
k1, k2, k3 = jax.random.split(key, 3)

x = jax.random.normal(k1, (batch, d_in))
# Both weight matrices are scaled down by 0.02.
w1 = jax.random.normal(k2, (d_in, d_mid)) * 0.02
w2 = jax.random.normal(k3, (d_mid, d_out)) * 0.02

# Jit for the TPU backend, lower and compile for these concrete inputs,
# then print the compiled module's text.
data = (
    jax.jit(mini_attention, backend='tpu')
    .lower(x, w1, w2)
    .compile()
    .as_text()
)
print(data)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ==> compiler_dump/llo/1767846219999249198-TLP-79-final_bundles.txt <== | |
| = control target key start | |
| LH: loop header | |
| LB: loop body | |
| LE: loop exit | |
| PB: predicated region body | |
| PF: predicated region fallthrough | |
| CT: control target | |
| = control target key end | |
| 0 : { %s237_s0 = sld [smem:[#allocation22]] } /* Start region 0 :: Start region 281 */ | |
| 0x1 : { %s238_s1 = sand.u32 134217727, %s237_s0 } | |
| 0x2 : { %s239_s2 = sor.u32 4026531840, %s238_s1 } | |
| 0x3 : { %240 = vtrace %s239_s2 } | |
| 0x4 : { %s231_s3 = sld [smem:[#allocation19]] } | |
| 0x5 : { %232 = vtrace %s231_s3 } | |
| 0x6 : { %s233_s4 = sld [smem:[#allocation20]] } | |
| 0x7 : { %234 = vtrace %s233_s4 } | |
| 0x8 : { %s235_s5 = sld [smem:[#allocation21]] } | |
| 0x9 : { %236 = vtrace %s235_s5 } | |
| 0xa : { %v218_v0 = vlaneseq } /* Start region 43 :: Start region 44 :: Start region 45 */ | |
| 0xb : { %v310_v1 = vshrl.u32 %v218_v0, 7 } | |
| 0xc : { %v220_v2 = vshrl.u32 %v310_v1, 1 ;; %v221_v3 = vand.u32 1, %v310_v1 } | |
| 0xd : { %v222_v4 = vshll.u32 %v221_v3, 2 ;; %v229_v6 = vsub.s32 %v220_v2, %v310_v1 } | |
| 0xe : { %v223_v5 = vadd.s32 %v222_v4, %v220_v2 } | |
| 0xf : { %v224_v7 = vsub.s32 %v223_v5, %v310_v1 } | |
| 0x10 : { %225 = vsetiar.raw.iar0 %v224_v7 /* EvenOdd Store IAR initialization */ } | |
| 0x11 : { %230 = vsetiar.raw.iar1 %v229_v6 /* EvenOdd Load IAR initialization */ } | |
| 0x12 : { %s98_s6 = sld [smem:[#allocation16]] } | |
| 0x13 : { %p241_p0 = scmp.eq.s32.totalorder %s98_s6, 0 } /* End region 43 */ | |
| 0x14 : { %s115_s7 = sxor.u32 (!%p241_p0), 2925155241, %s98_s6 } | |
| 0x15 : { %102 = sbr.rel (%p241_p0) target bundleno = 160 (0xa0), region = 46 } | |
| 0x16 : { %s116_s8 = smul.u32 (!%p241_p0), 2223506493, %s115_s7 } | |
| 0x17 : {} | |
| 0x18 : { %s117_s9 = sshrl.u32 (!%p241_p0), %s116_s8, 16 } | |
| 0x19 : { %s118_s10 = sxor.u32 (!%p241_p0), %s117_s9, %s116_s8 } /* End region 44 */ | |
| 0x1a : { %v107_v8 = vand.u32 127, %v218_v0 ;; %s123_s11 = smul.u32 3389127133, %s118_s10 ;; %vm242_vm0 = vcmp.eq.s32.totalorder %v310_v1, 1 ;; %vm243_vm2 = vcmp.eq.s32.totalorder %v310_v1, 2 ;; %vm244_vm3 = vcmp.eq.s32.totalorder %v310_v1, 3 } | |
| 0x1b : { %v111_v9 = vxor.u32 1135663077, %v107_v8 ;; %v125_v17 = vstv %s123_s11 } | |
| 0x1c : { %v112_v10 = vmul.u32 2925155241, %v111_v9 } | |
| 0x1d : { %v113_v11 = vshrl.u32 %v112_v10, 16 } | |
| 0x1e : { %v114_v12 = vxor.u32 %v113_v11, %v112_v10 } | |
| 0x1f : { %v119_v13 = vxor.u32 2223506493, %v114_v12 ;; %v133_v26 = vmul.u32 3389127133, %v114_v12 } | |
| 0x20 : { %v120_v14 = vmul.u32 1519409121, %v119_v13 } | |
| 0x21 : { %v121_v15 = vshrl.u32 %v120_v14, 16 } | |
| 0x22 : { %v122_v16 = vxor.u32 %v121_v15, %v120_v14 } | |
| 0x23 : { %v124_v18 = vmul.u32 1232336661, %v122_v16 } | |
| 0x24 : { %v126_v19 = vsub.s32 %v125_v17, %v124_v18 } | |
| 0x25 : { %v127_v20 = vshrl.u32 %v126_v19, 16 } | |
| 0x26 : { %v128_v21 = vxor.u32 %v127_v20, %v126_v19 } | |
| 0x27 : { %v129_v22 = vxor.u32 1519409121, %v128_v21 ;; %v142_v31 = vxor.u32 2925155241, %v128_v21 } | |
| 0x28 : { %v130_v23 = vmul.u32 2449846741, %v129_v22 ;; %v143_v34 = vmul.u32 2223506493, %v142_v31 } | |
| 0x29 : { %v131_v24 = vshrl.u32 %v130_v23, 16 ;; %v144_v37 = vshrl.u32 %v143_v34, 16 } | |
| 0x2a : { %v132_v25 = vxor.u32 %v131_v24, %v130_v23 ;; %v145_v39 = vxor.u32 %v144_v37, %v143_v34 } | |
| 0x2b : { %v134_v27 = vmul.u32 1232336661, %v132_v25 ;; %v150_v43 = vmul.u32 3389127133, %v145_v39 } | |
| 0x2c : { %v135_v28 = vsub.s32 %v133_v26, %v134_v27 } | |
| 0x2d : { %v136_v29 = vshrl.u32 %v135_v28, 16 } | |
| 0x2e : { %v137_v30 = vxor.u32 %v136_v29, %v135_v28 } | |
| 0x2f : { %v138_v32 = vxor.u32 1135663077, %v137_v30 } | |
| 0x30 : { %v139_v33 = vmul.u32 2925155241, %v138_v32 } | |
| 0x31 : { %v140_v35 = vshrl.u32 %v139_v33, 16 } | |
| 0x32 : { %v141_v36 = vxor.u32 %v140_v35, %v139_v33 } | |
| 0x33 : { %v146_v38 = vxor.u32 2223506493, %v141_v36 ;; %v159_v52 = vmul.u32 3389127133, %v141_v36 } | |
| 0x34 : { %v147_v40 = vmul.u32 1519409121, %v146_v38 } | |
| 0x35 : { %v148_v41 = vshrl.u32 %v147_v40, 16 } | |
| 0x36 : { %v149_v42 = vxor.u32 %v148_v41, %v147_v40 } | |
| 0x37 : { %v151_v44 = vmul.u32 1232336661, %v149_v42 } | |
| 0x38 : { %v152_v45 = vsub.s32 %v150_v43, %v151_v44 } | |
| 0x39 : { %v153_v46 = vshrl.u32 %v152_v45, 16 } | |
| 0x3a : { %v154_v47 = vxor.u32 %v153_v46, %v152_v45 } | |
| 0x3b : { %v155_v48 = vxor.u32 1519409121, %v154_v47 ;; %v172_v53 = vxor.u32 1179257497, %v154_v47 ;; %v188_v57 = vxor.u32 3546938817, %v154_v47 } | |
| 0x3c : { %v176_v60 = vxor.u32 461070425, %v154_v47 ;; %v192_v63 = vxor.u32 728804945, %v154_v47 } | |
| 0x3d : { %v156_v49 = vmul.u32 2449846741, %v155_v48 ;; %v173_v56 = vmul.u32 2174555301, %v172_v53 ;; %v189_v62 = vmul.u32 1343633581, %v188_v57 } | |
| 0x3e : { %v177_v6 = vmul.u32 702470093, %v176_v60 ;; %v193_v10 = vmul.u32 1920080165, %v192_v63 } | |
| 0x3f : { %v157_v50 = vshrl.u32 %v156_v49, 16 ;; %v174_v61 = vshrl.u32 %v173_v56, 16 ;; %v190_v14 = vshrl.u32 %v189_v62, 16 } | |
| 0x40 : { %v178_v18 = vshrl.u32 %v177_v6, 16 ;; %v194_v20 = vshrl.u32 %v193_v10, 16 } | |
| 0x41 : { %v158_v51 = vxor.u32 %v157_v50, %v156_v49 ;; %v175_v12 = vxor.u32 %v174_v61, %v173_v56 ;; %v191_v21 = vxor.u32 %v190_v14, %v189_v62 } | |
| 0x42 : { %v179_v25 = vxor.u32 %v178_v18, %v177_v6 ;; %v195_v27 = vxor.u32 %v194_v20, %v193_v10 } | |
| 0x43 : { %v160_v54 = vmul.u32 1232336661, %v158_v51 } | |
| 0x44 : { %v161_v55 = vsub.s32 %v159_v52, %v160_v54 } | |
| 0x45 : { %v162_v58 = vshrl.u32 %v161_v55, 16 } | |
| 0x46 : { %v163_v59 = vxor.u32 %v162_v58, %v161_v55 } | |
| 0x47 : { %v164_v0 = vxor.u32 2337405405, %v163_v59 ;; %v168_v2 = vxor.u32 747796405, %v163_v59 ;; %v180_v3 = vxor.u32 2174555301, %v163_v59 } | |
| 0x48 : { %v184_v4 = vxor.u32 702470093, %v163_v59 } | |
| 0x49 : { %v165_v5 = vmul.u32 1179257497, %v164_v0 ;; %v181_v7 = vmul.u32 3546938817, %v180_v3 ;; %v169_v8 = vmul.u32 461070425, %v168_v2 } | |
| 0x4a : { %v185_v9 = vmul.u32 728804945, %v184_v4 } | |
| 0x4b : { %v166_v11 = vshrl.u32 %v165_v5, 16 ;; %v182_v13 = vshrl.u32 %v181_v7, 16 ;; %v170_v17 = vshrl.u32 %v169_v8, 16 } | |
| 0x4c : { %v186_v19 = vshrl.u32 %v185_v9, 16 } | |
| 0x4d : { %v167_v15 = vxor.u32 %v166_v11, %v165_v5 ;; %v183_v16 = vxor.u32 %v182_v13, %v181_v7 ;; %v171_v24 = vxor.u32 %v170_v17, %v169_v8 } | |
| 0x4e : { %v187_v26 = vxor.u32 %v186_v19, %v185_v9 } | |
| 0x4f : { %v196_v22 = vor.u32 %v175_v12, %v167_v15 } | |
| 0x50 : { %v197_v23 = vor.u32 %v196_v22, %v183_v16 } | |
| 0x51 : { %v198_v28 = vor.u32 %v197_v23, %v191_v21 } | |
| 0x52 : { %vm199_vm1 = vcmp.eq.s32.totalorder %v198_v28, 0 } | |
| 0x53 : { %v209_v29 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v171_v24, /*on_false_vx=*/%v167_v15 ;; %v210_v30 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v179_v25, /*on_false_vx=*/%v175_v12 ;; %v212_v31 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v187_v26, /*on_false_vx=*/%v183_v16 ;; %v214_v32 = vsel /*vm=*/%vm199_vm1, /*on_true_vy=*/%v195_v27, /*on_false_vx=*/%v191_v21 } | |
| 0x54 : { %v211_v33 = vsel /*vm=*/%vm242_vm0, /*on_true_vy=*/%v210_v30, /*on_false_vx=*/%v209_v29 } | |
| 0x55 : { %v213_v34 = vsel /*vm=*/%vm243_vm2, /*on_true_vy=*/%v212_v31, /*on_false_vx=*/%v211_v33 } | |
| 0x56 : { %v215_v35 = vsel /*vm=*/%vm244_vm3, /*on_true_vy=*/%v214_v32, /*on_false_vx=*/%v213_v34 } | |
| 0x57 : { %216 = setrngseed %v215_v35 /* Rng seed initialization */ } | |
| 0x58 : { %v217_v36 = vrng /* Rng seed initialization */ } /* End region 45 */ | |
| 0x59 PF: { %93 = vsettm 1 } /* Start/End empty region 46 */ | |
| 0x5a : { %s295_s12 = smov 2147483646 /* materialized constant */ } | |
| 0x5b : { %92 = vsettm %s295_s12 } | |
| 0x5c : { %90 = vtrace 2415919103 } | |
| 0x5d : { %0 = vtrace 2952790016 } | |
| 0x5e : { %1 = vtrace 3221225472 } | |
| 0x5f : { %s2_s13 = sld [smem:[#allocation0]] } /* End region 281 :: Start region 1 :: Start region 2 */ | |
| 0x60 : { %p245_p1 = scmp.ne.s32.totalorder %s2_s13, 1 } /* End region 1 */ | |
| 0x61 : { %6 = sbr.rel (%p245_p1) target bundleno = 244 (0xf4), region = 4 } | |
| 0x62 : {} | |
| 0x63 : {} | |
| 0x64 : {} | |
| 0x65 : {} /* End region 2 */ | |
| 0x66 : { %s319_s4 = sld [smem:[#allocation2]] } /* Start region 3 :: Start region 282 :: Start/End empty region 6 */ | |
| 0x67 : { %337 = sst [smem:[#allocation23_spill]] %s319_s4 } | |
| 0x68 : { %s9_s14 = scalar_parameter_address 0 } /* Start/End empty region 7 */ | |
| 0x69 : { %s10_s1 = scalar_parameter_address 1 } /* Start/End empty region 8 */ | |
| 0x6a : { %33 = vtrace 2147483648 ;; %12 = compiler-scheduling-barrier ;; %14 = compiler-scheduling-barrier } /* Start/End empty region 9 :: Start/End empty region 10 :: Start/End empty region 11 :: Start region 12 */ | |
| 0x6b : { %17 = vsyncpa [#allocation8], 0 ;; %s18_s15 = sld [smem:[#allocation9]] ;; %15 = compiler-scheduling-barrier } | |
| 0x6c : { %p19_p2 = scmp.ne.s32.totalorder %s18_s15, 0 } | |
| 0x6d : { %s20_s16 = scalar_select /*predicate=*/%p19_p2, /*on_true=*/32, /*on_false=*/0 } | |
| 0x6e : { %s21_s17 = sshll.u32 %s20_s16, 4 } | |
| 0x6f : { %22 = vsyncadd [#allocation8], %s21_s17 ;; %s23_s18 = scalar_select /*predicate=*/%p19_p2, /*on_true=*/0, /*on_false=*/32 } | |
| 0x70 : { %s25_s19 = sshll.u32 %s9_s14, 4 ;; %s296_s21 = smov [#allocation7] /* materialized constant */ ;; %s26_s19 = int_to_ptr.hbm [resolvable:$true] %s25_s19 } | |
| 0x71 : { %s24_s20 = sshll.u32 %s23_s18, 4 ;; %s27_s22 = sshll.u32 %s296_s21, 4 ;; %s28_s22 = int_to_ptr.vmem [resolvable:$true] %s27_s22 } | |
| 0x72 : { %s261_s23 = sshra.s32 %s28_s22, 4 ;; %s263_s24 = sshrl.u32 %s24_s20, 4 ;; %s262_s23 = int_to_ptr.vmem [resolvable:$true] %s261_s23 } | |
| 0x73 : { %s268_s25 = scalar_lea.vmem %s262_s23, %s263_s24 ;; %s270_s26 = scalar_lea.vmem %s296_s21, 32 } | |
| 0x74 : { %p269_p3 = scmp.ne.s32.totalorder %s262_s23, %s268_s25 ;; %p272_p4 = scmp.lt.s32.totalorder %s270_s26, %s268_s25 } | |
| 0x75 : { %p274_p5 = pnand %p272_p4, %p269_p3 } | |
| 0x76 : { %277 = shalt.err (!%p274_p5) /* BoundsCheck 0 [deref of %s28] for %30 = dma.hbm_to_vmem [thread:$1] /*hbm=*/%s26, /*size_in_granules=*/%s24, /*vmem=*/%s28, /*dst_syncflagno=*/[#allocation8] | |
| hlo: copy-start | |
| */ } | |
| 0x77 : { %30 = dma.hbm_to_vmem [thread:$1] /*hbm=*/%s26_s19, /*size_in_granules=*/%s24_s20, /*vmem=*/%s28_s22, /*dst_syncflagno=*/[#allocation8] } | |
| 0x78 : { %34 = vtrace 2415919104 ;; %32 = compiler-scheduling-barrier } /* End region 12 */ | |
| 0x79 : { %46 = vtrace 2147483649 ;; %35 = compiler-scheduling-barrier ;; %37 = compiler-scheduling-barrier ;; %38 = compiler-scheduling-barrier ;; %40 = compiler-scheduling-barrier } /* Start/End empty region 15 :: Start/End empty region 16 :: Start region 17 */ | |
| 0x7a : { %291 = dma.done.wait [#allocation8], 512 /* local-dma-wait */ ;; %41 = compiler-scheduling-barrier } | |
| 0x7b : { %292 = vsyncadd [#allocation8], 4294966784 } | |
| 0x7c : { %44 = vsyncpa [#allocation8], 1 } | |
| 0x7d : { %47 = vtrace 2415919105 ;; %45 = compiler-scheduling-barrier } /* End region 282 :: End region 17 */ | |
| 0x7e : { %53 = vtrace 2147483650 } /* Start region 20 :: Start region 283 */ | |
| 0x7f : { %s297_s0 = smov [#allocation7] /* materialized constant */ ;; %s298_s2 = smov [#allocation10] /* materialized constant */ ;; %48 = compiler-scheduling-barrier } | |
| 0x80 : { %s299_s3 = smov [#allocation11] /* materialized constant */ } /* End region 283 */ | |
| 0x81 : { %50 = inlined_call %s297_s0, %s10_s1, %s298_s2, %s299_s3 /* %fusion.5 = fusion(%copy-done, %Arg_1.2) */ } | |
| 0x82 : { %54 = vtrace 2415919106 ;; %52 = compiler-scheduling-barrier } /* End region 20 :: Start region 284 :: End region 284 */ | |
| 0x83 : { %71 = vtrace 2147483651 ;; %55 = compiler-scheduling-barrier ;; %56 = compiler-scheduling-barrier ;; %57 = compiler-scheduling-barrier ;; %58 = compiler-scheduling-barrier } /* Start region 285 :: Start/End empty region 23 :: Start/End empty region 24 :: Start region 25 */ | |
| 0x84 : { %61 = vsyncpa [#allocation13], 0 ;; %s338_s27 = scalar_parameter_address 2 ;; %s300_s29 = smov [#allocation12] /* materialized constant */ ;; %59 = compiler-scheduling-barrier } | |
| 0x85 : { %s63_s28 = sshll.u32 %s338_s27, 4 ;; %s65_s30 = sshll.u32 %s300_s29, 4 ;; %s64_s28 = int_to_ptr.hbm [resolvable:$true] %s63_s28 ;; %s66_s30 = int_to_ptr.vmem [resolvable:$true] %s65_s30 } | |
| 0x86 : { %68 = dma.hbm_to_vmem [thread:$1] /*hbm=*/%s64_s28, /*size_in_granules=*/512, /*vmem=*/%s66_s30, /*dst_syncflagno=*/[#allocation13] } | |
| 0x87 : { %72 = vtrace 2415919107 ;; %70 = compiler-scheduling-barrier } /* End region 285 :: End region 25 */ | |
| 0x88 : { %76 = vtrace 2147483652 } /* Start region 28 :: Start region 286 */ | |
| 0x89 : { %s301_s0 = smov [#allocation11] /* materialized constant */ ;; %s302_s1 = smov [#allocation10] /* materialized constant */ ;; %73 = compiler-scheduling-barrier } | |
| 0x8a : { %s303_s2 = smov [#allocation14] /* materialized constant */ } /* End region 286 */ | |
| 0x8b : { %74 = inlined_call %s301_s0, %s302_s1, %s303_s2 /* %fusion.2 = fusion(%get-tuple-element.1, %get-tuple-element) */ } | |
| 0x8c : { %77 = vtrace 2415919108 ;; %75 = compiler-scheduling-barrier } /* End region 28 :: Start region 287 :: End region 287 */ | |
| 0x8d : { %83 = vtrace 2147483653 } /* Start region 288 :: Start region 31 */ | |
| 0x8e : { %293 = dma.done.wait [#allocation13], 512 /* local-dma-wait */ ;; %78 = compiler-scheduling-barrier } | |
| 0x8f : { %294 = vsyncadd [#allocation13], 4294966784 } | |
| 0x90 : { %81 = vsyncpa [#allocation13], 1 } | |
| 0x91 : { %84 = vtrace 2415919109 ;; %82 = compiler-scheduling-barrier } /* End region 288 :: End region 31 */ | |
| 0x92 : { %88 = vtrace 2147483654 } /* Start region 34 :: Start region 289 */ | |
| 0x93 : { %s304_s0 = smov [#allocation12] /* materialized constant */ ;; %s339_s4 = sld [smem:[#allocation23_spill]] ;; %85 = compiler-scheduling-barrier ;; %s340_s4 = int_to_ptr.hbm [resolvable:$false] %s339_s4 } | |
| 0x94 : { %s305_s1 = smov [#allocation11] /* materialized constant */ ;; %s306_s2 = smov [#allocation14] /* materialized constant */ } | |
| 0x95 : { %s307_s3 = smov [#allocation10] /* materialized constant */ } /* End region 289 */ | |
| 0x96 : { %86 = inlined_call %s304_s0, %s305_s1, %s306_s2, %s307_s3, %s340_s4 /* %fusion = fusion(%copy-done.1, %get-tuple-element.1, %fusion.2, %get-tuple-element) */ } | |
| 0x97 : { %89 = vtrace 2415919110 ;; %87 = compiler-scheduling-barrier } /* End region 3 :: End region 34 :: Start region 290 :: End region 290 */ | |
| 0x98 PF: { %91 = vtrace 2684354559 } /* Start/End empty region 291 :: Start/End empty region 4 :: Start region 292 */ | |
| 0x99 : { %s308_s5 = smov 2147483647 /* materialized constant */ } | |
| 0x9a : { %94 = vsettm %s308_s5 } | |
| 0x9b : { %95 = vdelay 1 } | |
| 0x9c : { %96 = sfence } | |
| 0x9d : { %s309_s6 = smov 0 /* materialized constant */ } | |
| 0x9e : { %97 = sst [smem:[#allocation15]] %s309_s6 } /* End region 0 :: End region 292 */ | |
| ==> compiler_dump/llo/1767846220022669355-fusion.2-74-final_bundles.txt <== | |
| = control target key start | |
| LH: loop header | |
| LB: loop body | |
| LE: loop exit | |
| PB: predicated region body | |
| PF: predicated region fallthrough | |
| CT: control target | |
| = control target key end | |
| 0 : { %v182_v0 = vmov 0.0 /* materialized constant */ ;; %v15_v5 = vlaneseq ;; %s236_s0 = inlined_call_operand.vmem [shape: f32[64,64], index: 0, kind: input, shape index: {}] /* operand 0 */ ;; %s237_s1 = inlined_call_operand.vmem [shape: f32[64], index: 1, kind: input, shape index: {}] /* operand 1 */ ;; %s238_s2 = inlined_call_operand.vmem [shape: f32[64], index: 2, kind: output, shape index: {}] /* operand 2 */ } /* entry bundle: %fusion.2 = fusion(%get-tuple-element.1, %get-tuple-element) */ | |
| 0x1 : { %4 = vst [vmem:[#allocation0] sm:$0xff] /*vst_source=*/%v182_v0 ;; %v7_v1 = vld [vmem:[%s236_s0] sm:$0xff] ;; %v158_v3 = vld [vmem:[%s236_s0 + $0x8] sm:$0xff] ;; %v159_v7 = vld [vmem:[%s236_s0 + $0x10] sm:$0xff] } | |
| 0x2 : { %v8_v2 = vld [vmem:[%s237_s1] ss:$0 sm:$0xff] ;; %v160_v8 = vld [vmem:[%s236_s0 + $0x18] sm:$0xff] ;; %v162_v13 = vld [vmem:[%s236_s0 + $0x28] sm:$0xff] ;; %v16_v22 = vand.u32 127, %v15_v5 } | |
| 0x3 : { %v11_v4 = vsub.f32 %v7_v1, %v8_v2 ;; %v23_v6 = vsub.f32 %v158_v3, %v8_v2 ;; %v161_v9 = vld [vmem:[%s236_s0 + $0x20] sm:$0xff] ;; %v39_v10 = vsub.f32 %v159_v7, %v8_v2 ;; %v55_v11 = vsub.f32 %v160_v8, %v8_v2 ;; %v163_v14 = vld [vmem:[%s236_s0 + $0x30] sm:$0xff] ;; %v164_v15 = vld [vmem:[%s236_s0 + $0x38] sm:$0xff] } | |
| 0x4 : { %v71_v12 = vsub.f32 %v161_v9, %v8_v2 ;; %v87_v18 = vsub.f32 %v162_v13, %v8_v2 ;; %v103_v21 = vsub.f32 %v163_v14, %v8_v2 ;; %v119_v24 = vsub.f32 %v164_v15, %v8_v2 } | |
| 0x5 : { %v13_v16 = vmul.f32 1.442695, %v11_v4 ;; %v25_v17 = vmul.f32 1.442695, %v23_v6 ;; %v41_v19 = vmul.f32 1.442695, %v39_v10 } | |
| 0x6 : { %v57_v20 = vmul.f32 1.442695, %v55_v11 ;; %v73_v23 = vmul.f32 1.442695, %v71_v12 ;; %v89_v25 = vmul.f32 1.442695, %v87_v18 } | |
| 0x7 : { %166 = vpow2.f32 %v13_v16 ;; %v105_v26 = vmul.f32 1.442695, %v103_v21 ;; %vm17_vm0 = vcmp.lt.s32.totalorder %v16_v22, 64 ;; %v121_v27 = vmul.f32 1.442695, %v119_v24 } | |
| 0x8 : { %168 = vpow2.f32 %v25_v17 ;; %v131_v56 = vld [vmem:[#allocation0] ss:$0 sm:$0xff] } | |
| 0x9 : { %170 = vpow2.f32 %v41_v19 } | |
| 0xa : { %172 = vpow2.f32 %v57_v20 } | |
| 0xb : { %174 = vpow2.f32 %v73_v23 } | |
| 0xc : { %176 = vpow2.f32 %v89_v25 } | |
| 0xd : { %v167_v28 = vpop.eup %166 ;; %178 = vpow2.f32 %v105_v26 } | |
| 0xe : { %v169_v29 = vpop.eup %168 ;; %v18_v30 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v167_v28, /*on_false_vx=*/0.0 ;; %180 = vpow2.f32 %v121_v27 } | |
| 0xf : { %v171_v31 = vpop.eup %170 ;; %v30_v32 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v169_v29, /*on_false_vx=*/0.0 } | |
| 0x10 : { %v173_v33 = vpop.eup %172 ;; %v33_v34 = vadd.f32 %v30_v32, %v18_v30 ;; %v46_v35 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v171_v31, /*on_false_vx=*/0.0 } | |
| 0x11 : { %v175_v36 = vpop.eup %174 ;; %v62_v37 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v173_v33, /*on_false_vx=*/0.0 } | |
| 0x12 : { %v177_v38 = vpop.eup %176 ;; %v49_v39 = vadd.f32 %v46_v35, %v33_v34 ;; %v78_v40 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v175_v36, /*on_false_vx=*/0.0 } | |
| 0x13 : { %v179_v41 = vpop.eup %178 ;; %v94_v43 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v177_v38, /*on_false_vx=*/0.0 } | |
| 0x14 : { %v65_v42 = vadd.f32 %v62_v37, %v49_v39 ;; %v181_v44 = vpop.eup %180 ;; %v110_v46 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v179_v41, /*on_false_vx=*/0.0 } | |
| 0x15 : { %v126_v48 = vsel /*vm=*/%vm17_vm0, /*on_true_vy=*/%v181_v44, /*on_false_vx=*/0.0 } | |
| 0x16 : { %v81_v45 = vadd.f32 %v78_v40, %v65_v42 } | |
| 0x17 : { %v97_v47 = vadd.f32 %v94_v43, %v81_v45 } | |
| 0x18 : { %v113_v49 = vadd.f32 %v110_v46, %v97_v47 } | |
| 0x19 : { %v129_v50 = vadd.f32 %v126_v48, %v113_v49 } | |
| 0x1a : { %v132_v51 = vrot.slane %v129_v50, 4 } | |
| 0x1b : { %v135_v52 = vadd.f32 %v132_v51, %v129_v50 } | |
| 0x1c : { %v137_v53 = vrot.slane %v135_v52, 2 } | |
| 0x1d : { %v140_v54 = vadd.f32 %v137_v53, %v135_v52 } | |
| 0x1e : { %v142_v55 = vrot.slane %v140_v54, 1 } | |
| 0x1f : { %v145_v57 = vadd.f32 %v142_v55, %v140_v54 } | |
| 0x20 : { %v149_v58 = vadd.f32 %v145_v57, %v131_v56 } | |
| 0x21 : { %151 = vst [vmem:[#allocation0] sm:$0x1] /*vst_source=*/%v149_v58 } | |
| 0x22 : { %v154_v59 = vld [vmem:[#allocation0] sm:$0x1] } | |
| 0x23 : { %157 = vst [vmem:[%s238_s2] sm:$0x1] /*vst_source=*/%v154_v59 } /* exit bundle: %fusion.2 = fusion(%get-tuple-element.1, %get-tuple-element) */ | |
| ==> compiler_dump/llo/1767846220022805106-fusion.5-78-final_bundles.txt <== | |
| = control target key start | |
| LH: loop header | |
| LB: loop body | |
| LE: loop exit | |
| PB: predicated region body | |
| PF: predicated region fallthrough | |
| CT: control target | |
| = control target key end | |
| 0 : { %v292_v0 = vmov -inf /* materialized constant */ ;; %s364_s0 = inlined_call_operand.vmem [shape: f32[64,32], index: 0, kind: input, shape index: {}] /* operand 0 */ ;; %s365_s1 = inlined_call_operand.hbm [shape: f32[32,64], index: 1, kind: input, shape index: {}] /* operand 1 */ ;; %s366_s2 = inlined_call_operand.vmem [shape: f32[64], index: 2, kind: output, shape index: {0}] /* operand 2 */ ;; %s367_s3 = inlined_call_operand.vmem [shape: f32[64,64], index: 3, kind: output, shape index: {1}] /* operand 3 */ } /* entry bundle: %fusion.5 = fusion(%copy-done, %Arg_1.2) */ | |
| 0x1 : { %6 = vst [vmem:[#allocation1] sm:$0xff] /*vst_source=*/%v292_v0 } | |
| 0x2 : { %7 = vsyncpa [#allocation3], 0 ;; %s15_s14 = sshll.u32 %s365_s1, 4 ;; %s293_s15 = smov [#allocation2] /* materialized constant */ ;; %s16_s14 = int_to_ptr.hbm [resolvable:$true] %s15_s14 } /* Start region 2 */ | |
| 0x3 : { %s17_s16 = sshll.u32 %s293_s15, 4 ;; %s18_s16 = int_to_ptr.vmem [resolvable:$true] %s17_s16 } | |
| 0x4 : { %20 = dma.hbm_to_vmem [thread:$0] /*hbm=*/%s16_s14, /*size_in_granules=*/512, /*vmem=*/%s18_s16, /*dst_syncflagno=*/[#allocation3] /* | |
| base_bounds: (4, 1) | |
| dynamic_base_bounds: (4, 1) | |
| window_bounds: (4, 1) | |
| iteration_bounds: (1, 1, 1) | |
| strides: (4, 1) | |
| pad_low: (0, 0) | |
| pad_high: (0, 0) | |
| element_size_in_bytes: 4096 */ } | |
| 0x5 : { %290 = dma.done.wait [#allocation3], 512 /* pipeline-emitter-dma-wait */ } | |
| 0x6 : { %291 = vsyncadd [#allocation3], 4294966784 ;; %v43_v1 = vld [vmem:[#allocation2] sm:$0xff] ;; %v242_v2 = vld [vmem:[%s364_s0 + $0x18] sm:$0xff] ;; %v106_v22 = vlaneseq } | |
| 0x7 : { %61 = vxpose.xlu0.b32.start [1/4] (short) (narrow) /*vx=*/%v43_v1, /*width=*/64 ;; %v243_v3 = vld [vmem:[%s364_s0 + $0x10] sm:$0xff] ;; %86 = vmatpush.msra.mxu0 %v242_v2 ;; %v244_v4 = vld [vmem:[%s364_s0 + $0x8] sm:$0xff] ;; %v53_v6 = vld [vmem:[#allocation2 + $0x10] sm:$0xff] } | |
| 0x8 : { %252 = vmatpush.msra.mxu1 %v242_v2 ;; %253 = vmatpush.msra.mxu2 %v242_v2 ;; %v48_v5 = vld [vmem:[#allocation2 + $0x8] sm:$0xff] ;; %v58_v7 = vld [vmem:[#allocation2 + $0x18] sm:$0xff] ;; %v107_v23 = vand.u32 127, %v106_v22 } | |
| 0x9 : { %254 = vmatpush.msra.mxu3 %v242_v2 ;; %92 = vmatpush.msra.mxu0 %v243_v3 ;; %v99_v8 = vld [vmem:[%s364_s0] sm:$0xff] } | |
| 0xa : { %255 = vmatpush.msra.mxu1 %v243_v3 ;; %256 = vmatpush.msra.mxu2 %v243_v3 ;; %vm108_vm0 = vcmp.lt.s32.totalorder %v107_v23, 64 ;; %v210_v47 = vld [vmem:[#allocation1] ss:$0 sm:$0xff] } | |
| 0xb : { %257 = vmatpush.msra.mxu3 %v243_v3 ;; %98 = vmatpush.msra.mxu0 %v244_v4 } | |
| 0xc : { %258 = vmatpush.msra.mxu1 %v244_v4 ;; %259 = vmatpush.msra.mxu2 %v244_v4 } | |
| 0xd : { %260 = vmatpush.msra.mxu3 %v244_v4 ;; %102 = vmatpush.msra.mxu0 %v99_v8 } | |
| 0xe : { %261 = vmatpush.msra.mxu1 %v99_v8 ;; %262 = vmatpush.msra.mxu2 %v99_v8 } | |
| 0xf : { %62 = vxpose.xlu0.b32.cont [2/4] (short) (narrow) /*vx=*/%v48_v5, /*width=*/64 ;; %263 = vmatpush.msra.mxu3 %v99_v8 } | |
| 0x10 : { %63 = vxpose.xlu0.b32.cont [3/4] (short) (narrow) /*vx=*/%v53_v6, /*width=*/64 } | |
| 0x11 : { %64 = vxpose.xlu0.b32.end [4/4] (short) (narrow) /*vx=*/%v58_v7, /*width=*/64 } | |
| 0x12 : { %v65_v9 = vpop.trf.xlu0 } | |
| 0x13 : { %103 = vmatmul.f32.vlgmr.msra.gmra.mxu0 %v65_v9 } | |
| 0x14 : { %v66_v10 = vpop.trf.xlu0 } | |
| 0x15 : { %111 = vmatmul.f32.gmra.mxu0 %v66_v10 } | |
| 0x16 : { %v67_v11 = vpop.trf.xlu0 } | |
| 0x17 : { %125 = vmatmul.f32.vlgmr.msra.gmra.mxu1 %v67_v11 } | |
| 0x18 : { %v68_v12 = vpop.trf.xlu0 } | |
| 0x19 : { %139 = vmatmul.f32.gmra.mxu1 %v68_v12 } | |
| 0x1a : { %v69_v13 = vpop.trf.xlu0 } | |
| 0x1b : { %153 = vmatmul.f32.vlgmr.msra.gmra.mxu2 %v69_v13 } | |
| 0x1c : { %v70_v14 = vpop.trf.xlu0 } | |
| 0x1d : { %167 = vmatmul.f32.gmra.mxu2 %v70_v14 } | |
| 0x1e : { %v71_v15 = vpop.trf.xlu0 } | |
| 0x1f : { %181 = vmatmul.f32.vlgmr.msra.gmra.mxu3 %v71_v15 } | |
| 0x20 : { %v72_v16 = vpop.trf.xlu0 } | |
| 0x21 : { %195 = vmatmul.f32.gmra.mxu3 %v72_v16 } | |
| 0x22 : { %v104_v17 = vpop.f32.mrf.mxu0 } | |
| 0x23 : { %110 = vst [vmem:[%s367_s3] sm:$0xff] /*vst_source=*/%v104_v17 ;; %v109_v26 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v104_v17, /*on_false_vx=*/-inf } | |
| 0x24 : { %v112_v18 = vpop.f32.mrf.mxu0 } | |
| 0x25 : { %245 = vst [vmem:[%s367_s3 + $0x8] sm:$0xff] /*vst_source=*/%v112_v18 ;; %v118_v25 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v112_v18, /*on_false_vx=*/-inf } | |
| 0x26 : { %v121_v28 = vmax.f32 %v109_v26, %v118_v25 } | |
| 0x27 : { %v126_v19 = vpop.f32.mrf.mxu1 } | |
| 0x28 : { %246 = vst [vmem:[%s367_s3 + $0x10] sm:$0xff] /*vst_source=*/%v126_v19 ;; %v132_v27 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v126_v19, /*on_false_vx=*/-inf } | |
| 0x29 : { %v135_v30 = vmax.f32 %v121_v28, %v132_v27 } | |
| 0x2a : { %v140_v20 = vpop.f32.mrf.mxu1 } | |
| 0x2b : { %247 = vst [vmem:[%s367_s3 + $0x18] sm:$0xff] /*vst_source=*/%v140_v20 ;; %v146_v29 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v140_v20, /*on_false_vx=*/-inf } | |
| 0x2c : { %v149_v33 = vmax.f32 %v135_v30, %v146_v29 } | |
| 0x2d : { %v154_v21 = vpop.f32.mrf.mxu2 } | |
| 0x2e : { %248 = vst [vmem:[%s367_s3 + $0x20] sm:$0xff] /*vst_source=*/%v154_v21 ;; %v160_v32 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v154_v21, /*on_false_vx=*/-inf } | |
| 0x2f : { %v163_v35 = vmax.f32 %v149_v33, %v160_v32 } | |
| 0x30 : { %v168_v24 = vpop.f32.mrf.mxu2 } | |
| 0x31 : { %249 = vst [vmem:[%s367_s3 + $0x28] sm:$0xff] /*vst_source=*/%v168_v24 ;; %v174_v34 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v168_v24, /*on_false_vx=*/-inf } | |
| 0x32 : { %v177_v37 = vmax.f32 %v163_v35, %v174_v34 } | |
| 0x33 : { %v182_v31 = vpop.f32.mrf.mxu3 } | |
| 0x34 : { %250 = vst [vmem:[%s367_s3 + $0x30] sm:$0xff] /*vst_source=*/%v182_v31 ;; %v188_v36 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v182_v31, /*on_false_vx=*/-inf } | |
| 0x35 : { %v191_v38 = vmax.f32 %v177_v37, %v188_v36 } | |
| 0x36 : { %v196_v39 = vpop.f32.mrf.mxu3 } | |
| 0x37 : { %v202_v40 = vsel /*vm=*/%vm108_vm0, /*on_true_vy=*/%v196_v39, /*on_false_vx=*/-inf ;; %251 = vst [vmem:[%s367_s3 + $0x38] sm:$0xff] /*vst_source=*/%v196_v39 } | |
| 0x38 : { %v205_v41 = vmax.f32 %v191_v38, %v202_v40 } | |
| 0x39 : { %v211_v42 = vrot.slane %v205_v41, 4 } | |
| 0x3a : { %v214_v43 = vmax.f32 %v205_v41, %v211_v42 } | |
| 0x3b : { %v216_v44 = vrot.slane %v214_v43, 2 } | |
| 0x3c : { %v219_v45 = vmax.f32 %v214_v43, %v216_v44 } | |
| 0x3d : { %v221_v46 = vrot.slane %v219_v45, 1 } | |
| 0x3e : { %v224_v48 = vmax.f32 %v219_v45, %v221_v46 } | |
| 0x3f : { %v228_v49 = vmax.f32 %v210_v47, %v224_v48 } | |
| 0x40 : { %230 = vst [vmem:[#allocation1] sm:$0x1] /*vst_source=*/%v228_v49 } | |
| 0x41 : { %235 = vsyncpa [#allocation3], 1 } /* End region 2 */ | |
| 0x42 : { %v238_v50 = vld [vmem:[#allocation1] sm:$0x1] } | |
| 0x43 : { %241 = vst [vmem:[%s366_s2] sm:$0x1] /*vst_source=*/%v238_v50 } /* exit bundle: %fusion.5 = fusion(%copy-done, %Arg_1.2) */ | |
| ==> compiler_dump/llo/1767846220024102775-fusion-78-final_bundles.txt <== | |
| = control target key start | |
| LH: loop header | |
| LB: loop body | |
| LE: loop exit | |
| PB: predicated region body | |
| PF: predicated region fallthrough | |
| CT: control target | |
| = control target key end | |
| 0 : { %v14_v9 = vlaneseq ;; %s471_s0 = inlined_call_operand.vmem [shape: f32[64,32], index: 0, kind: input, shape index: {}] /* operand 0 */ ;; %s472_s1 = inlined_call_operand.vmem [shape: f32[64,64], index: 1, kind: input, shape index: {}] /* operand 1 */ ;; %s473_s2 = inlined_call_operand.vmem [shape: f32[64], index: 2, kind: input, shape index: {}] /* operand 2 */ ;; %s474_s3 = inlined_call_operand.vmem [shape: f32[64], index: 3, kind: input, shape index: {}] /* operand 3 */ ;; %s475_s4 = inlined_call_operand.hbm [shape: f32[64,32], index: 4, kind: output, shape index: {}] /* operand 4 */ } /* entry bundle: %fusion = fusion(%copy-done.1, %get-tuple-element.1, %fusion.2, %get-tuple-element) */ | |
| 0x1 : { %v312_v0 = vld [vmem:[%s472_s1 + $0x38] sm:$0xff] ;; %v61_v1 = vld [vmem:[%s473_s2] ss:$0 sm:$0xff] ;; %v313_v4 = vld [vmem:[%s472_s1 + $0x30] sm:$0xff] } | |
| 0x2 : { %v62_v2 = vld [vmem:[%s474_s3] ss:$0 sm:$0xff] ;; %353 = vrcp.f32 %v61_v1 ;; %v314_v5 = vld [vmem:[%s472_s1 + $0x28] sm:$0xff] ;; %v316_v11 = vld [vmem:[%s472_s1 + $0x18] sm:$0xff] } | |
| 0x3 : { %v65_v3 = vsub.f32 %v312_v0, %v62_v2 ;; %v92_v6 = vsub.f32 %v313_v4, %v62_v2 ;; %v315_v7 = vld [vmem:[%s472_s1 + $0x20] sm:$0xff] ;; %v119_v8 = vsub.f32 %v314_v5, %v62_v2 ;; %v317_v14 = vld [vmem:[%s472_s1 + $0x10] sm:$0xff] } | |
| 0x4 : { %v146_v13 = vsub.f32 %v315_v7, %v62_v2 } | |
| 0x5 : { %v67_v10 = vmul.f32 1.442695, %v65_v3 ;; %v94_v12 = vmul.f32 1.442695, %v92_v6 } | |
| 0x6 : { %5 = vsyncpa [#allocation3], 0 ;; %v121_v16 = vmul.f32 1.442695, %v119_v8 ;; %v173_v17 = vsub.f32 %v316_v11, %v62_v2 ;; %v447_v18 = vand.u32 127, %v14_v9 ;; %v80_v20 = vand.u32 2147483648, %v61_v1 } | |
| 0x7 : { %355 = vpow2.f32 %v67_v10 ;; %v318_v21 = vld [vmem:[%s472_s1 + $0x8] sm:$0xff] ;; %v148_v22 = vmul.f32 1.442695, %v146_v13 ;; %v200_v23 = vsub.f32 %v317_v14, %v62_v2 ;; %vm74_vm0 = vweird.f32 %v61_v1 ;; %v249_v26 = vld [vmem:[%s472_s1] sm:$0xff] ;; %v310_v54 = vld [vmem:[%s471_s0 + $0x10] sm:$0xff] ;; %s397_s14 = smov [#allocation1] /* materialized constant */ } | |
| 0x8 : { %v354_v15 = vpop.eup %353 ;; %357 = vpow2.f32 %v94_v12 ;; %v78_v25 = vand.u32 2147483647, %v61_v1 ;; %v175_v27 = vmul.f32 1.442695, %v173_v17 ;; %v227_v28 = vsub.f32 %v318_v21, %v62_v2 ;; %v311_v55 = vld [vmem:[%s471_s0 + $0x18] sm:$0xff] ;; %v32_v56 = vld [vmem:[%s471_s0] sm:$0xff] } | |
| 0x9 : { %v70_v19 = vmul.f32 %v354_v15, %v61_v1 ;; %vm75_vm1 = vweird.f32 %v354_v15 ;; %359 = vpow2.f32 %v121_v16 ;; %v81_v30 = vor.u32 1.1754944e-38, %v80_v20 ;; %v309_v57 = vld [vmem:[%s471_s0 + $0x8] sm:$0xff] ;; %s298_s15 = sshll.u32 %s397_s14, 4 ;; %s300_s18 = sshll.u32 %s475_s4, 4 ;; %s299_s15 = int_to_ptr.vmem [resolvable:$true] %s298_s15 ;; %s301_s18 = int_to_ptr.hbm [resolvable:$true] %s300_s18 } | |
| 0xa : { %vm76_vm2 = vmor %vm74_vm0, %vm75_vm1 ;; %361 = vpow2.f32 %v148_v22 ;; %v202_v31 = vmul.f32 1.442695, %v200_v23 ;; %v252_v32 = vsub.f32 %v249_v26, %v62_v2 ;; %vm79_vm3 = vcmp.eq.f32.partialorder %v78_v25, 8.507059e+37 } | |
| 0xb : { %v71_v24 = vsub.f32 1.0, %v70_v19 ;; %363 = vpow2.f32 %v175_v27 ;; %v229_v36 = vmul.f32 1.442695, %v227_v28 ;; %vm351_vm4 = vcmp.lt.s32.totalorder %v447_v18, 64 } | |
| 0xc : { %365 = vpow2.f32 %v202_v31 ;; %v254_v40 = vmul.f32 1.442695, %v252_v32 } | |
| 0xd : { %v72_v29 = vmul.f32 %v354_v15, %v71_v24 ;; %v356_v33 = vpop.eup %355 ;; %367 = vpow2.f32 %v229_v36 } | |
| 0xe : { %v358_v35 = vpop.eup %357 ;; %369 = vpow2.f32 %v254_v40 } | |
| 0xf : { %v73_v34 = vadd.f32 %v354_v15, %v72_v29 ;; %v360_v38 = vpop.eup %359 } | |
| 0x10 : { %v362_v43 = vpop.eup %361 } | |
| 0x11 : { %v77_v37 = vsel /*vm=*/%vm76_vm2, /*on_true_vy=*/%v354_v15, /*on_false_vx=*/%v73_v34 ;; %v364_v45 = vpop.eup %363 } | |
| 0x12 : { %v82_v39 = vsel /*vm=*/%vm79_vm3, /*on_true_vy=*/%v81_v30, /*on_false_vx=*/%v77_v37 ;; %v366_v47 = vpop.eup %365 } | |
| 0x13 : { %v83_v41 = vmul.f32 %v356_v33, %v82_v39 ;; %v110_v42 = vmul.f32 %v358_v35, %v82_v39 ;; %v137_v44 = vmul.f32 %v360_v38, %v82_v39 ;; %v164_v46 = vmul.f32 %v362_v43, %v82_v39 ;; %v368_v49 = vpop.eup %367 } | |
| 0x14 : { %v191_v48 = vmul.f32 %v364_v45, %v82_v39 ;; %v218_v50 = vmul.f32 %v366_v47, %v82_v39 ;; %v370_v51 = vpop.eup %369 ;; %v245_v52 = vmul.f32 %v368_v49, %v82_v39 } | |
| 0x15 : { %328 = vmatpush.msra.mxu2 %v83_v41 ;; %329 = vmatpush.msra.mxu3 %v83_v41 ;; %v270_v53 = vmul.f32 %v370_v51, %v82_v39 } | |
| 0x16 : { %86 = vmatpush.msra.mxu0 %v83_v41 ;; %327 = vmatpush.msra.mxu1 %v83_v41 } | |
| 0x17 : { %331 = vmatpush.msra.mxu2 %v110_v42 ;; %332 = vmatpush.msra.mxu3 %v110_v42 } | |
| 0x18 : { %113 = vmatpush.msra.mxu0 %v110_v42 ;; %330 = vmatpush.msra.mxu1 %v110_v42 } | |
| 0x19 : { %334 = vmatpush.msra.mxu2 %v137_v44 ;; %335 = vmatpush.msra.mxu3 %v137_v44 } | |
| 0x1a : { %140 = vmatpush.msra.mxu0 %v137_v44 ;; %333 = vmatpush.msra.mxu1 %v137_v44 } | |
| 0x1b : { %337 = vmatpush.msra.mxu2 %v164_v46 ;; %338 = vmatpush.msra.mxu3 %v164_v46 } | |
| 0x1c : { %167 = vmatpush.msra.mxu0 %v164_v46 ;; %336 = vmatpush.msra.mxu1 %v164_v46 } | |
| 0x1d : { %340 = vmatpush.msra.mxu2 %v191_v48 ;; %341 = vmatpush.msra.mxu3 %v191_v48 } | |
| 0x1e : { %194 = vmatpush.msra.mxu0 %v191_v48 ;; %339 = vmatpush.msra.mxu1 %v191_v48 } | |
| 0x1f : { %343 = vmatpush.msra.mxu2 %v218_v50 ;; %344 = vmatpush.msra.mxu3 %v218_v50 } | |
| 0x20 : { %221 = vmatpush.msra.mxu0 %v218_v50 ;; %342 = vmatpush.msra.mxu1 %v218_v50 } | |
| 0x21 : { %346 = vmatpush.msra.mxu2 %v245_v52 ;; %347 = vmatpush.msra.mxu3 %v245_v52 } | |
| 0x22 : { %248 = vmatpush.msra.mxu0 %v245_v52 ;; %345 = vmatpush.msra.mxu1 %v245_v52 } | |
| 0x23 : { %349 = vmatpush.msra.mxu2 %v270_v53 ;; %350 = vmatpush.msra.mxu3 %v270_v53 } | |
| 0x24 : { %324 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu2 %vm351_vm4, %v310_v54 ;; %326 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu3 %vm351_vm4, %v311_v55 } | |
| 0x25 : { %273 = vmatpush.msra.mxu0 %v270_v53 ;; %348 = vmatpush.msra.mxu1 %v270_v53 } | |
| 0x26 : { %320 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu0 %vm351_vm4, %v32_v56 ;; %322 = vmatmul.msk.f32.vlgmr.msra.gmra.mxu1 %vm351_vm4, %v309_v57 } | |
| 0x27 : { %v275_v58 = vpop.f32.mrf.mxu0 ;; %v278_v59 = vpop.f32.mrf.mxu1 } | |
| 0x28 : { %276 = vst [vmem:[#allocation1] sm:$0xff] /*vst_source=*/%v275_v58 } | |
| 0x29 : { %281 = vst [vmem:[#allocation1 + $0x8] sm:$0xff] /*vst_source=*/%v278_v59 } | |
| 0x2a : { %v283_v60 = vpop.f32.mrf.mxu2 ;; %v288_v61 = vpop.f32.mrf.mxu3 } | |
| 0x2b : { %286 = vst [vmem:[#allocation1 + $0x10] sm:$0xff] /*vst_source=*/%v283_v60 } | |
| 0x2c : { %291 = vst [vmem:[#allocation1 + $0x18] sm:$0xff] /*vst_source=*/%v288_v61 } | |
| 0x2d : { %303 = dma.vmem_to_hbm [thread:$0] /*vmem=*/%s299_s15, /*size_in_granules=*/512, /*hbm=*/%s301_s18, /*dst_syncflagno=*/[#allocation3] /* | |
| base_bounds: (4, 1) | |
| dynamic_base_bounds: (4, 1) | |
| window_bounds: (4, 1) | |
| iteration_bounds: (1, 1, 1) | |
| strides: (4, 1) | |
| pad_low: (0, 0) | |
| pad_high: (0, 0) | |
| element_size_in_bytes: 4096 */ } | |
| 0x2e : { %395 = dma.done.wait [#allocation3], 512 /* pipeline-emitter-dma-wait */ } | |
| 0x2f : { %396 = vsyncadd [#allocation3], 4294966784 } | |
| 0x30 : { %308 = vsyncpa [#allocation3], 1 } /* exit bundle: %fusion = fusion(%copy-done.1, %get-tuple-element.1, %fusion.2, %get-tuple-element) */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment