Created
November 26, 2025 15:26
-
-
Save bjacob/6532650ac7c5f7ae4c3bec8bf0f08ad4 to your computer and use it in GitHub Desktop.
PC-sampling profile of Llama 405b FP4 prefill on MI350
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Data from command: | |
| /tmp/xx/iree-build/tools/iree-benchmark-module --device=hip://0 --device_allocator=caching --hip_use_streams=true --module=/home/ossci/iree-model-benchmark/llama3/tmp/base.405b_fp4.vmfb --parameters=model=/tmp/fp4_preshuffled_2025_09_12.irpa --function=prefill_bs4 --input=@/tmp/args_bs4_2500/prefill_input0_tokens.npy --input=@/tmp/args_bs4_2500/prefill_input1_seq_lens.npy --input=@/tmp/args_bs4_2500/prefill_input2_seq_block_ids.npy --input=@/tmp/args_bs4_2500/prefill_input3_kv_cache_state.npy --benchmark_repetitions=1 | |
| Took 41.44 seconds | |
| 5330711 samples collected | |
| +-----+----------------------------------------------------+---------------+-----------------+----------------+------------+ | |
| | | kernel | duration[s] | % of gpu time | % of samples | selected | | |
| |-----+----------------------------------------------------+---------------+-----------------+----------------+------------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | 2.41 | 24.22 | 23.87 | * | | |
| | 1 | prefill_bs4$async_dispatch_23_reduction_Dx53248x51 | 2.38 | 23.88 | 24.22 | * | | |
| | 2 | prefill_bs4$async_dispatch_25_reduction_Dx16384x16 | 2.21 | 22.17 | 23.13 | * | | |
| | 3 | prefill_bs4$async_dispatch_16_attention_4x8x16xDx1 | 1.05 | 10.59 | 10.05 | * | | |
| | 4 | prefill_bs4$async_dispatch_4_reduction_Dx16384x512 | 0.7 | 7.03 | 6.67 | * | | |
| | 5 | prefill_bs4$async_dispatch_18_reduction_Dx16384x51 | 0.62 | 6.22 | 6.71 | * | | |
| | 6 | prefill_bs4$async_dispatch_3_reduction_Dx32_f32.kd | 0.21 | 2.15 | 2.2 | * | | |
| | 7 | prefill_bs4$async_dispatch_2_reduction_Dx16384_f32 | 0.07 | 0.66 | 0.57 | * | | |
| | 8 | prefill_bs4$async_dispatch_2904_matmul_like_Dx1282 | 0.05 | 0.55 | 0.56 | * | | |
| | 9 | prefill_bs4$async_dispatch_5_reduction_Dx1024x512x | 0.04 | 0.41 | 0.44 | * | | |
| | 10 | prefill_bs4$async_dispatch_6_reduction_Dx1024x512x | 0.04 | 0.41 | 0.43 | * | | |
| | 11 | prefill_bs4$async_dispatch_9_slow_memcpy.kd | 0.04 | 0.39 | 0.28 | * | | |
| | 12 | prefill_bs4$async_dispatch_10_slow_memcpy.kd | 0.04 | 0.39 | 0.28 | * | | |
| | 13 | prefill_bs4$async_dispatch_1_elementwise_D_f16xf32 | 0.04 | 0.38 | 0.29 | * | | |
| | 14 | prefill_bs4$async_dispatch_15_elementwise_broadcas | 0.03 | 0.34 | 0.15 | * | | |
| | 15 | prefill_bs4$async_dispatch_2892_attention_4x8x16xD | 0.01 | 0.08 | 0.08 | * | | |
| | 16 | prefill_bs4$async_dispatch_11_slow_memcpy.kd | 0 | 0.03 | 0.02 | * | | |
| | 17 | prefill_bs4$async_dispatch_12_slow_memcpy.kd | 0 | 0.03 | 0.02 | * | | |
| | 18 | __amd_rocclr_copyBuffer.kd | 0 | 0.01 | 0 | * | | |
| | 19 | prefill_bs4$async_dispatch_2883_reduction_Dx1024x5 | 0 | 0 | 0 | * | | |
| | 20 | prefill_bs4$async_dispatch_2889_slow_memcpy.kd | 0 | 0 | 0 | * | | |
| | 21 | prefill_bs4$async_dispatch_2890_slow_memcpy.kd | 0 | 0 | 0 | * | | |
| | 22 | prefill_bs4$async_dispatch_0_elementwise_broadcast | 0 | 0 | 0 | * | | |
| | 23 | prefill_bs4$async_dispatch_2382_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 24 | prefill_bs4$async_dispatch_2336_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 25 | prefill_bs4$async_dispatch_13_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 26 | prefill_bs4$async_dispatch_2887_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 27 | prefill_bs4$async_dispatch_1255_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 28 | prefill_bs4$async_dispatch_128_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 29 | prefill_bs4$async_dispatch_2658_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 30 | prefill_bs4$async_dispatch_2589_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 31 | prefill_bs4$async_dispatch_1071_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 32 | prefill_bs4$async_dispatch_2727_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 33 | prefill_bs4$async_dispatch_1347_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 34 | prefill_bs4$async_dispatch_243_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 35 | prefill_bs4$async_dispatch_2796_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 36 | prefill_bs4$async_dispatch_1278_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 37 | prefill_bs4$async_dispatch_220_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 38 | prefill_bs4$async_dispatch_2290_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 39 | prefill_bs4$async_dispatch_105_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 40 | prefill_bs4$async_dispatch_2198_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 41 | prefill_bs4$async_dispatch_2750_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 42 | prefill_bs4$async_dispatch_1370_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 43 | prefill_bs4$async_dispatch_2244_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 44 | prefill_bs4$async_dispatch_542_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 45 | prefill_bs4$async_dispatch_657_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 46 | prefill_bs4$async_dispatch_2106_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 47 | prefill_bs4$async_dispatch_289_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 48 | prefill_bs4$async_dispatch_1462_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 49 | prefill_bs4$async_dispatch_1209_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 50 | prefill_bs4$async_dispatch_841_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 51 | prefill_bs4$async_dispatch_1301_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 52 | prefill_bs4$async_dispatch_2819_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 53 | prefill_bs4$async_dispatch_749_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 54 | prefill_bs4$async_dispatch_335_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 55 | prefill_bs4$async_dispatch_2474_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 56 | prefill_bs4$async_dispatch_2359_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 57 | prefill_bs4$async_dispatch_1094_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 58 | prefill_bs4$async_dispatch_2865_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 59 | prefill_bs4$async_dispatch_2612_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 60 | prefill_bs4$async_dispatch_174_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 61 | prefill_bs4$async_dispatch_2451_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 62 | prefill_bs4$async_dispatch_519_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 63 | prefill_bs4$async_dispatch_1600_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 64 | prefill_bs4$async_dispatch_1991_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 65 | prefill_bs4$async_dispatch_2428_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 66 | prefill_bs4$async_dispatch_979_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 67 | prefill_bs4$async_dispatch_726_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 68 | prefill_bs4$async_dispatch_36_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 69 | prefill_bs4$async_dispatch_2083_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 70 | prefill_bs4$async_dispatch_2543_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 71 | prefill_bs4$async_dispatch_2267_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 72 | prefill_bs4$async_dispatch_2773_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 73 | prefill_bs4$async_dispatch_2060_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 74 | prefill_bs4$async_dispatch_818_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 75 | prefill_bs4$async_dispatch_82_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 76 | prefill_bs4$async_dispatch_1623_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 77 | prefill_bs4$async_dispatch_1761_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 78 | prefill_bs4$async_dispatch_2129_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 79 | prefill_bs4$async_dispatch_680_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 80 | prefill_bs4$async_dispatch_427_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 81 | prefill_bs4$async_dispatch_1554_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 82 | prefill_bs4$async_dispatch_1669_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 83 | prefill_bs4$async_dispatch_1117_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 84 | prefill_bs4$async_dispatch_1807_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 85 | prefill_bs4$async_dispatch_2221_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 86 | prefill_bs4$async_dispatch_496_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 87 | prefill_bs4$async_dispatch_1393_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 88 | prefill_bs4$async_dispatch_1026_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 89 | prefill_bs4$async_dispatch_2130_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 90 | prefill_bs4$async_dispatch_887_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 91 | prefill_bs4$async_dispatch_772_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 92 | prefill_bs4$async_dispatch_864_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 93 | prefill_bs4$async_dispatch_1163_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 94 | prefill_bs4$async_dispatch_1439_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 95 | prefill_bs4$async_dispatch_1945_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 96 | prefill_bs4$async_dispatch_1899_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 97 | prefill_bs4$async_dispatch_910_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 98 | prefill_bs4$async_dispatch_1324_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 99 | prefill_bs4$async_dispatch_1072_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 100 | prefill_bs4$async_dispatch_2566_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 101 | prefill_bs4$async_dispatch_2313_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 102 | prefill_bs4$async_dispatch_1416_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 103 | prefill_bs4$async_dispatch_1922_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 104 | prefill_bs4$async_dispatch_1646_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 105 | prefill_bs4$async_dispatch_1048_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 106 | prefill_bs4$async_dispatch_266_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 107 | prefill_bs4$async_dispatch_1485_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 108 | prefill_bs4$async_dispatch_1830_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 109 | prefill_bs4$async_dispatch_1508_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 110 | prefill_bs4$async_dispatch_2704_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 111 | prefill_bs4$async_dispatch_2842_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 112 | prefill_bs4$async_dispatch_2405_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 113 | prefill_bs4$async_dispatch_358_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 114 | prefill_bs4$async_dispatch_1025_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 115 | prefill_bs4$async_dispatch_1140_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 116 | prefill_bs4$async_dispatch_1186_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 117 | prefill_bs4$async_dispatch_980_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 118 | prefill_bs4$async_dispatch_1232_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 119 | prefill_bs4$async_dispatch_2152_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 120 | prefill_bs4$async_dispatch_588_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 121 | prefill_bs4$async_dispatch_2014_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 122 | prefill_bs4$async_dispatch_795_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 123 | prefill_bs4$async_dispatch_1784_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 124 | prefill_bs4$async_dispatch_2175_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 125 | prefill_bs4$async_dispatch_1118_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 126 | prefill_bs4$async_dispatch_956_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 127 | prefill_bs4$async_dispatch_404_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 128 | prefill_bs4$async_dispatch_2497_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 129 | prefill_bs4$async_dispatch_2520_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 130 | prefill_bs4$async_dispatch_1968_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 131 | prefill_bs4$async_dispatch_2037_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 132 | prefill_bs4$async_dispatch_59_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 133 | prefill_bs4$async_dispatch_611_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 134 | prefill_bs4$async_dispatch_151_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 135 | prefill_bs4$async_dispatch_1555_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 136 | prefill_bs4$async_dispatch_1853_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 137 | prefill_bs4$async_dispatch_634_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 138 | prefill_bs4$async_dispatch_1692_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 139 | prefill_bs4$async_dispatch_1577_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 140 | prefill_bs4$async_dispatch_1738_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 141 | prefill_bs4$async_dispatch_1876_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 142 | prefill_bs4$async_dispatch_565_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 143 | prefill_bs4$async_dispatch_1715_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 144 | prefill_bs4$async_dispatch_312_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 145 | prefill_bs4$async_dispatch_1531_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 146 | prefill_bs4$async_dispatch_1002_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 147 | prefill_bs4$async_dispatch_2681_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 148 | prefill_bs4$async_dispatch_197_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 149 | prefill_bs4$async_dispatch_473_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 150 | prefill_bs4$async_dispatch_1946_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 151 | prefill_bs4$async_dispatch_933_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 152 | prefill_bs4$async_dispatch_1463_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 153 | prefill_bs4$async_dispatch_450_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 154 | prefill_bs4$async_dispatch_703_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 155 | prefill_bs4$async_dispatch_428_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 156 | prefill_bs4$async_dispatch_381_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 157 | prefill_bs4$async_dispatch_2635_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 158 | prefill_bs4$async_dispatch_635_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 159 | prefill_bs4$async_dispatch_2199_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 160 | prefill_bs4$async_dispatch_2222_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 161 | prefill_bs4$async_dispatch_2820_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 162 | prefill_bs4$async_dispatch_37_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 163 | prefill_bs4$async_dispatch_1624_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 164 | prefill_bs4$async_dispatch_1693_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 165 | prefill_bs4$async_dispatch_2337_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 166 | prefill_bs4$async_dispatch_1417_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 167 | prefill_bs4$async_dispatch_2843_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 168 | prefill_bs4$async_dispatch_1394_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 169 | prefill_bs4$async_dispatch_1279_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 170 | prefill_bs4$async_dispatch_359_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 171 | prefill_bs4$async_dispatch_451_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 172 | prefill_bs4$async_dispatch_704_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 173 | prefill_bs4$async_dispatch_681_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 174 | prefill_bs4$async_dispatch_1854_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 175 | prefill_bs4$async_dispatch_1739_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 176 | prefill_bs4$async_dispatch_2797_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 177 | prefill_bs4$async_dispatch_1923_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 178 | prefill_bs4$async_dispatch_2429_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 179 | prefill_bs4$async_dispatch_1233_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 180 | prefill_bs4$async_dispatch_1808_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 181 | prefill_bs4$async_dispatch_267_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 182 | prefill_bs4$async_dispatch_2521_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 183 | prefill_bs4$async_dispatch_1578_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 184 | prefill_bs4$async_dispatch_2176_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 185 | prefill_bs4$async_dispatch_2383_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 186 | prefill_bs4$async_dispatch_290_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 187 | prefill_bs4$async_dispatch_1716_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 188 | prefill_bs4$async_dispatch_566_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 189 | prefill_bs4$async_dispatch_1440_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 190 | prefill_bs4$async_dispatch_1302_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 191 | prefill_bs4$async_dispatch_2268_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 192 | prefill_bs4$async_dispatch_2153_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 193 | prefill_bs4$async_dispatch_2475_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 194 | prefill_bs4$async_dispatch_244_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 195 | prefill_bs4$async_dispatch_382_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 196 | prefill_bs4$async_dispatch_1256_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 197 | prefill_bs4$async_dispatch_405_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 198 | prefill_bs4$async_dispatch_129_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 199 | prefill_bs4$async_dispatch_2728_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 200 | prefill_bs4$async_dispatch_2107_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 201 | prefill_bs4$async_dispatch_2659_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 202 | prefill_bs4$async_dispatch_106_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 203 | prefill_bs4$async_dispatch_2590_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 204 | prefill_bs4$async_dispatch_1348_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 205 | prefill_bs4$async_dispatch_543_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 206 | prefill_bs4$async_dispatch_221_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 207 | prefill_bs4$async_dispatch_14_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 208 | prefill_bs4$async_dispatch_520_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 209 | prefill_bs4$async_dispatch_2567_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 210 | prefill_bs4$async_dispatch_2452_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 211 | prefill_bs4$async_dispatch_1532_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 212 | prefill_bs4$async_dispatch_1509_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 213 | prefill_bs4$async_dispatch_1992_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 214 | prefill_bs4$async_dispatch_2705_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 215 | prefill_bs4$async_dispatch_2291_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 216 | prefill_bs4$async_dispatch_2888_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 217 | prefill_bs4$async_dispatch_1647_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 218 | prefill_bs4$async_dispatch_1095_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 219 | prefill_bs4$async_dispatch_2682_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 220 | prefill_bs4$async_dispatch_497_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 221 | prefill_bs4$async_dispatch_474_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 222 | prefill_bs4$async_dispatch_658_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 223 | prefill_bs4$async_dispatch_1371_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 224 | prefill_bs4$async_dispatch_1210_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 225 | prefill_bs4$async_dispatch_1900_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 226 | prefill_bs4$async_dispatch_2245_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 227 | prefill_bs4$async_dispatch_773_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 228 | prefill_bs4$async_dispatch_865_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 229 | prefill_bs4$async_dispatch_796_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 230 | prefill_bs4$async_dispatch_2038_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 231 | prefill_bs4$async_dispatch_1831_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 232 | prefill_bs4$async_dispatch_313_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 233 | prefill_bs4$async_dispatch_750_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 234 | prefill_bs4$async_dispatch_1325_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 235 | prefill_bs4$async_dispatch_2613_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 236 | prefill_bs4$async_dispatch_842_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 237 | prefill_bs4$async_dispatch_1670_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 238 | prefill_bs4$async_dispatch_1601_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 239 | prefill_bs4$async_dispatch_1785_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 240 | prefill_bs4$async_dispatch_1969_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 241 | prefill_bs4$async_dispatch_2314_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 242 | prefill_bs4$async_dispatch_2498_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 243 | prefill_bs4$async_dispatch_911_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 244 | prefill_bs4$async_dispatch_612_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 245 | prefill_bs4$async_dispatch_1141_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 246 | prefill_bs4$async_dispatch_934_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 247 | prefill_bs4$async_dispatch_1003_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 248 | prefill_bs4$async_dispatch_589_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 249 | prefill_bs4$async_dispatch_198_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 250 | prefill_bs4$async_dispatch_175_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 251 | prefill_bs4$async_dispatch_2751_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 252 | prefill_bs4$async_dispatch_60_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 253 | prefill_bs4$async_dispatch_1877_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 254 | prefill_bs4$async_dispatch_2061_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 255 | prefill_bs4$async_dispatch_2866_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 256 | prefill_bs4$async_dispatch_83_scatter_4xDx8x32x2x6 | 0 | 0 | 0 | * | | |
| | 257 | prefill_bs4$async_dispatch_727_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 258 | prefill_bs4$async_dispatch_1762_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 259 | prefill_bs4$async_dispatch_888_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 260 | prefill_bs4$async_dispatch_2544_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 261 | prefill_bs4$async_dispatch_2084_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 262 | prefill_bs4$async_dispatch_957_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 263 | prefill_bs4$async_dispatch_2015_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 264 | prefill_bs4$async_dispatch_152_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 265 | prefill_bs4$async_dispatch_1486_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 266 | prefill_bs4$async_dispatch_2406_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 267 | prefill_bs4$async_dispatch_819_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 268 | prefill_bs4$async_dispatch_2636_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 269 | prefill_bs4$async_dispatch_2360_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 270 | prefill_bs4$async_dispatch_336_scatter_4xDx8x32x2x | 0 | 0 | 0 | * | | |
| | 271 | prefill_bs4$async_dispatch_2774_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 272 | prefill_bs4$async_dispatch_1049_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 273 | prefill_bs4$async_dispatch_1187_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 274 | prefill_bs4$async_dispatch_1164_scatter_4xDx8x32x2 | 0 | 0 | 0 | * | | |
| | 275 | prefill_bs4$async_dispatch_8_elementwise_broadcast | 0 | 0 | 0 | * | | |
| | 276 | prefill_bs4$async_dispatch_7_elementwise_D_i64.kd | 0 | 0 | 0 | * | | |
| | 277 | prefill_bs4$async_dispatch_2886_elementwise_D_i64. | 0 | 0 | 0 | * | | |
| +-----+----------------------------------------------------+---------------+-----------------+----------------+------------+ | |
| Reporting on kernel: prefill_bs4$async_dispatch_22_reduction_Dx53248x512x32_f4E2M1FNxf4E2M1FNxf8E8M0FNUxf8E8M0FNUxf32.kd | |
| ---------------- | |
| Kernel overview: | |
| ---------------- | |
| Statistics: average occupancy, active VALU threads in the EXEC mask, and IPC. | |
| +----+-------------+-----------+-----------------+ | |
| | | Average | Average | Average | | |
| | | occupancy | active | IPC | | |
| | | [%] | threads | [inst. issued | | |
| | | | [%] | /quad-cycle] | | |
| |----+-------------+-----------+-----------------| | |
| | 0 | 24.44 | 100 | 0.2 | | |
| +----+-------------+-----------+-----------------+ | |
| Pipeline utilization: This table reports high-level utilization | |
| of your workload, looking only at whether an instruction was | |
| issued to a pipeline in a given cycle, and whether that pipe was stalled. | |
| +----+----------+---------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+---------+---------+---------+----------+----------+---------+ | |
| | | weight | issue | issue | issue | issue | issue | issue | issue | issue | dual | stall | stall | stall | stall | stall | stall | stall | stall | | |
| | | [%] | misc | exp | flat | lds | vmem | scalar | matrix | valu | issue | misc | exp | flat | lds | vmem | scalar | matrix | valu | | |
| | | | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | valu | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | | |
| | | | | | | | [%] | | | | [%] | | | | | [%] | | | | | |
| |----+----------+---------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+---------+---------+---------+----------+----------+---------| | |
| | 0 | 13.13 | 0.26 | 0 | 27.69 | 63.5 | 29.24 | 0.27 | 10.13 | 9.41 | 0.01 | 0 | 0 | 27.2 | 51.12 | 28.49 | 0 | 4.73 | 4.67 | | |
| +----+----------+---------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+---------+---------+---------+----------+----------+---------+ | |
| CU state: This table determines the most common types of samples. | |
| This may indicate different types of activity in your kernel | |
| (e.g., some waves are memory-intensive, others compute) | |
| +----+-----------+-------------+---------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+-----------+-----------+-----------+ | |
| | | active | occupancy | issue | issue | issue | issue | issue | issue | issue | issue | stall | stall | stall | stall | stall | stall | stall | stall | reason: | reason: | reason: | reason: | reason: | | |
| | | threads | [%] | misc | exp | flat | lds | vmem | scalar | matrix | valu | misc | exp | flat | lds | vmem | scalar | matrix | valu | alu | waitcnt | barrier | arbiter | arbiter | | |
| | | [%] | | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | dependency | [%] | wait | not | win | | |
| | | | | | | | | [%] | | | | | | | | [%] | | | | [%] | | [%] | win | ex | | |
| | | | | | | | | | | | | | | | | | | | | | | | [%] | stall | | |
| | | | | | | | | | | | | | | | | | | | | | | | | [%] | | |
| |----+-----------+-------------+---------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+-----------+-----------+-----------| | |
| | 0 | 100 | 24.23 | 0.14 | 0 | 0 | 35.08 | 0.46 | 0.22 | 0 | 0 | 0 | 0 | 0 | 26.89 | 0 | 0 | 0 | 0 | 0.13 | 4.43 | 2.33 | 2.86 | 20.57 | | |
| | 1 | 100 | 23.34 | 0.02 | 0 | 11.33 | 1.56 | 0.14 | 0.02 | 2.47 | 2.79 | 0 | 0 | 11.18 | 0 | 0.1 | 0 | 1.14 | 1.4 | 0.23 | 3.11 | 2.29 | 0.01 | 7.14 | | |
| | 2 | 100 | 24.6 | 0.04 | 0 | 0.12 | 0 | 10.58 | 0.01 | 0.36 | 0.59 | 0 | 0 | 0 | 0 | 10.45 | 0 | 0.11 | 0.22 | 0.02 | 0.75 | 2.44 | 0.94 | 5.94 | | |
| | 3 | 100 | 25 | 0 | 0 | 0 | 10.53 | 10.53 | 0 | 0 | 0 | 0 | 0 | 0 | 9.34 | 10.42 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9.89 | | |
| | 4 | 100 | 25 | 0 | 0 | 9.02 | 9.02 | 0 | 0 | 0 | 0 | 0 | 0 | 8.79 | 9.01 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8.9 | | |
| | 5 | 100 | 25 | 0 | 0 | 7.54 | 0 | 7.54 | 0 | 0 | 0 | 0 | 0 | 7.54 | 0 | 7.54 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7.54 | | |
| | 6 | 100 | 24.36 | 0.06 | 0 | 0 | 3.83 | 0 | 0.01 | 1.41 | 6.14 | 0 | 0 | 0 | 2.95 | 0 | 0 | 1.41 | 3.15 | 0.04 | 0.43 | 0.4 | 0.04 | 3.21 | | |
| | 7 | 100 | 24.35 | 0.01 | 0 | 0 | 3.78 | 0 | 0.01 | 6.06 | 0.09 | 0 | 0 | 0 | 3.01 | 0 | 0 | 2.15 | 0 | 0.01 | 0.15 | 0.37 | 0.36 | 2.7 | | |
| +----+-----------+-------------+---------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+-----------+-----------+-----------+ | |
| No workload imbalance detected | |
| --------------------------------------- | |
| Detailed instruction/pipeline analyses: | |
| --------------------------------------- | |
| Instruction hotspots: This table reports the most commonly observed | |
| instructions in the code (but notably: not whether they were stalled, | |
| issued, etc.) | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+----------+-----------+-------------+ | |
| | | kernel | inst | pc | weight | active | occupancy | | |
| | | | | | [%] | threads | [%] | | |
| | | | | | | [%] | | | |
| |----+----------------------------------------------------+---------------------------------------------------------+----------+----------+-----------+-------------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[170:173], v[114:115], off | 0x13d588 | 10.4 | 100 | 24.3 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[138:141], v113, s[20:23], 0 offen | 0x13d520 | 5.6 | 100 | 25 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d5b4 | 5.5 | 100 | 24.2 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx2 v[234:235], v34, s[12:15], 0 offen | 0x13d548 | 4 | 100 | 24.7 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt lgkmcnt(1) | 0x13d920 | 3.3 | 100 | 24 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt lgkmcnt(0) | 0x13d94c | 2.7 | 100 | 24.3 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_write_b128 v63, v[170:173] offset:32768 | 0x13da54 | 1.6 | 100 | 24.8 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[222:225], v94 | 0x13d600 | 1.5 | 100 | 24.3 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[226:229], v97 | 0x13d608 | 1.4 | 100 | 24.3 | | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[166:169], v130, s[20:23], 0 offen | 0x13d580 | 1.3 | 100 | 24 | | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+----------+-----------+-------------+ | |
| Internal stalls in critical path: This table determines | |
| the most common instructions where no other useful work | |
| occurred in a sample. This often indicates | |
| memory operations (or no-ops, etc.) whose latency | |
| could not be hidden by other work. Fixing these issues | |
| will likely result in higher performance. | |
| +----+----------------------------------------------------+----------------------+----------+-----------+-------------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | reason: | | |
| | | | | | threads | [%] | waitcnt | | |
| | | | | | [%] | | [%] | | |
| |----+----------------------------------------------------+----------------------+----------+-----------+-------------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt lgkmcnt(1) | 0x13d920 | 100 | 24 | 3.3 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt lgkmcnt(0) | 0x13d94c | 100 | 24.3 | 2.7 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt vmcnt(4) | 0x13da50 | 100 | 23.9 | 0.5 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt vmcnt(4) | 0x13d3b0 | 100 | 25 | 0.3 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt lgkmcnt(1) | 0x13de58 | 100 | 24.8 | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt vmcnt(0) | 0x13dac0 | 100 | 24.1 | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt lgkmcnt(0) | 0x13d754 | 100 | 24.5 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt vmcnt(3) | 0x13da5c | 100 | 24.2 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt vmcnt(2) | 0x13da68 | 100 | 23.9 | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_waitcnt vmcnt(1) | 0x13da74 | 100 | 23.7 | 0.1 | | |
| +----+----------------------------------------------------+----------------------+----------+-----------+-------------+-----------+ | |
| MISC pipeline holes: This table identifies the most common MISC instructions | |
| that are causing the MISC pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase MISC utilization. | |
| +----+----------------------------------------------------+---------------------+----------+-----------+-------------+---------+---------+--------------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | | |
| | | | | | threads | [%] | misc | misc | alu | barrier | | |
| | | | | | [%] | | [%] | [%] | dependency | wait | | |
| | | | | | | | | | [%] | [%] | | |
| |----+----------------------------------------------------+---------------------+----------+-----------+-------------+---------+---------+--------------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d5b4 | 100 | 24.2 | 0 | 0 | -- | 5.5 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d960 | 100 | 23.9 | 0 | 0 | -- | 0.9 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d3ac | 100 | 25 | 0 | 0 | -- | 0.2 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13e174 | 100 | 24.9 | 0 | 0 | -- | 0.2 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13dadc | 100 | 25 | 0 | 0 | -- | 0.0 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_cbranch_scc1 1176 | 0x13cf18 | 100 | 25 | 0 | 0 | 0.0 | -- | | |
| +----+----------------------------------------------------+---------------------+----------+-----------+-------------+---------+---------+--------------+-----------+ | |
| MISC other-pipeline available: This table identifies the most common MISC instructions | |
| that caused the MISC pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+---------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | avail | avail | avail | avail | avail | avail | avail | reason: | reason: | | |
| | | | | | threads | [%] | misc | misc | exp | flat | lds | vmem | scalar | matrix | valu | alu | barrier | | |
| | | | | | [%] | | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | dependency | wait | | |
| | | | | | | | | | | | | [%] | | | | [%] | [%] | | |
| |----+----------------------------------------------------+---------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d5b4 | 100 | 24.2 | 0 | 0 | 5.5 | 5 | 4.1 | 3.5 | 5.5 | 5.2 | 5.2 | -- | 5.5 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d960 | 100 | 23.9 | 0 | 0 | 0.9 | 0.9 | 0.4 | 0.9 | 0.9 | 0.9 | 0.9 | -- | 0.9 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d3ac | 100 | 25 | 0 | 0 | 0.2 | 0.2 | 0.1 | 0.2 | 0.2 | 0.2 | 0.2 | -- | 0.2 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13e174 | 100 | 24.9 | 0 | 0 | 0.2 | 0.2 | 0.2 | 0.1 | 0.2 | 0.2 | 0.2 | -- | 0.2 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13dadc | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_cbranch_scc1 1176 | 0x13cf18 | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | -- | | |
| +----+----------------------------------------------------+---------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+ | |
| MISC Hotspots: This table identifies the most common MISC instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the MISC pipeline is highly utilized, reducing use of | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+----------------------+----------+-----------+-------------+---------+---------+--------------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | | |
| | | | | | threads | [%] | misc | misc | alu | barrier | | |
| | | | | | [%] | | [%] | [%] | dependency | wait | | |
| | | | | | | | | | [%] | [%] | | |
| |----+----------------------------------------------------+----------------------+----------+-----------+-------------+---------+---------+--------------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d5b4 | 100 | 24.2 | 0 | 0 | -- | 5.5 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d960 | 100 | 23.9 | 0 | 0 | -- | 0.9 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13d3ac | 100 | 25 | 0 | 0 | -- | 0.2 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13e174 | 100 | 24.9 | 0 | 0 | -- | 0.2 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_cbranch_scc1 65154 | 0x13dad4 | 100 | 24 | 0 | 0 | -- | -- | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_barrier | 0x13dadc | 100 | 24.9 | 0 | 0 | -- | 0.0 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_cbranch_scc1 1176 | 0x13cf18 | 100 | 25 | 0 | 0 | 0.0 | -- | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_cbranch_scc1 64539 | 0x13e178 | 100 | 25 | 0 | 0 | -- | -- | | |
| +----+----------------------------------------------------+----------------------+----------+-----------+-------------+---------+---------+--------------+-----------+ | |
| FLAT pipeline holes: This table identifies the most common FLAT instructions | |
| that are causing the FLAT pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase FLAT utilization. | |
| +----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | flat | flat | alu | arbiter | arbiter | | |
| | | | | | [%] | | [%] | [%] | dependency | not | win | | |
| | | | | | | | | | [%] | win | ex | | |
| | | | | | | | | | | [%] | stall | | |
| | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[170:173], v[114:115], off | 0x13d588 | 100 | 24.3 | 9.7 | 9.7 | -- | 0.6 | 9.7 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[174:177], v[116:117], off | 0x13d590 | 100 | 24.2 | 0.7 | 0.7 | -- | 0 | 0.7 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[182:185], v[120:121], off | 0x13d5a0 | 100 | 24.4 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dword v236, v[122:123], off | 0x13d5a8 | 100 | 24.4 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[178:181], v[118:119], off | 0x13d598 | 100 | 24.1 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_store_short v[44:45], v28, off | 0x13e074 | 100 | 24.9 | 0.2 | 0.2 | 0.0 | 0 | 0.2 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dword v2, v[18:19], off | 0x13d38c | 100 | 25 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[130:133], v[2:3], off | 0x13d368 | 100 | 25 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[134:137], v[6:7], off | 0x13d370 | 100 | 25 | 0 | 0 | -- | 0 | 0 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[138:141], v[10:11], off | 0x13d378 | 100 | 25 | 0 | 0 | -- | 0 | 0 | | |
| +----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------+ | |
| FLAT other-pipeline available: This table identifies the most common FLAT instructions | |
| that caused the FLAT pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | avail | avail | avail | avail | avail | avail | avail | reason: | reason: | reason: | | |
| | | | | | threads | [%] | flat | flat | misc | exp | lds | vmem | scalar | matrix | valu | alu | arbiter | arbiter | | |
| | | | | | [%] | | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | dependency | not | win | | |
| | | | | | | | | | | | | [%] | | | | [%] | win | ex | | |
| | | | | | | | | | | | | | | | | | [%] | stall | | |
| | | | | | | | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[170:173], v[114:115], off | 0x13d588 | 100 | 24.3 | 9.7 | 9.7 | 10.3 | 10.3 | 6.8 | 7.2 | 10.3 | 9.6 | 9.4 | -- | 0.6 | 9.7 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[174:177], v[116:117], off | 0x13d590 | 100 | 24.2 | 0.7 | 0.7 | 0.7 | 0.7 | 0.5 | 0.5 | 0.7 | 0.7 | 0.7 | -- | 0 | 0.7 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[182:185], v[120:121], off | 0x13d5a0 | 100 | 24.4 | 0.5 | 0.5 | 0.6 | 0.6 | 0.4 | 0.4 | 0.6 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dword v236, v[122:123], off | 0x13d5a8 | 100 | 24.4 | 0.5 | 0.5 | 0.5 | 0.5 | 0.4 | 0.3 | 0.5 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[178:181], v[118:119], off | 0x13d598 | 100 | 24.1 | 0.5 | 0.5 | 0.5 | 0.5 | 0.4 | 0.3 | 0.5 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_store_short v[44:45], v28, off | 0x13e074 | 100 | 24.9 | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.0 | 0 | 0.2 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dword v2, v[18:19], off | 0x13d38c | 100 | 25 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[130:133], v[2:3], off | 0x13d368 | 100 | 25 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[134:137], v[6:7], off | 0x13d370 | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0 | 0 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[138:141], v[10:11], off | 0x13d378 | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0 | 0 | | |
| +----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+--------------+-----------+-----------+ | |
| FLAT Hotspots: This table identifies the most common FLAT instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the FLAT pipeline is highly utilized, reducing use of | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | flat | flat | alu | arbiter | arbiter | | |
| | | | | | [%] | | [%] | [%] | dependency | not | win | | |
| | | | | | | | | | [%] | win | ex | | |
| | | | | | | | | | | [%] | stall | | |
| | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[170:173], v[114:115], off | 0x13d588 | 100 | 24.3 | 9.7 | 9.7 | -- | 0.7 | 9.7 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[174:177], v[116:117], off | 0x13d590 | 100 | 24.2 | 0.7 | 0.7 | -- | 0 | 0.7 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[182:185], v[120:121], off | 0x13d5a0 | 100 | 24.4 | 0.6 | 0.5 | -- | 0 | 0.5 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dword v236, v[122:123], off | 0x13d5a8 | 100 | 24.3 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[178:181], v[118:119], off | 0x13d598 | 100 | 24 | 0.5 | 0.5 | -- | 0 | 0.5 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_store_short v[44:45], v28, off | 0x13e074 | 100 | 24.9 | 0.2 | 0.2 | 0.0 | 0 | 0.2 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dword v2, v[18:19], off | 0x13d38c | 100 | 25 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[130:133], v[2:3], off | 0x13d368 | 100 | 25 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[134:137], v[6:7], off | 0x13d370 | 100 | 25 | 0 | 0 | -- | 0 | 0 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | global_load_dwordx4 v[138:141], v[10:11], off | 0x13d378 | 100 | 25 | 0 | 0 | -- | 0 | 0 | | |
| +----+----------------------------------------------------+-------------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------+ | |
| LDS pipeline holes: This table identifies the most common LDS instructions | |
| that are causing the LDS pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase LDS utilization. | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+---------+---------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | | |
| | | | | | threads | [%] | lds | lds | arbiter | | |
| | | | | | [%] | | [%] | [%] | win | | |
| | | | | | | | | | ex | | |
| | | | | | | | | | stall | | |
| | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+---------+---------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[202:205], v96 | 0x13d8f0 | 100 | 24.3 | 1.2 | 1.2 | 1.2 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[206:209], v86 | 0x13d5e0 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[210:213], v89 | 0x13d5e8 | 100 | 24.1 | 1.1 | 1.1 | 1.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[214:217], v90 | 0x13d5f0 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[214:217], v100 | 0x13d8c0 | 100 | 24.3 | 1.1 | 1.1 | 1.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_u8 v117, v102 | 0x13d630 | 100 | 24.3 | 1.1 | 1.1 | 1.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[218:221], v93 | 0x13d5f8 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[222:225], v94 | 0x13d600 | 100 | 24.1 | 1.1 | 1.1 | 1.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_u8 v116, v101 offset:8 | 0x13d628 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[210:213], v83 | 0x13d8b8 | 100 | 24.2 | 1 | 1 | 1 | | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+---------+---------+-----------+ | |
| LDS other-pipeline available: This table identifies the most common LDS instructions | |
| that caused the LDS pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | avail | avail | avail | avail | avail | avail | avail | reason: | | |
| | | | | | threads | [%] | lds | lds | misc | exp | flat | vmem | scalar | matrix | valu | arbiter | | |
| | | | | | [%] | | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | [%] | win | | |
| | | | | | | | | | | | | [%] | | | | ex | | |
| | | | | | | | | | | | | | | | | stall | | |
| | | | | | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[202:205], v96 | 0x13d8f0 | 100 | 24.3 | 1.2 | 1.2 | 1.2 | 1.2 | 0.9 | 1.1 | 1.2 | 1.2 | 1.2 | 1.2 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[206:209], v86 | 0x13d5e0 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | 1.1 | 1.1 | 0.8 | 1.1 | 1 | 1.1 | 1.1 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[210:213], v89 | 0x13d5e8 | 100 | 24.1 | 1.1 | 1.1 | 1.1 | 1.1 | 1.1 | 0.8 | 1.1 | 1.1 | 1.1 | 1.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[214:217], v90 | 0x13d5f0 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | 1.1 | 1 | 0.8 | 1.1 | 1 | 1.1 | 1.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[214:217], v100 | 0x13d8c0 | 100 | 24.3 | 1.1 | 1.1 | 1.1 | 1.1 | 0.7 | 1 | 1.1 | 1.1 | 1.1 | 1.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_u8 v117, v102 | 0x13d630 | 100 | 24.3 | 1.1 | 1.1 | 1.1 | 1.1 | 0.9 | 0.8 | 1.1 | 1.1 | 1 | 1.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[218:221], v93 | 0x13d5f8 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | 1.1 | 1 | 0.8 | 1.1 | 0.9 | 1 | 1.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[222:225], v94 | 0x13d600 | 100 | 24.1 | 1.1 | 1.1 | 1.1 | 1.1 | 1 | 0.8 | 1.1 | 1 | 1 | 1.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_u8 v116, v101 offset:8 | 0x13d628 | 100 | 24.2 | 1.1 | 1.1 | 1.1 | 1.1 | 0.9 | 0.8 | 1.1 | 1 | 0.9 | 1.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[210:213], v83 | 0x13d8b8 | 100 | 24.2 | 1 | 1 | 1 | 1 | 0.6 | 1 | 1 | 1 | 1 | 1 | | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+-----------+ | |
| LDS Hotspots: This table identifies the most common LDS instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the LDS pipeline is highly utilized, reducing use of | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+--------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | lds | lds | alu | arbiter | arbiter | | |
| | | | | | [%] | | [%] | [%] | dependency | not | win | | |
| | | | | | | | | | [%] | win | ex | | |
| | | | | | | | | | | [%] | stall | | |
| | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+--------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_write_b128 v63, v[170:173] offset:32768 | 0x13da54 | 100 | 24.8 | 1.5 | 0.6 | 0.0 | 0.1 | 0.6 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[222:225], v94 | 0x13d600 | 100 | 24.3 | 1.5 | 1.1 | -- | 0.3 | 1.1 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[226:229], v97 | 0x13d608 | 100 | 24.3 | 1.4 | 0.9 | -- | 0.4 | 0.9 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[202:205], v96 | 0x13d8f0 | 100 | 24.3 | 1.3 | 1.2 | -- | 0 | 1.2 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[210:213], v89 | 0x13d5e8 | 100 | 24.1 | 1.2 | 1.1 | -- | 0.1 | 1.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_write_b128 v64, v[174:177] offset:32768 | 0x13da60 | 100 | 24.6 | 1.2 | 0.8 | -- | 0 | 0.8 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[206:209], v86 | 0x13d5e0 | 100 | 24.2 | 1.2 | 1.1 | -- | 0 | 1.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[214:217], v90 | 0x13d5f0 | 100 | 24.2 | 1.2 | 1.1 | -- | 0 | 1.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_u8 v117, v102 | 0x13d630 | 100 | 24.3 | 1.2 | 1.1 | -- | 0 | 1.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | ds_read_b128 v[214:217], v100 | 0x13d8c0 | 100 | 24.3 | 1.2 | 1.1 | -- | 0 | 1.1 | | |
| +----+----------------------------------------------------+--------------------------------------------+----------+-----------+-------------+---------+---------+--------------+-----------+-----------+ | |
| VMEM_TEX pipeline holes: This table identifies the most common VMEM_TEX instructions | |
| that are causing the VMEM_TEX pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase VMEM_TEX utilization. | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | reason: | reason: | | |
| | | | | | threads | [%] | vmem | vmem | no | alu | arbiter | arbiter | | |
| | | | | | [%] | | tex | tex | instruction | dependency | not | win | | |
| | | | | | | | [%] | [%] | available | [%] | win | ex | | |
| | | | | | | | | | [%] | | [%] | stall | | |
| | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[138:141], v113, s[20:23], 0 offen | 0x13d520 | 100 | 25 | 5.5 | 5.5 | 0.0 | 0.1 | 0.0 | 5.5 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx2 v[234:235], v34, s[12:15], 0 offen | 0x13d548 | 100 | 24.7 | 3.9 | 3.9 | -- | 0.1 | -- | 3.9 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[166:169], v130, s[20:23], 0 offen | 0x13d580 | 100 | 24 | 1.3 | 1.3 | 0.0 | -- | -- | 1.3 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[150:153], v126, s[20:23], 0 offen | 0x13d560 | 100 | 24.6 | 0.4 | 0.4 | -- | -- | -- | 0.4 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[162:165], v129, s[20:23], 0 offen | 0x13d578 | 100 | 24.5 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[158:161], v128, s[20:23], 0 offen | 0x13d570 | 100 | 24.3 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[146:149], v125, s[20:23], 0 offen | 0x13d558 | 100 | 24.5 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[154:157], v127, s[20:23], 0 offen | 0x13d568 | 100 | 24.4 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[142:145], v124, s[20:23], 0 offen | 0x13d550 | 100 | 24.5 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[122:125], v16, s[20:23], 0 offen | 0x13d2f0 | 100 | 25 | 0.1 | 0.1 | -- | 0.0 | -- | 0.1 | | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------------+--------------+-----------+-----------+ | |
| VMEM_TEX other-pipeline available: This table identifies the most common VMEM_TEX instructions | |
| that caused the VMEM_TEX pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+---------------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | avail | avail | avail | avail | avail | avail | avail | reason: | reason: | reason: | reason: | | |
| | | | | | threads | [%] | vmem | vmem | misc | exp | flat | lds | scalar | matrix | valu | no | alu | arbiter | arbiter | | |
| | | | | | [%] | | tex | tex | [%] | [%] | [%] | [%] | [%] | [%] | [%] | instruction | dependency | not | win | | |
| | | | | | | | [%] | [%] | | | | | | | | available | [%] | win | ex | | |
| | | | | | | | | | | | | | | | | [%] | | [%] | stall | | |
| | | | | | | | | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+---------------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[138:141], v113, s[20:23], 0 offen | 0x13d520 | 100 | 25 | 5.5 | 5.5 | 5.6 | 5.6 | 2.4 | 5.1 | 5.6 | 5.6 | 5.6 | 0.0 | 0.1 | 0.0 | 5.5 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx2 v[234:235], v34, s[12:15], 0 offen | 0x13d548 | 100 | 24.7 | 3.9 | 3.9 | 4 | 4 | 4 | 1.6 | 4 | 3.9 | 3.9 | -- | 0.1 | -- | 3.9 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[166:169], v130, s[20:23], 0 offen | 0x13d580 | 100 | 24 | 1.3 | 1.3 | 1.3 | 1.3 | 1.3 | 0.6 | 1.3 | 1.3 | 1.2 | 0.0 | -- | -- | 1.3 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[150:153], v126, s[20:23], 0 offen | 0x13d560 | 100 | 24.6 | 0.4 | 0.4 | 0.4 | 0.4 | 0.4 | 0.2 | 0.4 | 0.4 | 0.4 | -- | -- | -- | 0.4 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[162:165], v129, s[20:23], 0 offen | 0x13d578 | 100 | 24.5 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.2 | 0.3 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[158:161], v128, s[20:23], 0 offen | 0x13d570 | 100 | 24.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.2 | 0.3 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[146:149], v125, s[20:23], 0 offen | 0x13d558 | 100 | 24.5 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.1 | 0.3 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[154:157], v127, s[20:23], 0 offen | 0x13d568 | 100 | 24.4 | 0.3 | 0.3 | 0.3 | 0.3 | 0.3 | 0.2 | 0.3 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[142:145], v124, s[20:23], 0 offen | 0x13d550 | 100 | 24.5 | 0.3 | 0.3 | 0.3 | 0.3 | 0.2 | 0.1 | 0.3 | 0.2 | 0.2 | -- | -- | -- | 0.3 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[122:125], v16, s[20:23], 0 offen | 0x13d2f0 | 100 | 25 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0.0 | -- | 0.1 | | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+----------+----------+---------+---------------+--------------+-----------+-----------+ | |
| VMEM_TEX Hotspots: This table identifies the most common VMEM_TEX instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the VMEM_TEX pipeline is highly utilized, reducing use | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | reason: | reason: | | |
| | | | | | threads | [%] | vmem | vmem | no | alu | arbiter | arbiter | | |
| | | | | | [%] | | tex | tex | instruction | dependency | not | win | | |
| | | | | | | | [%] | [%] | available | [%] | win | ex | | |
| | | | | | | | | | [%] | | [%] | stall | | |
| | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[138:141], v113, s[20:23], 0 offen | 0x13d520 | 100 | 25 | 5.6 | 5.5 | 0.0 | 0.1 | 0.0 | 5.5 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx2 v[234:235], v34, s[12:15], 0 offen | 0x13d548 | 100 | 24.7 | 4 | 3.9 | -- | 0.1 | 0.0 | 3.9 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[166:169], v130, s[20:23], 0 offen | 0x13d580 | 100 | 24 | 1.3 | 1.3 | 0.0 | -- | 0.0 | 1.3 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[150:153], v126, s[20:23], 0 offen | 0x13d560 | 100 | 24.5 | 0.4 | 0.4 | -- | -- | 0.0 | 0.4 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[162:165], v129, s[20:23], 0 offen | 0x13d578 | 100 | 24.4 | 0.4 | 0.3 | -- | -- | 0.0 | 0.3 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[158:161], v128, s[20:23], 0 offen | 0x13d570 | 100 | 24.3 | 0.4 | 0.3 | -- | -- | 0.0 | 0.3 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[146:149], v125, s[20:23], 0 offen | 0x13d558 | 100 | 24.4 | 0.3 | 0.3 | -- | -- | 0.0 | 0.3 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[154:157], v127, s[20:23], 0 offen | 0x13d568 | 100 | 24.4 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[142:145], v124, s[20:23], 0 offen | 0x13d550 | 100 | 24.4 | 0.3 | 0.3 | -- | -- | -- | 0.3 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | buffer_load_dwordx4 v[122:125], v16, s[20:23], 0 offen | 0x13d2f0 | 100 | 25 | 0.1 | 0.1 | -- | 0.0 | 0.0 | 0.1 | | |
| +----+----------------------------------------------------+---------------------------------------------------------+----------+-----------+-------------+---------+---------+---------------+--------------+-----------+-----------+ | |
| SCALAR pipeline holes: This table identifies the most common SCALAR instructions | |
| that are causing the SCALAR pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase SCALAR utilization. | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+----------+----------+---------------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | | |
| | | | | | threads | [%] | scalar | scalar | no | | |
| | | | | | [%] | | [%] | [%] | instruction | | |
| | | | | | | | | | available | | |
| | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+----------+----------+---------------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_load_dword s17, s[0:1], 0x40 | 0x13cf00 | 100 | 22.4 | 0 | 0 | 0 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_addc_u32 s12, s5, s13 | 0x13cf20 | 100 | 25 | 0 | 0 | 0 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mov_b64 s[6:7], 0x100 | 0x13cffc | 100 | 25 | 0 | 0 | 0 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mul_i32 s26, s27, 0x1a0 | 0x13d200 | 100 | 25 | 0 | 0 | 0 | | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+----------+----------+---------------+ | |
| SCALAR other-pipeline available: This table identifies the most common SCALAR instructions | |
| that caused the SCALAR pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+----------+----------+---------+---------+---------+---------+---------+----------+---------+---------------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | avail | avail | avail | avail | avail | avail | avail | reason: | | |
| | | | | | threads | [%] | scalar | scalar | misc | exp | flat | lds | vmem | matrix | valu | no | | |
| | | | | | [%] | | [%] | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | instruction | | |
| | | | | | | | | | | | | | [%] | | | available | | |
| | | | | | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+----------+----------+---------+---------+---------+---------+---------+----------+---------+---------------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_load_dword s17, s[0:1], 0x40 | 0x13cf00 | 100 | 22.4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_addc_u32 s12, s5, s13 | 0x13cf20 | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mov_b64 s[6:7], 0x100 | 0x13cffc | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mul_i32 s26, s27, 0x1a0 | 0x13d200 | 100 | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | |
| +----+----------------------------------------------------+--------------------------------+----------+-----------+-------------+----------+----------+---------+---------+---------+---------+---------+----------+---------+---------------+ | |
| SCALAR Hotspots: This table identifies the most common SCALAR instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the SCALAR pipeline is highly utilized, reducing use | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+-----------------------------------+----------+-----------+-------------+----------+----------+---------------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | | |
| | | | | | threads | [%] | scalar | scalar | no | | |
| | | | | | [%] | | [%] | [%] | instruction | | |
| | | | | | | | | | available | | |
| | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-----------------------------------+----------+-----------+-------------+----------+----------+---------------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_cmpk_lt_u32 s27, 0x78 | 0x13da44 | 100 | 24 | 0 | 0 | -- | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_add_i32 s27, s27, 4 | 0x13da14 | 100 | 23.5 | 0 | 0 | -- | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_addc_u32 s12, s5, s13 | 0x13cf20 | 100 | 25 | 0 | 0 | 0.0 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_load_dword s17, s[0:1], 0x40 | 0x13cf00 | 100 | 22.6 | 0 | 0 | 0.0 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mov_b64 s[6:7], 0x100 | 0x13cffc | 100 | 25 | 0 | 0 | 0.0 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mov_b32 s22, 0x1a000000 | 0x13d0e8 | 100 | 25 | 0 | 0 | -- | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mul_i32 s27, s27, 0xe6000000 | 0x13d39c | 100 | 25 | 0 | 0 | -- | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_mul_hi_u32 s10, s16, 0x4ec4ec4f | 0x13d1e8 | 100 | 25 | 0 | 0 | -- | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_add_i32 s16, s16, s24 | 0x13dd0c | 100 | 25 | 0 | 0 | -- | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | s_lshr_b32 s10, s10, 8 | 0x13d1f4 | 100 | 25 | 0 | 0 | -- | | |
| +----+----------------------------------------------------+-----------------------------------+----------+-----------+-------------+----------+----------+---------------+ | |
| MATRIX pipeline holes: This table identifies the most common MATRIX instructions | |
| that are causing the MATRIX pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase MATRIX utilization. | |
| +----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+--------------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | | |
| | | | | | threads | [%] | matrix | matrix | alu | arbiter | | |
| | | | | | [%] | | [%] | [%] | dependency | win | | |
| | | | | | | | | | [%] | ex | | |
| | | | | | | | | | | stall | | |
| | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+--------------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[210:213], v[198:201], v[4:7], v118, v117 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d974 | 100 | 23.8 | 0.2 | 0.2 | -- | 0.2 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[28:31], v[132:135], v[186:189], v[28:31], v114, v115 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d994 | 100 | 23.9 | 0.2 | 0.2 | 0.0 | 0.2 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[210:213], v[206:209], v[0:3], v118, v122 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d984 | 100 | 23.9 | 0.1 | 0.1 | -- | 0.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[24:27], v[132:135], v[194:197], v[24:27], v114, v116 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9a4 | 100 | 24.4 | 0.1 | 0.1 | -- | 0.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[20:23], v[132:135], v[202:205], v[20:23], v114, v117 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9b4 | 100 | 24.1 | 0.1 | 0.1 | -- | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[132:135], v[214:217], v[16:19], v114, v122 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9c4 | 100 | 24.2 | 0.1 | 0.1 | -- | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[124:127], v[186:189], v[12:15], v118, v115 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9d4 | 100 | 24.1 | 0.1 | 0.1 | -- | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[124:127], v[202:205], v[4:7], v118, v117 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9f4 | 100 | 24 | 0.1 | 0.1 | 0.0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[124:127], v[206:209], v[16:19], v114, v122 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d934 | 100 | 24.2 | 0.1 | 0.1 | -- | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[124:127], v[214:217], v[0:3], v118, v122 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13da04 | 100 | 24 | 0.1 | 0.1 | -- | 0.1 | | |
| +----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+--------------+-----------+ | |
| MATRIX other-pipeline available: This table identifies the most common MATRIX instructions | |
| that caused the MATRIX pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+---------+---------+---------+---------+---------+----------+---------+--------------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | avail | avail | avail | avail | avail | avail | avail | reason: | reason: | | |
| | | | | | threads | [%] | matrix | matrix | misc | exp | flat | lds | vmem | scalar | valu | alu | arbiter | | |
| | | | | | [%] | | [%] | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | dependency | win | | |
| | | | | | | | | | | | | | [%] | | | [%] | ex | | |
| | | | | | | | | | | | | | | | | | stall | | |
| | | | | | | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+---------+---------+---------+---------+---------+----------+---------+--------------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[210:213], v[198:201], v[4:7], v118, v117 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d974 | 100 | 23.8 | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.1 | 0.2 | 0.2 | 0.2 | -- | 0.2 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[28:31], v[132:135], v[186:189], v[28:31], v114, v115 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d994 | 100 | 23.9 | 0.2 | 0.2 | 0.2 | 0.2 | 0.1 | 0.1 | 0.1 | 0.2 | 0.1 | 0.0 | 0.2 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[210:213], v[206:209], v[0:3], v118, v122 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d984 | 100 | 23.9 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[24:27], v[132:135], v[194:197], v[24:27], v114, v116 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9a4 | 100 | 24.4 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[20:23], v[132:135], v[202:205], v[20:23], v114, v117 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9b4 | 100 | 24.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | -- | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[132:135], v[214:217], v[16:19], v114, v122 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9c4 | 100 | 24.2 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0 | -- | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[124:127], v[186:189], v[12:15], v118, v115 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9d4 | 100 | 24.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0 | -- | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[124:127], v[202:205], v[4:7], v118, v117 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9f4 | 100 | 24 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0 | 0.0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[124:127], v[206:209], v[16:19], v114, v122 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d934 | 100 | 24.2 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0 | -- | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[124:127], v[214:217], v[0:3], v118, v122 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13da04 | 100 | 24 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0 | -- | 0.1 | | |
| +----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+---------+---------+---------+---------+---------+----------+---------+--------------+-----------+ | |
| MATRIX Hotspots: This table identifies the most common MATRIX instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the MATRIX pipeline is highly utilized, reducing use | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+--------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | matrix | matrix | alu | arbiter | arbiter | | |
| | | | | | [%] | | [%] | [%] | dependency | not | win | | |
| | | | | | | | | | [%] | win | ex | | |
| | | | | | | | | | | [%] | stall | | |
| | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+--------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[210:213], v[198:201], v[4:7], v118, v117 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d974 | 100 | 23.8 | 0.3 | 0.2 | -- | 0 | 0.2 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[28:31], v[132:135], v[186:189], v[28:31], v114, v115 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d994 | 100 | 24 | 0.3 | 0.2 | 0.0 | 0 | 0.2 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[210:213], v[206:209], v[0:3], v118, v122 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d984 | 100 | 23.9 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[24:27], v[132:135], v[194:197], v[24:27], v114, v116 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9a4 | 100 | 24.3 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[20:23], v[132:135], v[202:205], v[20:23], v114, v117 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9b4 | 100 | 24 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[132:135], v[214:217], v[16:19], v114, v122 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9c4 | 100 | 24.1 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[124:127], v[186:189], v[12:15], v118, v115 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9d4 | 100 | 24.1 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[124:127], v[202:205], v[4:7], v118, v117 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9f4 | 100 | 24 | 0.2 | 0.1 | 0.0 | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[124:127], v[194:197], v[8:11], v118, v116 op_sel:[1,1,0] op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d9e4 | 100 | 24.1 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[124:127], v[206:209], v[16:19], v114, v122 op_sel_hi:[1,1,0] cbsz:4 blgp:4 | 0x13d934 | 100 | 24.2 | 0.2 | 0.1 | -- | 0 | 0.1 | | |
| +----+----------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+----------+--------------+-----------+-----------+ | |
| VALU pipeline holes: This table identifies the most common VALU instructions | |
| that are causing the VALU pipeline to not be utilized on a given sample. | |
| Fixing these will likely increase VALU utilization. | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | dual | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | valu | issue | valu | no | arbiter | arbiter | | |
| | | | | | [%] | | [%] | valu | [%] | instruction | not | win | | |
| | | | | | | | | [%] | | available | win | ex | | |
| | | | | | | | | | | [%] | [%] | stall | | |
| | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[114:115], s[2:3], 0, v[44:45] | 0x13d4e0 | 100 | 24 | 0.1 | 0 | 0.1 | 0.1 | 0 | 0.1 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_add_u32_e32 v128, 0x20000, v113 | 0x13d528 | 100 | 23.5 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v115, 8, v115 | 0x13d6dc | 100 | 24 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v117, 8, v117 | 0x13d6e0 | 100 | 24.1 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v125, 8, v125 | 0x13d6ec | 100 | 24.3 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v123, 8, v123 | 0x13d6e4 | 100 | 24.5 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_or_b32_sdwa v115, v116, v117 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 0x13d704 | 100 | 24.2 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[116:117], s[2:3], 0, v[46:47] | 0x13d4e8 | 100 | 24.3 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v127, 8, v127 | 0x13d6f4 | 100 | 24.6 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v129, 8, v129 | 0x13d6fc | 100 | 24.2 | 0 | 0 | 0 | -- | 0 | 0 | | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------------+-----------+-----------+ | |
| VALU other-pipeline available: This table identifies the most common VALU instructions | |
| that caused the VALU pipeline to not be utilized on a given sample *where other pipelines* were available to be used on the same cycle. | |
| If possible, you might be able to move other types of instructions before this | |
| instruction to increase overall utilization. | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+---------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | dual | stall | avail | avail | avail | avail | avail | avail | avail | reason: | reason: | reason: | | |
| | | | | | threads | [%] | valu | issue | valu | misc | exp | flat | lds | vmem | scalar | matrix | no | arbiter | arbiter | | |
| | | | | | [%] | | [%] | valu | [%] | [%] | [%] | [%] | [%] | tex | [%] | [%] | instruction | not | win | | |
| | | | | | | | | [%] | | | | | | [%] | | | available | win | ex | | |
| | | | | | | | | | | | | | | | | | [%] | [%] | stall | | |
| | | | | | | | | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+---------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[114:115], s[2:3], 0, v[44:45] | 0x13d4e0 | 100 | 24 | 0.1 | 0 | 0.1 | 0.2 | 0.2 | 0.2 | 0.1 | 0.2 | 0.2 | 0.2 | 0.1 | 0 | 0.1 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_add_u32_e32 v128, 0x20000, v113 | 0x13d528 | 100 | 23.5 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v115, 8, v115 | 0x13d6dc | 100 | 24 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v117, 8, v117 | 0x13d6e0 | 100 | 24.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v125, 8, v125 | 0x13d6ec | 100 | 24.3 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v123, 8, v123 | 0x13d6e4 | 100 | 24.5 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_or_b32_sdwa v115, v116, v117 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 0x13d704 | 100 | 24.2 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[116:117], s[2:3], 0, v[46:47] | 0x13d4e8 | 100 | 24.3 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v127, 8, v127 | 0x13d6f4 | 100 | 24.6 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0 | 0.1 | 0.1 | 0.1 | 0.1 | -- | 0 | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v129, 8, v129 | 0x13d6fc | 100 | 24.2 | 0 | 0 | 0 | 0.1 | 0.1 | 0 | 0 | 0 | 0.1 | 0 | -- | 0 | 0 | | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+---------------+-----------+-----------+ | |
| VALU dual-issue holes: This table identifies the most common VALU instructions | |
| where two VALU instructions could not be issued in a given sample. | |
| These *may* indicate reasons why the VALU failed to dual-issue. | |
| Fixing these *may* increase dual-issue rates, and increase VALU utilization. | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+----------+---------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issued | issue | issue | issue | issue | issue | issue | issue | stall | stall | stall | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | [%] | misc | flat | lds | vmem | scalar | matrix | valu | flat | lds | vmem | matrix | no | arbiter | none | | |
| | | | | | [%] | | | [%] | [%] | [%] | tex | [%] | [%] | [%] | [%] | [%] | tex | [%] | instruction | not | [%] | | |
| | | | | | | | | | | | [%] | | | | | | [%] | | available | win | | | |
| | | | | | | | | | | | | | | | | | | | [%] | [%] | | | |
| |----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+----------+---------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[114:115], s[2:3], 0, v[44:45] | 0x13d4e0 | 100 | 24.3 | 82.9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_add_u32_e32 v126, 0xfffc0000, v113 | 0x13d510 | 100 | 23.6 | 99.8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_add_u32_e32 v127, 0xfffe0000, v113 | 0x13d518 | 100 | 23.9 | 99.3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_add_u32_e32 v124, 0xfff80000, v113 | 0x13d500 | 100 | 24 | 99.6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[116:117], s[2:3], 0, v[46:47] | 0x13d4e8 | 100 | 24.5 | 97 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v115, 8, v115 | 0x13d6dc | 100 | 24.4 | 95.6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[118:119], s[2:3], 0, v[48:49] | 0x13d4f0 | 100 | 24.4 | 98.7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[48:49], v[48:49], 0, s[6:7] | 0x13da34 | 100 | 24 | 100 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | -- | 0 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[122:123], s[2:3], 0, v[42:43] | 0x13d540 | 100 | 24.3 | 98.8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_or_b32_sdwa v122, v122, v123 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 0x13d790 | 100 | 24.6 | 93.2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -- | 0.0 | 0 | | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+----------+---------+---------+---------+---------+----------+----------+---------+---------+---------+---------+----------+---------------+-----------+-----------+ | |
| VALU Hotspots: This table identifies the most common VALU instructions | |
| *including* arbitration loss (and other "good" stalls). | |
| If the VALU pipeline is highly utilized, reducing use of | |
| or avoiding these instructions may improve overall performance. | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------------+-----------+-----------+ | |
| | | kernel | inst | pc | active | occupancy | issue | dual | stall | reason: | reason: | reason: | | |
| | | | | | threads | [%] | valu | issue | valu | no | arbiter | arbiter | | |
| | | | | | [%] | | [%] | valu | [%] | instruction | not | win | | |
| | | | | | | | | [%] | | available | win | ex | | |
| | | | | | | | | | | [%] | [%] | stall | | |
| | | | | | | | | | | | | [%] | | |
| |----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------------+-----------+-----------| | |
| | 0 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[114:115], s[2:3], 0, v[44:45] | 0x13d4e0 | 100 | 24.1 | 0.1 | 0 | 0.1 | 0.1 | 0 | 0.1 | | |
| | 1 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_add_u32_e32 v128, 0x20000, v113 | 0x13d528 | 100 | 23.6 | 0.2 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 2 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v115, 8, v115 | 0x13d6dc | 100 | 24.1 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 3 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v117, 8, v117 | 0x13d6e0 | 100 | 24.3 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 4 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v125, 8, v125 | 0x13d6ec | 100 | 24.2 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 5 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshl_add_u64 v[116:117], s[2:3], 0, v[46:47] | 0x13d4e8 | 100 | 24.4 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 6 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v123, 8, v123 | 0x13d6e4 | 100 | 24.5 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 7 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_or_b32_sdwa v115, v116, v117 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 0x13d704 | 100 | 24.3 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 8 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v127, 8, v127 | 0x13d6f4 | 100 | 24.6 | 0.1 | 0 | 0.1 | -- | 0 | 0.1 | | |
| | 9 | prefill_bs4$async_dispatch_22_reduction_Dx53248x51 | v_lshlrev_b16_e32 v129, 8, v129 | 0x13d6fc | 100 | 24.1 | 0.1 | 0 | 0 | -- | 0 | 0 | | |
| +----+----------------------------------------------------+---------------------------------------------------------------------------------------------------+----------+-----------+-------------+---------+---------+---------+---------------+-----------+-----------+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment