Created
November 26, 2025 15:31
-
-
Save bjacob/e64f0fc85378c0f529741a1401072d22 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| hal.executable public @prefill_bs4$async_dispatch_22 { | |
| hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384, dma_sizes = [32, 128]>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>) { | |
| hal.executable.export public @prefill_bs4$async_dispatch_22_reduction_Dx53248x512x32_f4E2M1FNxf4E2M1FNxf8E8M0FNUxf8E8M0FNUxf32 ordinal(0) layout(#hal.pipeline.layout<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device, %arg1: index) -> (index, index, index) { | |
| %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice(%arg1) | |
| hal.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @prefill_bs4$async_dispatch_22_reduction_Dx53248x512x32_f4E2M1FNxf4E2M1FNxf8E8M0FNUxf8E8M0FNUxf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} { | |
| %c32_i64 = arith.constant 32 : i64 | |
| %cst = arith.constant 0.000000e+00 : f32 | |
| %0 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
| %1 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
| %2 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
| %3 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
| %4 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
| %5 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
| %6 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
| %7 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
| %8 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
| %9 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
| %10 = hal.interface.constant.load layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
| %11 = arith.extui %0 : i32 to i64 | |
| %12 = arith.extui %1 : i32 to i64 | |
| %13 = arith.shli %12, %c32_i64 : i64 | |
| %14 = arith.ori %11, %13 : i64 | |
| %15 = arith.index_castui %14 : i64 to index | |
| %16 = arith.extui %2 : i32 to i64 | |
| %17 = arith.extui %3 : i32 to i64 | |
| %18 = arith.shli %17, %c32_i64 : i64 | |
| %19 = arith.ori %16, %18 : i64 | |
| %20 = arith.index_castui %19 : i64 to index | |
| %21 = arith.extui %4 : i32 to i64 | |
| %22 = arith.extui %5 : i32 to i64 | |
| %23 = arith.shli %22, %c32_i64 : i64 | |
| %24 = arith.ori %21, %23 : i64 | |
| %25 = arith.index_castui %24 {stream.alignment = 16777216 : index, stream.values = [721420288 : index, 2315255808 : index, 3909091328 : index, 5502926848 : index, 7096762368 : index, 8690597888 : index, 10284433408 : index, 11878268928 : index, 13472104448 : index, 15065939968 : index, 16659775488 : index, 18253611008 : index, 19847446528 : index, 21441282048 : index, 23035117568 : index, 24628953088 : index, 26222788608 : index, 27816624128 : index, 29410459648 : index, 31004295168 : index, 32598130688 : index, 34191966208 : index, 35785801728 : index, 37379637248 : index, 38973472768 : index, 40567308288 : index, 42161143808 : index, 43754979328 : index, 45348814848 : index, 46942650368 : index, 48536485888 : index, 50130321408 : index, 51724156928 : index, 53317992448 : index, 54911827968 : index, 56505663488 : index, 58099499008 : index, 59693334528 : index, 61287170048 : index, 62881005568 : index, 64474841088 : index, 66068676608 : index, 67662512128 : index, 69256347648 : index, 70850183168 : index, 72444018688 : index, 74037854208 : index, 75631689728 : index, 77225525248 : index, 78819360768 : index, 80413196288 : index, 82007031808 : index, 83600867328 : index, 85194702848 : index, 86788538368 : index, 88382373888 : index, 89976209408 : index, 91570044928 : index, 93163880448 : index, 94757715968 : index, 96351551488 : index, 97945387008 : index, 99539222528 : index, 101133058048 : index, 102726893568 : index, 104320729088 : index, 105914564608 : index, 107508400128 : index, 109102235648 : index, 110696071168 : index, 112289906688 : index, 113883742208 : index, 115477577728 : index, 117071413248 : index, 118665248768 : index, 120259084288 : index, 121852919808 : index, 123446755328 : index, 125040590848 : index, 126634426368 : index, 128228261888 : index, 129822097408 : index, 131415932928 : index, 133009768448 : index, 134603603968 : index, 136197439488 : index, 137791275008 : index, 139385110528 : index, 140978946048 : index, 142572781568 : index, 144166617088 : index, 145760452608 : index, 147354288128 : index, 148948123648 : index, 150541959168 : index, 152135794688 : index, 153729630208 : index, 155323465728 : index, 156917301248 : index, 158511136768 : index, 160104972288 : index, 161698807808 : index, 163292643328 : index, 164886478848 : index, 166480314368 : index, 168074149888 : index, 169667985408 : index, 171261820928 : index, 172855656448 : index, 174449491968 : index, 176043327488 : index, 177637163008 : index, 179230998528 : index, 180824834048 : index, 182418669568 : index, 184012505088 : index, 185606340608 : index, 187200176128 : index, 188794011648 : index, 190387847168 : index, 191981682688 : index, 193575518208 : index, 195169353728 : index, 196763189248 : index, 198357024768 : index, 199950860288 : index]} : i64 to index | |
| %26 = arith.extui %6 : i32 to i64 | |
| %27 = arith.extui %7 : i32 to i64 | |
| %28 = arith.shli %27, %c32_i64 : i64 | |
| %29 = arith.ori %26, %28 : i64 | |
| %30 = arith.index_castui %29 {stream.alignment = 64 : index, stream.values = [205071122496 : index, 205170802816 : index, 205270483136 : index, 205370163456 : index, 205469843776 : index, 205569524096 : index, 205669204416 : index, 205768884736 : index, 205868565056 : index, 205968245376 : index, 206067925696 : index, 206167606016 : index, 206267286336 : index, 206366966656 : index, 206466646976 : index, 206566327296 : index, 206666007616 : index, 206765687936 : index, 206865368256 : index, 206965048576 : index, 207064728896 : index, 207164409216 : index, 207264089536 : index, 207363769856 : index, 207463450176 : index, 207563130496 : index, 207662810816 : index, 207762491136 : index, 207862171456 : index, 207961851776 : index, 208061532096 : index, 208161212416 : index, 208260892736 : index, 208360573056 : index, 208460253376 : index, 208559933696 : index, 208659614016 : index, 208759294336 : index, 208858974656 : index, 208958654976 : index, 209058335296 : index, 209158015616 : index, 209257695936 : index, 209357376256 : index, 209457056576 : index, 209556736896 : index, 209656417216 : index, 209756097536 : index, 209855777856 : index, 209955458176 : index, 210055138496 : index, 210154818816 : index, 210254499136 : index, 210354179456 : index, 210453859776 : index, 210553540096 : index, 210653220416 : index, 210752900736 : index, 210852581056 : index, 210952261376 : index, 211051941696 : index, 211151622016 : index, 211251302336 : index, 211350982656 : index, 211450662976 : index, 211550343296 : index, 211650023616 : index, 211749703936 : index, 211849384256 : index, 211949064576 : index, 212048744896 : index, 212148425216 : index, 212248105536 : index, 212347785856 : index, 212447466176 : index, 212547146496 : index, 212646826816 : index, 212746507136 : index, 212846187456 : index, 212945867776 : index, 213045548096 : index, 213145228416 : index, 213244908736 : index, 213344589056 : index, 213444269376 : index, 213543949696 : index, 213643630016 : index, 213743310336 : index, 213842990656 : index, 213942670976 : index, 214042351296 : index, 214142031616 : index, 214241711936 : index, 214341392256 : index, 214441072576 : index, 214540752896 : index, 214640433216 : index, 214740113536 : index, 214839793856 : index, 214939474176 : index, 215039154496 : index, 215138834816 : index, 215238515136 : index, 215338195456 : index, 215437875776 : index, 215537556096 : index, 215637236416 : index, 215736916736 : index, 215836597056 : index, 215936277376 : index, 216035957696 : index, 216135638016 : index, 216235318336 : index, 216334998656 : index, 216434678976 : index, 216534359296 : index, 216634039616 : index, 216733719936 : index, 216833400256 : index, 216933080576 : index, 217032760896 : index, 217132441216 : index, 217232121536 : index, 217331801856 : index, 217431482176 : index, 217531162496 : index]} : i64 to index | |
| %31 = arith.extui %8 : i32 to i64 | |
| %32 = arith.extui %9 : i32 to i64 | |
| %33 = arith.shli %32, %c32_i64 : i64 | |
| %34 = arith.ori %31, %33 : i64 | |
| %35 = arith.index_castui %34 : i64 to index | |
| %36 = arith.index_castui %10 : i32 to index | |
| %37:6 = util.assume.int | |
| %15<umin = 21037056, umax = 86146744320>, | |
| %20<umin = 20971520, umax = 85878374400>, | |
| %25[<umin = 721420288, umax = 721420288, udiv = 721420288>, <umin = 2315255808, umax = 2315255808, udiv = 2315255808>, <umin = 3909091328, umax = 3909091328, udiv = 3909091328>, <umin = 5502926848, umax = 5502926848, udiv = 5502926848>, <umin = 7096762368, umax = 7096762368, udiv = 7096762368>, <umin = 8690597888, umax = 8690597888, udiv = 8690597888>, <umin = 10284433408, umax = 10284433408, udiv = 10284433408>, <umin = 11878268928, umax = 11878268928, udiv = 11878268928>, <umin = 13472104448, umax = 13472104448, udiv = 13472104448>, <umin = 15065939968, umax = 15065939968, udiv = 15065939968>, <umin = 16659775488, umax = 16659775488, udiv = 16659775488>, <umin = 18253611008, umax = 18253611008, udiv = 18253611008>, <umin = 19847446528, umax = 19847446528, udiv = 19847446528>, <umin = 21441282048, umax = 21441282048, udiv = 21441282048>, <umin = 23035117568, umax = 23035117568, udiv = 23035117568>, <umin = 24628953088, umax = 24628953088, udiv = 24628953088>, <umin = 26222788608, umax = 26222788608, udiv = 26222788608>, <umin = 27816624128, umax = 27816624128, udiv = 27816624128>, <umin = 29410459648, umax = 29410459648, udiv = 29410459648>, <umin = 31004295168, umax = 31004295168, udiv = 31004295168>, <umin = 32598130688, umax = 32598130688, udiv = 32598130688>, <umin = 34191966208, umax = 34191966208, udiv = 34191966208>, <umin = 35785801728, umax = 35785801728, udiv = 35785801728>, <umin = 37379637248, umax = 37379637248, udiv = 37379637248>, <umin = 38973472768, umax = 38973472768, udiv = 38973472768>, <umin = 40567308288, umax = 40567308288, udiv = 40567308288>, <umin = 42161143808, umax = 42161143808, udiv = 42161143808>, <umin = 43754979328, umax = 43754979328, udiv = 43754979328>, <umin = 45348814848, umax = 45348814848, udiv = 45348814848>, <umin = 46942650368, umax = 46942650368, udiv = 46942650368>, <umin = 48536485888, umax = 48536485888, udiv = 48536485888>, <umin = 50130321408, umax = 50130321408, udiv = 50130321408>, <umin = 51724156928, umax = 51724156928, udiv = 51724156928>, <umin = 53317992448, umax = 53317992448, udiv = 53317992448>, <umin = 54911827968, umax = 54911827968, udiv = 54911827968>, <umin = 56505663488, umax = 56505663488, udiv = 56505663488>, <umin = 58099499008, umax = 58099499008, udiv = 58099499008>, <umin = 59693334528, umax = 59693334528, udiv = 59693334528>, <umin = 61287170048, umax = 61287170048, udiv = 61287170048>, <umin = 62881005568, umax = 62881005568, udiv = 62881005568>, <umin = 64474841088, umax = 64474841088, udiv = 64474841088>, <umin = 66068676608, umax = 66068676608, udiv = 66068676608>, <umin = 67662512128, umax = 67662512128, udiv = 67662512128>, <umin = 69256347648, umax = 69256347648, udiv = 69256347648>, <umin = 70850183168, umax = 70850183168, udiv = 70850183168>, <umin = 72444018688, umax = 72444018688, udiv = 72444018688>, <umin = 74037854208, umax = 74037854208, udiv = 74037854208>, <umin = 75631689728, umax = 75631689728, udiv = 75631689728>, <umin = 77225525248, umax = 77225525248, udiv = 77225525248>, <umin = 78819360768, umax = 78819360768, udiv = 78819360768>, <umin = 80413196288, umax = 80413196288, udiv = 80413196288>, <umin = 82007031808, umax = 82007031808, udiv = 82007031808>, <umin = 83600867328, umax = 83600867328, udiv = 83600867328>, <umin = 85194702848, umax = 85194702848, udiv = 85194702848>, <umin = 86788538368, umax = 86788538368, udiv = 86788538368>, <umin = 88382373888, umax = 88382373888, udiv = 88382373888>, <umin = 89976209408, umax = 89976209408, udiv = 89976209408>, <umin = 91570044928, umax = 91570044928, udiv = 91570044928>, <umin = 93163880448, umax = 93163880448, udiv = 93163880448>, <umin = 94757715968, umax = 94757715968, udiv = 94757715968>, <umin = 96351551488, umax = 96351551488, udiv = 96351551488>, <umin = 97945387008, umax = 97945387008, udiv = 97945387008>, <umin = 99539222528, umax = 99539222528, udiv = 99539222528>, <umin = 101133058048, umax = 101133058048, udiv = 101133058048>, <umin = 102726893568, umax = 102726893568, udiv = 102726893568>, <umin = 104320729088, umax = 104320729088, udiv = 104320729088>, <umin = 105914564608, umax = 105914564608, udiv = 105914564608>, <umin = 107508400128, umax = 107508400128, udiv = 107508400128>, <umin = 109102235648, umax = 109102235648, udiv = 109102235648>, <umin = 110696071168, umax = 110696071168, udiv = 110696071168>, <umin = 112289906688, umax = 112289906688, udiv = 112289906688>, <umin = 113883742208, umax = 113883742208, udiv = 113883742208>, <umin = 115477577728, umax = 115477577728, udiv = 115477577728>, <umin = 117071413248, umax = 117071413248, udiv = 117071413248>, <umin = 118665248768, umax = 118665248768, udiv = 118665248768>, <umin = 120259084288, umax = 120259084288, udiv = 120259084288>, <umin = 121852919808, umax = 121852919808, udiv = 121852919808>, <umin = 123446755328, umax = 123446755328, udiv = 123446755328>, <umin = 125040590848, umax = 125040590848, udiv = 125040590848>, <umin = 126634426368, umax = 126634426368, udiv = 126634426368>, <umin = 128228261888, umax = 128228261888, udiv = 128228261888>, <umin = 129822097408, umax = 129822097408, udiv = 129822097408>, <umin = 131415932928, umax = 131415932928, udiv = 131415932928>, <umin = 133009768448, umax = 133009768448, udiv = 133009768448>, <umin = 134603603968, umax = 134603603968, udiv = 134603603968>, <umin = 136197439488, umax = 136197439488, udiv = 136197439488>, <umin = 137791275008, umax = 137791275008, udiv = 137791275008>, <umin = 139385110528, umax = 139385110528, udiv = 139385110528>, <umin = 140978946048, umax = 140978946048, udiv = 140978946048>, <umin = 142572781568, umax = 142572781568, udiv = 142572781568>, <umin = 144166617088, umax = 144166617088, udiv = 144166617088>, <umin = 145760452608, umax = 145760452608, udiv = 145760452608>, <umin = 147354288128, umax = 147354288128, udiv = 147354288128>, <umin = 148948123648, umax = 148948123648, udiv = 148948123648>, <umin = 150541959168, umax = 150541959168, udiv = 150541959168>, <umin = 152135794688, umax = 152135794688, udiv = 152135794688>, <umin = 153729630208, umax = 153729630208, udiv = 153729630208>, <umin = 155323465728, umax = 155323465728, udiv = 155323465728>, <umin = 156917301248, umax = 156917301248, udiv = 156917301248>, <umin = 158511136768, umax = 158511136768, udiv = 158511136768>, <umin = 160104972288, umax = 160104972288, udiv = 160104972288>, <umin = 161698807808, umax = 161698807808, udiv = 161698807808>, <umin = 163292643328, umax = 163292643328, udiv = 163292643328>, <umin = 164886478848, umax = 164886478848, udiv = 164886478848>, <umin = 166480314368, umax = 166480314368, udiv = 166480314368>, <umin = 168074149888, umax = 168074149888, udiv = 168074149888>, <umin = 169667985408, umax = 169667985408, udiv = 169667985408>, <umin = 171261820928, umax = 171261820928, udiv = 171261820928>, <umin = 172855656448, umax = 172855656448, udiv = 172855656448>, <umin = 174449491968, umax = 174449491968, udiv = 174449491968>, <umin = 176043327488, umax = 176043327488, udiv = 176043327488>, <umin = 177637163008, umax = 177637163008, udiv = 177637163008>, <umin = 179230998528, umax = 179230998528, udiv = 179230998528>, <umin = 180824834048, umax = 180824834048, udiv = 180824834048>, <umin = 182418669568, umax = 182418669568, udiv = 182418669568>, <umin = 184012505088, umax = 184012505088, udiv = 184012505088>, <umin = 185606340608, umax = 185606340608, udiv = 185606340608>, <umin = 187200176128, umax = 187200176128, udiv = 187200176128>, <umin = 188794011648, umax = 188794011648, udiv = 188794011648>, <umin = 190387847168, umax = 190387847168, udiv = 190387847168>, <umin = 191981682688, umax = 191981682688, udiv = 191981682688>, <umin = 193575518208, umax = 193575518208, udiv = 193575518208>, <umin = 195169353728, umax = 195169353728, udiv = 195169353728>, <umin = 196763189248, umax = 196763189248, udiv = 196763189248>, <umin = 198357024768, umax = 198357024768, udiv = 198357024768>, <umin = 199950860288, umax = 199950860288, udiv = 199950860288>], | |
| %30[<umin = 205071122496, umax = 205071122496, udiv = 205071122496>, <umin = 205170802816, umax = 205170802816, udiv = 205170802816>, <umin = 205270483136, umax = 205270483136, udiv = 205270483136>, <umin = 205370163456, umax = 205370163456, udiv = 205370163456>, <umin = 205469843776, umax = 205469843776, udiv = 205469843776>, <umin = 205569524096, umax = 205569524096, udiv = 205569524096>, <umin = 205669204416, umax = 205669204416, udiv = 205669204416>, <umin = 205768884736, umax = 205768884736, udiv = 205768884736>, <umin = 205868565056, umax = 205868565056, udiv = 205868565056>, <umin = 205968245376, umax = 205968245376, udiv = 205968245376>, <umin = 206067925696, umax = 206067925696, udiv = 206067925696>, <umin = 206167606016, umax = 206167606016, udiv = 206167606016>, <umin = 206267286336, umax = 206267286336, udiv = 206267286336>, <umin = 206366966656, umax = 206366966656, udiv = 206366966656>, <umin = 206466646976, umax = 206466646976, udiv = 206466646976>, <umin = 206566327296, umax = 206566327296, udiv = 206566327296>, <umin = 206666007616, umax = 206666007616, udiv = 206666007616>, <umin = 206765687936, umax = 206765687936, udiv = 206765687936>, <umin = 206865368256, umax = 206865368256, udiv = 206865368256>, <umin = 206965048576, umax = 206965048576, udiv = 206965048576>, <umin = 207064728896, umax = 207064728896, udiv = 207064728896>, <umin = 207164409216, umax = 207164409216, udiv = 207164409216>, <umin = 207264089536, umax = 207264089536, udiv = 207264089536>, <umin = 207363769856, umax = 207363769856, udiv = 207363769856>, <umin = 207463450176, umax = 207463450176, udiv = 207463450176>, <umin = 207563130496, umax = 207563130496, udiv = 207563130496>, <umin = 207662810816, umax = 207662810816, udiv = 207662810816>, <umin = 207762491136, umax = 207762491136, udiv = 207762491136>, <umin = 207862171456, umax = 207862171456, udiv = 207862171456>, <umin = 207961851776, umax = 207961851776, udiv = 207961851776>, <umin = 208061532096, umax = 208061532096, udiv = 208061532096>, <umin = 208161212416, umax = 208161212416, udiv = 208161212416>, <umin = 208260892736, umax = 208260892736, udiv = 208260892736>, <umin = 208360573056, umax = 208360573056, udiv = 208360573056>, <umin = 208460253376, umax = 208460253376, udiv = 208460253376>, <umin = 208559933696, umax = 208559933696, udiv = 208559933696>, <umin = 208659614016, umax = 208659614016, udiv = 208659614016>, <umin = 208759294336, umax = 208759294336, udiv = 208759294336>, <umin = 208858974656, umax = 208858974656, udiv = 208858974656>, <umin = 208958654976, umax = 208958654976, udiv = 208958654976>, <umin = 209058335296, umax = 209058335296, udiv = 209058335296>, <umin = 209158015616, umax = 209158015616, udiv = 209158015616>, <umin = 209257695936, umax = 209257695936, udiv = 209257695936>, <umin = 209357376256, umax = 209357376256, udiv = 209357376256>, <umin = 209457056576, umax = 209457056576, udiv = 209457056576>, <umin = 209556736896, umax = 209556736896, udiv = 209556736896>, <umin = 209656417216, umax = 209656417216, udiv = 209656417216>, <umin = 209756097536, umax = 209756097536, udiv = 209756097536>, <umin = 209855777856, umax = 209855777856, udiv = 209855777856>, <umin = 209955458176, umax = 209955458176, udiv = 209955458176>, <umin = 210055138496, umax = 210055138496, udiv = 210055138496>, <umin = 210154818816, umax = 210154818816, udiv = 210154818816>, <umin = 210254499136, umax = 210254499136, udiv = 210254499136>, <umin = 210354179456, umax = 210354179456, udiv = 210354179456>, <umin = 210453859776, umax = 210453859776, udiv = 210453859776>, <umin = 210553540096, umax = 210553540096, udiv = 210553540096>, <umin = 210653220416, umax = 210653220416, udiv = 210653220416>, <umin = 210752900736, umax = 210752900736, udiv = 210752900736>, <umin = 210852581056, umax = 210852581056, udiv = 210852581056>, <umin = 210952261376, umax = 210952261376, udiv = 210952261376>, <umin = 211051941696, umax = 211051941696, udiv = 211051941696>, <umin = 211151622016, umax = 211151622016, udiv = 211151622016>, <umin = 211251302336, umax = 211251302336, udiv = 211251302336>, <umin = 211350982656, umax = 211350982656, udiv = 211350982656>, <umin = 211450662976, umax = 211450662976, udiv = 211450662976>, <umin = 211550343296, umax = 211550343296, udiv = 211550343296>, <umin = 211650023616, umax = 211650023616, udiv = 211650023616>, <umin = 211749703936, umax = 211749703936, udiv = 211749703936>, <umin = 211849384256, umax = 211849384256, udiv = 211849384256>, <umin = 211949064576, umax = 211949064576, udiv = 211949064576>, <umin = 212048744896, umax = 212048744896, udiv = 212048744896>, <umin = 212148425216, umax = 212148425216, udiv = 212148425216>, <umin = 212248105536, umax = 212248105536, udiv = 212248105536>, <umin = 212347785856, umax = 212347785856, udiv = 212347785856>, <umin = 212447466176, umax = 212447466176, udiv = 212447466176>, <umin = 212547146496, umax = 212547146496, udiv = 212547146496>, <umin = 212646826816, umax = 212646826816, udiv = 212646826816>, <umin = 212746507136, umax = 212746507136, udiv = 212746507136>, <umin = 212846187456, umax = 212846187456, udiv = 212846187456>, <umin = 212945867776, umax = 212945867776, udiv = 212945867776>, <umin = 213045548096, umax = 213045548096, udiv = 213045548096>, <umin = 213145228416, umax = 213145228416, udiv = 213145228416>, <umin = 213244908736, umax = 213244908736, udiv = 213244908736>, <umin = 213344589056, umax = 213344589056, udiv = 213344589056>, <umin = 213444269376, umax = 213444269376, udiv = 213444269376>, <umin = 213543949696, umax = 213543949696, udiv = 213543949696>, <umin = 213643630016, umax = 213643630016, udiv = 213643630016>, <umin = 213743310336, umax = 213743310336, udiv = 213743310336>, <umin = 213842990656, umax = 213842990656, udiv = 213842990656>, <umin = 213942670976, umax = 213942670976, udiv = 213942670976>, <umin = 214042351296, umax = 214042351296, udiv = 214042351296>, <umin = 214142031616, umax = 214142031616, udiv = 214142031616>, <umin = 214241711936, umax = 214241711936, udiv = 214241711936>, <umin = 214341392256, umax = 214341392256, udiv = 214341392256>, <umin = 214441072576, umax = 214441072576, udiv = 214441072576>, <umin = 214540752896, umax = 214540752896, udiv = 214540752896>, <umin = 214640433216, umax = 214640433216, udiv = 214640433216>, <umin = 214740113536, umax = 214740113536, udiv = 214740113536>, <umin = 214839793856, umax = 214839793856, udiv = 214839793856>, <umin = 214939474176, umax = 214939474176, udiv = 214939474176>, <umin = 215039154496, umax = 215039154496, udiv = 215039154496>, <umin = 215138834816, umax = 215138834816, udiv = 215138834816>, <umin = 215238515136, umax = 215238515136, udiv = 215238515136>, <umin = 215338195456, umax = 215338195456, udiv = 215338195456>, <umin = 215437875776, umax = 215437875776, udiv = 215437875776>, <umin = 215537556096, umax = 215537556096, udiv = 215537556096>, <umin = 215637236416, umax = 215637236416, udiv = 215637236416>, <umin = 215736916736, umax = 215736916736, udiv = 215736916736>, <umin = 215836597056, umax = 215836597056, udiv = 215836597056>, <umin = 215936277376, umax = 215936277376, udiv = 215936277376>, <umin = 216035957696, umax = 216035957696, udiv = 216035957696>, <umin = 216135638016, umax = 216135638016, udiv = 216135638016>, <umin = 216235318336, umax = 216235318336, udiv = 216235318336>, <umin = 216334998656, umax = 216334998656, udiv = 216334998656>, <umin = 216434678976, umax = 216434678976, udiv = 216434678976>, <umin = 216534359296, umax = 216534359296, udiv = 216534359296>, <umin = 216634039616, umax = 216634039616, udiv = 216634039616>, <umin = 216733719936, umax = 216733719936, udiv = 216733719936>, <umin = 216833400256, umax = 216833400256, udiv = 216833400256>, <umin = 216933080576, umax = 216933080576, udiv = 216933080576>, <umin = 217032760896, umax = 217032760896, udiv = 217032760896>, <umin = 217132441216, umax = 217132441216, udiv = 217132441216>, <umin = 217232121536, umax = 217232121536, udiv = 217232121536>, <umin = 217331801856, umax = 217331801856, udiv = 217331801856>, <umin = 217431482176, umax = 217431482176, udiv = 217431482176>, <umin = 217531162496, umax = 217531162496, udiv = 217531162496>], | |
| %35<umin = 27205760, umax = 111407325184>, | |
| %36<umin = 128, umax = 524160, udiv = 128> | |
| : index, index, index, index, index, index | |
| %38 = hal.interface.binding.subspan layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%37#2) flags(ReadOnly) : memref<53248x512x32xf4E2M1FN, strided<[16384, 32, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
| %39 = amdgpu.fat_raw_buffer_cast %38 resetOffset : memref<53248x512x32xf4E2M1FN, strided<[16384, 32, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<53248x512x32xf4E2M1FN, #amdgpu.address_space<fat_raw_buffer>> | |
| %40 = hal.interface.binding.subspan layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%37#3) flags(ReadOnly) : memref<53248x512xf8E8M0FNU, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
| %41 = amdgpu.fat_raw_buffer_cast %40 resetOffset : memref<53248x512xf8E8M0FNU, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<53248x512xf8E8M0FNU, #amdgpu.address_space<fat_raw_buffer>> | |
| %42 = iree_tensor_ext.dispatch.workload.ordinal %37#5, 0 : index | |
| %43 = hal.interface.binding.subspan layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%37#0) flags("ReadOnly|Indirect") : memref<?x512x32xf4E2M1FN, strided<[16384, 32, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%42} | |
| %44 = hal.interface.binding.subspan layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%37#1) flags("ReadOnly|Indirect") : memref<?x512xf8E8M0FNU, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%42} | |
| %45 = hal.interface.binding.subspan layout(<constants = 11, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%37#4) flags(Indirect) : memref<?x53248xf16, strided<[53248, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%42} | |
| %46 = iree_codegen.load_from_buffer %39 : memref<53248x512x32xf4E2M1FN, #amdgpu.address_space<fat_raw_buffer>> -> tensor<53248x512x32xf4E2M1FN> | |
| %47 = iree_codegen.load_from_buffer %41 : memref<53248x512xf8E8M0FNU, #amdgpu.address_space<fat_raw_buffer>> -> tensor<53248x512xf8E8M0FNU> | |
| %48 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%42] | |
| %expand_shape = memref.expand_shape %45 [[0, 1], [2]] output_shape [%48, 128, 53248] : memref<?x53248xf16, strided<[53248, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<?x128x53248xf16, strided<[6815744, 53248, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
| %expand_shape_0 = memref.expand_shape %44 [[0, 1], [2]] output_shape [%48, 128, 512] : memref<?x512xf8E8M0FNU, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<?x128x512xf8E8M0FNU, strided<[65536, 512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
| %expand_shape_1 = memref.expand_shape %43 [[0, 1], [2], [3]] output_shape [%48, 128, 512, 32] : memref<?x512x32xf4E2M1FN, strided<[16384, 32, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> into memref<?x128x512x32xf4E2M1FN, strided<[2097152, 16384, 32, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
| %49 = iree_codegen.load_from_buffer %expand_shape_1 : memref<?x128x512x32xf4E2M1FN, strided<[2097152, 16384, 32, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> -> tensor<?x128x512x32xf4E2M1FN> | |
| %50 = iree_codegen.load_from_buffer %expand_shape_0 : memref<?x128x512xf8E8M0FNU, strided<[65536, 512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> -> tensor<?x128x512xf8E8M0FNU> | |
| %51 = tensor.empty(%48) : tensor<?x128x53248xf32> | |
| %52 = linalg.fill ins(%cst : f32) outs(%51 : tensor<?x128x53248xf32>) -> tensor<?x128x53248xf32> | |
| %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%49, %46, %50, %47 : tensor<?x128x512x32xf4E2M1FN>, tensor<53248x512x32xf4E2M1FN>, tensor<?x128x512xf8E8M0FNU>, tensor<53248x512xf8E8M0FNU>) outs(%52 : tensor<?x128x53248xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, promote_operands = [0, 1, 2, 3], reduction = [0, 0, 0, 4, 1], subgroup = [0, 2, 4, 0, 0], workgroup = [1, 64, 128, 0, 0]}>} { | |
| ^bb0(%in: f4E2M1FN, %in_2: f4E2M1FN, %in_3: f8E8M0FNU, %in_4: f8E8M0FNU, %out: f32): | |
| %56 = arith.scaling_extf %in, %in_3 : f4E2M1FN, f8E8M0FNU to f32 | |
| %57 = arith.scaling_extf %in_2, %in_4 : f4E2M1FN, f8E8M0FNU to f32 | |
| %58 = arith.mulf %56, %57 : f32 | |
| %59 = arith.addf %out, %58 : f32 | |
| linalg.yield %59 : f32 | |
| } -> tensor<?x128x53248xf32> | |
| %54 = tensor.empty(%48) : tensor<?x128x53248xf16> | |
| %55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%53 : tensor<?x128x53248xf32>) outs(%54 : tensor<?x128x53248xf16>) { | |
| ^bb0(%in: f32, %out: f16): | |
| %56 = arith.truncf %in : f32 to f16 | |
| linalg.yield %56 : f16 | |
| } -> tensor<?x128x53248xf16> | |
| iree_codegen.store_to_buffer %55, %expand_shape : tensor<?x128x53248xf16> into memref<?x128x53248xf16, strided<[6815744, 53248, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
| return | |
| } | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment