Created
February 24, 2023 20:06
-
-
Save kuhar/f18f9772013871c6327a57cffda0769d to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // -----// IR Dump After TosaToSCF (tosa-to-scf) //----- // | |
| func.func @main() { | |
| %0 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8> | |
| %1 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq_const(%4, dense<1024> : tensor<1024x1024xi32>) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TopLevelSCFToCFG (iree-top-level-scf-to-cfg) //----- // | |
| func.func @main() { | |
| %0 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8> | |
| %1 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq_const(%4, dense<1024> : tensor<1024x1024xi32>) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Inliner (inline) //----- // | |
| module { | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToArith (tosa-to-arith) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToTensor (tosa-to-tensor) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToLinalgExt (iree-tosa-to-linalg-ext) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaOptionalDecompositions (tosa-optional-decompositions) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToLinalgNamed (tosa-to-linalg-named) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaLayerwiseConstantFoldPass (tosa-layerwise-constant-fold) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaValidation (tosa-validate) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToArith (tosa-to-arith) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After TosaToTensor (tosa-to-tensor) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After StripSignedness (iree-flow-strip-signedness) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-linalg-quantized-matmul-to-matmul) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After LinalgQuantizedConvToConvPass (iree-linalg-quantized-conv-to-conv) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After VerifyCompilerTOSAInputLegality (iree-tosa-verify-compiler-input-legality) //----- // | |
| module { | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IREEImportPublic (iree-import-public) //----- // | |
| module { | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- // | |
| module { | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- // | |
| module { | |
| func.func @main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Inliner (inline) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After EraseUnusedLinalgOperands (iree-flow-erase-unused-linalg-operands) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ExpandTensorShapes (iree-flow-expand-tensor-shapes) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After RaiseSpecialOps (iree-flow-raise-special-ops) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After RaiseSpecialOps (iree-flow-raise-special-ops) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After LinalgDetensorize (linalg-detensorize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After LinalgDetensorize (linalg-detensorize) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- // | |
| func.func private @_main() { | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %c1 = arith.constant 1 : index | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1_1 = arith.constant 1 : index | |
| %4 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %c1024, %c1_1] | |
| %c0_2 = arith.constant 0 : index | |
| %c1024_3 = arith.constant 1024 : index | |
| %c1_4 = arith.constant 1 : index | |
| %5 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_2, %c1024_3, %c1_4] | |
| %c0_5 = arith.constant 0 : index | |
| %c1_6 = arith.constant 1 : index | |
| %6 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_5, %c1, %c1_6] | |
| %7 = flow.dispatch.region[%4, %5, %6] -> (tensor<1024x1024xi32>) { | |
| %8 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.return %8 : tensor<1024x1024xi32> | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%7, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = flow.dispatch.region[%c1024, %c1024, %c1] -> (tensor<1024x1024xi32>) { | |
| %5 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.return %5 : tensor<1024x1024xi32> | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%4, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| flow.return | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| flow.return | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| flow.return | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| flow.return | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> = | |
| (%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| flow.return | |
| } count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- // | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| // -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After VerifyInput (iree-stream-verify-input) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After OutlineConstants (iree-stream-outline-constants) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| flow.executable private @_main_dispatch_0 { | |
| flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| flow.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %2 = tensor.empty() : tensor<1024x1024xi32> | |
| %3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = arith.constant dense<1024> : tensor<1024x1024xi32> | |
| %cst_0 = arith.constant dense<1> : tensor<1024x1024xi8> | |
| %0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8> | |
| %2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> | |
| check.expect_eq(%2, %cst) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ConvertToStream (iree-stream-conversion) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = stream.tensor.constant : tensor<1024x1024xi32> in !stream.resource<constant> = dense<1024> : tensor<1024x1024xi32> | |
| %0 = stream.resource.size %cst : !stream.resource<constant> | |
| %1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0} | |
| %cst_0 = stream.tensor.constant : tensor<1024x1024xi8> in !stream.resource<constant> = dense<1> : tensor<1024x1024xi8> | |
| %2 = stream.resource.size %cst_0 : !stream.resource<constant> | |
| %3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %c0 = arith.constant 0 : index | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
| %10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8} | |
| %11 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%8} -> tensor<1024x1024xi32> | |
| %12 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %13 = stream.tensor.export %12 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%11, %13) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %cst = stream.tensor.constant : tensor<1024x1024xi32> in !stream.resource<constant> = dense<1024> : tensor<1024x1024xi32> | |
| %0 = stream.resource.size %cst : !stream.resource<constant> | |
| %1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0} | |
| %cst_0 = stream.tensor.constant : tensor<1024x1024xi8> in !stream.resource<constant> = dense<1> : tensor<1024x1024xi8> | |
| %2 = stream.resource.size %cst_0 : !stream.resource<constant> | |
| %3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %c0 = arith.constant 0 : index | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
| %10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8} | |
| %11 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%8} -> tensor<1024x1024xi32> | |
| %12 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %13 = stream.tensor.export %12 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%11, %13) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8} | |
| %10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8} | |
| %11 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%8} -> tensor<1024x1024xi32> | |
| %12 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %13 = stream.tensor.export %12 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%11, %13) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.tensor.sizeof tensor<1024x1024xi32> : index | |
| %1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0} | |
| %2 = stream.tensor.sizeof tensor<1024x1024xi8> : index | |
| %3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2} | |
| %4 = util.optimization_barrier %3 : !stream.resource<*> | |
| %5 = util.optimization_barrier %3 : !stream.resource<*> | |
| %6 = stream.resource.size %4 : !stream.resource<*> | |
| %7 = stream.resource.size %5 : !stream.resource<*> | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| %11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
| %12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %1 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %2 = util.optimization_barrier %1 : !stream.resource<*> | |
| %3 = util.optimization_barrier %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.resource.size %3 : !stream.resource<*> | |
| %6 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%2[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
| %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %9 = stream.async.transfer %0 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%8, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After EncodeDeviceTensors (iree-stream-encode-device-tensors) //----- // | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| // -----// IR Dump After MaterializeBuiltins (iree-stream-materialize-builtins) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %1 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %2 = util.optimization_barrier %1 : !stream.resource<*> | |
| %3 = util.optimization_barrier %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.resource.size %3 : !stream.resource<*> | |
| %6 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%2[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304} | |
| %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %9 = stream.async.transfer %0 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%8, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ElideAsyncCopies (iree-stream-elide-async-copies) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<*> | |
| %2 = util.optimization_barrier %0 : !stream.resource<*> | |
| %3 = stream.resource.size %1 : !stream.resource<*> | |
| %4 = stream.resource.size %2 : !stream.resource<*> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304} | |
| %6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304} | |
| %9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After RefineUsage (iree-stream-refine-usage) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| %8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%6, %8) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg0[%c0 to %3 for %3], %arg1[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %9 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %8, %9 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %11 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %11 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %5 = stream.timepoint.immediate => !stream.timepoint | |
| %6 = stream.timepoint.immediate => !stream.timepoint | |
| %7 = stream.timepoint.immediate => !stream.timepoint | |
| %results_0:2, %result_timepoint_1 = stream.async.execute await(%7) => with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %11:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %12 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %13 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %12, %13 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %11#0, %11#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %8:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %9 = stream.tensor.export %8#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %10 = stream.tensor.export %8#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%10, %9) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} { | |
| %8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576} | |
| stream.yield %8 : !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576} | |
| %1 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %2 = util.optimization_barrier %0 : !stream.resource<transient> | |
| %3 = stream.resource.size %1 : !stream.resource<transient> | |
| %4 = stream.resource.size %2 : !stream.resource<transient> | |
| %results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) { | |
| %9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304} | |
| %10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304} | |
| stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } | |
| stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| } => !stream.timepoint | |
| %5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%7, %6) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %c0_0 = arith.constant 0 : index | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %c0_1 = arith.constant 0 : index | |
| %7:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7#0 as %arg2: !stream.resource<external>{%c4194304}, %7#1 as %arg3: !stream.resource<external>{%c4194304}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304} | |
| } | |
| } => !stream.timepoint | |
| %9:2 = stream.timepoint.await %8 => %7#1, %7#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %11 = stream.tensor.export %9#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%11, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %c0_0 = arith.constant 0 : index | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %c0_1 = arith.constant 0 : index | |
| %7:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7#0 as %arg2: !stream.resource<external>{%c4194304}, %7#1 as %arg3: !stream.resource<external>{%c4194304}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304} | |
| } | |
| } => !stream.timepoint | |
| %9:2 = stream.timepoint.await %8 => %7#1, %7#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %10 = stream.tensor.export %9#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %11 = stream.tensor.export %9#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%11, %10) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %c0_0 = arith.constant 0 : index | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %c0_1 = arith.constant 0 : index | |
| %7:3 = stream.resource.pack slices({ | |
| [0, 0] = %c4194304, | |
| [0, 0] = %c4194304 | |
| }) : index | |
| %8 = stream.resource.alloc uninitialized : !stream.resource<external>{%7#0} | |
| %9 = stream.resource.subview %8[%7#1] : !stream.resource<external>{%7#0} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.resource.subview %8[%7#2] : !stream.resource<external>{%7#0} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %9 as %arg2: !stream.resource<external>{%c4194304}, %10 as %arg3: !stream.resource<external>{%c4194304}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304} | |
| } | |
| } => !stream.timepoint | |
| %12:2 = stream.timepoint.await %11 => %10, %9 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %13 = stream.tensor.export %12#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %14 = stream.tensor.export %12#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%14, %13) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- // | |
| func.func private @_main() { | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %c0_0 = arith.constant 0 : index | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %c0_1 = arith.constant 0 : index | |
| %c0_2 = arith.constant 0 : index | |
| %c4194304_3 = arith.constant 4194304 : index | |
| %c4194304_4 = arith.constant 4194304 : index | |
| %c8388608 = arith.constant 8388608 : index | |
| %c8388608_5 = arith.constant 8388608 : index | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608_5} | |
| %8 = stream.resource.subview %7[%c0_2] : !stream.resource<external>{%c8388608_5} -> !stream.resource<external>{%c4194304} | |
| %9 = stream.resource.subview %7[%c4194304_4] : !stream.resource<external>{%c8388608_5} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %8 as %arg2: !stream.resource<external>{%c4194304}, %9 as %arg3: !stream.resource<external>{%c4194304}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304} | |
| } | |
| } => !stream.timepoint | |
| %11:2 = stream.timepoint.await %10 => %9, %8 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %11#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0_0 = arith.constant 0 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %c0_1 = arith.constant 0 : index | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0_1 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %c0_2 = arith.constant 0 : index | |
| %c0_3 = arith.constant 0 : index | |
| %c4194304_4 = arith.constant 4194304 : index | |
| %c4194304_5 = arith.constant 4194304 : index | |
| %c8388608 = arith.constant 8388608 : index | |
| %c8388608_6 = arith.constant 8388608 : index | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608_6} | |
| %8 = stream.resource.subview %7[%c0_3] : !stream.resource<external>{%c8388608_6} -> !stream.resource<external>{%c4194304} | |
| %9 = stream.resource.subview %7[%c4194304_5] : !stream.resource<external>{%c8388608_6} -> !stream.resource<external>{%c4194304} | |
| %10 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %8 as %arg2: !stream.resource<external>{%c4194304}, %9 as %arg3: !stream.resource<external>{%c4194304}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_2 for %c4194304] : !stream.resource<external>{%c4194304} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg3[%c0_2 for %c4194304] : i32 -> !stream.resource<external>{%c4194304} | |
| } | |
| } => !stream.timepoint | |
| %11:2 = stream.timepoint.await %10 => %9, %8 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %11#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After VerifyLoweringToCmd (iree-stream-verify-lowering-to-cmd) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ElideTimepoints (iree-stream-elide-timepoints) //----- // | |
| module attributes {iree.fixedpoint.iteration = 0 : index} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %c0_0 = arith.constant 0 : index | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0, %c0, %c0 : index, index, index) { | |
| ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %c0_0 = arith.constant 0 : index | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0, %c0, %c0 : index, index, index) { | |
| ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32) { | |
| %0 = arith.index_castui %arg3 {stream.values = [0 : index]} : i32 to index | |
| %1 = arith.index_castui %arg4 {stream.values = [0 : index]} : i32 to index | |
| %2 = arith.index_castui %arg5 {stream.values = [0 : index]} : i32 to index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0 = arith.constant 0 : index | |
| %3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %8 = tensor.empty() : tensor<1024x1024xi32> | |
| %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %c0_0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %c0_i32_1 = arith.constant 0 : i32 | |
| %c0_i32_2 = arith.constant 0 : i32 | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0_i32, %c0_i32_1, %c0_i32_2 : i32, i32, i32) { | |
| ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0_0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32) { | |
| %0 = arith.index_castui %arg3 {stream.values = [0 : index]} : i32 to index | |
| %1 = arith.index_castui %arg4 {stream.values = [0 : index]} : i32 to index | |
| %2 = arith.index_castui %arg5 {stream.values = [0 : index]} : i32 to index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %8 = tensor.empty() : tensor<1024x1024xi32> | |
| %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %c0_i32 = arith.constant 0 : i32 | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0_i32, %c0_i32, %c0_i32 : i32, i32, i32) { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index | |
| %1 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index | |
| %2 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index | |
| %c0_i32_0 = arith.constant 0 : i32 | |
| %3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %8 = tensor.empty() : tensor<1024x1024xi32> | |
| %9 = linalg.fill ins(%c0_i32_0 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %c0_i32 = arith.constant 0 : i32 | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After MemoizeChannels (iree-stream-memoize-channels) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index | |
| %1 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index | |
| %2 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index | |
| %c0_i32_0 = arith.constant 0 : i32 | |
| %3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %8 = tensor.empty() : tensor<1024x1024xi32> | |
| %9 = linalg.fill ins(%c0_i32_0 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %c0_i32 = arith.constant 0 : i32 | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After IPO (iree-util-ipo) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
| module { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- // | |
| #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> | |
| #device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}> | |
| module attributes {hal.device.targets = [#device_target_vulkan]} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) //----- // | |
| #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> | |
| #device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}> | |
| module attributes {hal.device.targets = [#device_target_vulkan]} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| stream.executable private @_main_dispatch_0 { | |
| stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) { | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2 | |
| stream.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // | |
| #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> | |
| #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]> | |
| #device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}> | |
| module attributes {hal.device.targets = [#device_target_vulkan]} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| hal.executable private @_main_dispatch_0 { | |
| hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb { | |
| hal.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 ordinal(0) layout(#pipeline_layout) { | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 | |
| hal.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After mlir::iree_compiler::IREE::HAL::DumpExecutableSourcesPass (iree-hal-dump-executable-sources) //----- // | |
| #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> | |
| #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]> | |
| #device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}> | |
| module attributes {hal.device.targets = [#device_target_vulkan]} { | |
| func.func @main() attributes {iree.abi.stub} { | |
| call @_main() : () -> () | |
| return | |
| } | |
| hal.executable private @_main_dispatch_0 { | |
| hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb { | |
| hal.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 ordinal(0) layout(#pipeline_layout) { | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 | |
| hal.return %x, %y, %z : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| } | |
| } | |
| func.func private @_main() { | |
| %c8388608 = arith.constant 8388608 : index | |
| %c0 = arith.constant 0 : index | |
| %c4194304 = arith.constant 4194304 : index | |
| %c1048576 = arith.constant 1048576 : index | |
| %c1_i8 = arith.constant 1 : i8 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c1024 = arith.constant 1024 : index | |
| %c1 = arith.constant 1 : index | |
| %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576} | |
| %1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) { | |
| stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576} | |
| } => !stream.timepoint | |
| %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576} | |
| %3 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %4 = util.optimization_barrier %2 : !stream.resource<transient> | |
| %5 = stream.resource.size %3 : !stream.resource<transient> | |
| %6 = stream.resource.size %4 : !stream.resource<transient> | |
| %7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608} | |
| %8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) { | |
| stream.cmd.concurrent { | |
| stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] { | |
| ro %arg0[%c0 for %5] : !stream.resource<transient>{%5}, | |
| ro %arg1[%c0 for %6] : !stream.resource<transient>{%6}, | |
| wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608} | |
| } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} | |
| stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608} | |
| } | |
| } => !stream.timepoint | |
| %9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608} | |
| %10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304} | |
| %12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| %13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32> | |
| check.expect_eq(%13, %12) : tensor<1024x1024xi32> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| // -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| } | |
| // -----// IR Dump After DecomposeSoftmax (iree-linalg-ext-decompose-softmax) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| // -----// IR Dump After RematerializeParallelOps (iree-codegen-rematerialize-parallel-ops) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8> | |
| %5 = tensor.empty() : tensor<1024x1024xi32> | |
| %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32> | |
| flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| return | |
| } | |
| // -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- // | |
| hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> { | |
| hal.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [8 : index, 2 : index, 1 : index]} { | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %c32 = arith.constant 32 : index | |
| %c128 = arith.constant 128 : index | |
| %c1 = arith.constant 1 : index | |
| hal.return %c32, %c128, %c1 : index, index, index | |
| } | |
| builtin.module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c8 = arith.constant 8 : index | |
| %c32 = arith.constant 32 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<?x1024xi8> | |
| %8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %c32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x?xi8> | |
| %9 = tensor.empty() : tensor<8x32xi32> | |
| %cast = tensor.cast %9 : tensor<8x32xi32> to tensor<?x?xi32> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%cast : tensor<?x?xi32>) -> tensor<?x?xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%7, %8 : tensor<?x1024xi8>, tensor<1024x?xi8>) outs(%10 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| } | |
| // -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c8 = arith.constant 8 : index | |
| %c32 = arith.constant 32 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<?x1024xi8> | |
| %8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %c32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x?xi8> | |
| %9 = tensor.empty() : tensor<8x32xi32> | |
| %cast = tensor.cast %9 : tensor<8x32xi32> to tensor<?x?xi32> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%cast : tensor<?x?xi32>) -> tensor<?x?xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%7, %8 : tensor<?x1024xi8>, tensor<1024x?xi8>) outs(%10 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c8 = arith.constant 8 : index | |
| %c32 = arith.constant 32 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<?x?xi32> | |
| %cast = tensor.cast %7 : tensor<?x?xi32> to tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<?x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %c32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x?xi8> | |
| %cast_0 = tensor.cast %cast : tensor<8x32xi32> to tensor<?x?xi32> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<?x1024xi8>, tensor<1024x?xi8>) outs(%10 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After SPIRVCreateFastSlowPath (iree-spirv-create-fast-slow-path) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| %11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32> | |
| flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After SPIRVTile (iree-spirv-tile) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice : tensor<4x4xi32>) -> tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8> | |
| %extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8> | |
| %14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<4x4xi8>, tensor<4x4xi8>) outs(%arg7 : tensor<4x4xi32>) -> tensor<4x4xi32> | |
| scf.yield %14 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice : tensor<4x4xi32>) -> tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8> | |
| %extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8> | |
| %14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<4x4xi8>, tensor<4x4xi8>) outs(%arg7 : tensor<4x4xi32>) -> tensor<4x4xi32> | |
| scf.yield %14 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice : tensor<4x4xi32>) -> tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8> | |
| %extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8> | |
| %14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<4x4xi8>, tensor<4x4xi8>) outs(%arg7 : tensor<4x4xi32>) -> tensor<4x4xi32> | |
| scf.yield %14 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| --- After vectorization --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8> | |
| %extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8> | |
| %14 = vector.transfer_read %extracted_slice_0[%c0, %c0], %c0_i8 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0, d1)>} : tensor<4x4xi8>, vector<4x4x4xi8> | |
| %15 = vector.transfer_read %extracted_slice_1[%c0, %c0], %c0_i8 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>} : tensor<4x4xi8>, vector<4x4x4xi8> | |
| %16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32> | |
| %17 = arith.extsi %14 : vector<4x4x4xi8> to vector<4x4x4xi32> | |
| %18 = arith.extsi %15 : vector<4x4x4xi8> to vector<4x4x4xi32> | |
| %19 = arith.muli %17, %18 : vector<4x4x4xi32> | |
| %20 = vector.multi_reduction <add>, %19, %16 [2] : vector<4x4x4xi32> to vector<4x4xi32> | |
| %21 = vector.transfer_write %20, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| scf.yield %21 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After peephole optimization --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8> | |
| %extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8> | |
| %14 = vector.transfer_read %extracted_slice_0[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : tensor<4x4xi8>, vector<4x4xi8> | |
| %15 = vector.transfer_read %extracted_slice_1[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : tensor<4x4xi8>, vector<4x4xi8> | |
| %16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32> | |
| %17 = arith.extsi %14 : vector<4x4xi8> to vector<4x4xi32> | |
| %18 = arith.extsi %15 : vector<4x4xi8> to vector<4x4xi32> | |
| %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32> | |
| %20 = vector.transfer_write %19, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| scf.yield %20 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After folding tensor extract/insert slice ops --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %14 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<4x4xi8> | |
| %15 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<4x4xi8> | |
| %16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32> | |
| %17 = arith.extsi %14 : vector<4x4xi8> to vector<4x4xi32> | |
| %18 = arith.extsi %15 : vector<4x4xi8> to vector<4x4xi32> | |
| %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32> | |
| %20 = vector.transfer_write %19, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| scf.yield %20 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After lowering multi_reduction ops --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| %13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) { | |
| %14 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<4x4xi8> | |
| %15 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<4x4xi8> | |
| %16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32> | |
| %17 = arith.extsi %14 : vector<4x4xi8> to vector<4x4xi32> | |
| %18 = arith.extsi %15 : vector<4x4xi8> to vector<4x4xi32> | |
| %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32> | |
| %20 = vector.transfer_write %19, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32> | |
| scf.yield %20 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {4, 4, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| JAKUB: #87 | |
| JAKUB: #94 | |
| JAKUB: #103 | |
| JAKUB: #109 | |
| JAKUB: #77 | |
| JAKUB: #81 | |
| JAKUB: lowerToInnerPro: 1 | |
| JAKUB: bounds: {1, 1, 4} | |
| JAKUB: nativeSize: {1, 1, 4} | |
| --- After unrolling vector --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %cst = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %14 = vector.extract_strided_slice %cst {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %16 = vector.extract_strided_slice %cst {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %18 = vector.extract_strided_slice %cst {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) { | |
| %21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32> | |
| %40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32> | |
| %41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32> | |
| %42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32> | |
| %43 = arith.extsi %28 : vector<1x4xi8> to vector<1x4xi32> | |
| %44 = vector.insert_strided_slice %43, %cst {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32> | |
| %45 = arith.extsi %30 : vector<1x4xi8> to vector<1x4xi32> | |
| %46 = vector.insert_strided_slice %45, %44 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32> | |
| %47 = arith.extsi %32 : vector<1x4xi8> to vector<1x4xi32> | |
| %48 = vector.insert_strided_slice %47, %46 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32> | |
| %49 = arith.extsi %34 : vector<1x4xi8> to vector<1x4xi32> | |
| %50 = vector.insert_strided_slice %49, %48 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32> | |
| %51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %52 = vector.extract_strided_slice %35 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %53 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %51, %52 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %54 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %55 = vector.extract_strided_slice %35 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %54, %55 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %57 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %58 = vector.extract_strided_slice %35 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %57, %58 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %60 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %61 = vector.extract_strided_slice %35 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %60, %61 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %63 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %64 = vector.extract_strided_slice %36 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %63, %64 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %66 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %67 = vector.extract_strided_slice %36 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %66, %67 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %69 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %70 = vector.extract_strided_slice %36 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %71 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %69, %70 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %72 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %73 = vector.extract_strided_slice %36 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %74 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %72, %73 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %75 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %76 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %77 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %75, %76 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %78 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %79 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %80 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %78, %79 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %81 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %82 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %83 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %81, %82 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %84 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %85 = vector.extract_strided_slice %37 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %86 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %84, %85 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %87 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %88 = vector.extract_strided_slice %38 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %89 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %87, %88 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %90 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %91 = vector.extract_strided_slice %38 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %92 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %90, %91 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %93 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %94 = vector.extract_strided_slice %38 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %95 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %93, %94 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %96 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32> | |
| %97 = vector.extract_strided_slice %38 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %98 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %96, %97 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %99 = vector.insert_strided_slice %53, %cst {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %100 = vector.insert_strided_slice %56, %99 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %101 = vector.insert_strided_slice %59, %100 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %102 = vector.insert_strided_slice %62, %101 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %103 = vector.insert_strided_slice %65, %102 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %104 = vector.insert_strided_slice %68, %103 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %105 = vector.insert_strided_slice %71, %104 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %106 = vector.insert_strided_slice %74, %105 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %107 = vector.insert_strided_slice %77, %106 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %108 = vector.insert_strided_slice %80, %107 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %109 = vector.insert_strided_slice %83, %108 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %110 = vector.insert_strided_slice %86, %109 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %111 = vector.insert_strided_slice %89, %110 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %112 = vector.insert_strided_slice %92, %111 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %113 = vector.insert_strided_slice %95, %112 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %114 = vector.insert_strided_slice %98, %113 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %115 = vector.extract_strided_slice %114 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %116 = vector.transfer_write %115, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %117 = vector.extract_strided_slice %114 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %118 = vector.transfer_write %117, %116[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %119 = vector.extract_strided_slice %114 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %120 = vector.transfer_write %119, %118[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %121 = vector.extract_strided_slice %114 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %122 = vector.transfer_write %121, %120[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| scf.yield %122 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After Jakub's extract/insert cleanup --- | |
| // Dispatch function (per its name, a 1024x1024x1024 matmul): reads two 1024x1024 i8 tensors from | |
| // bindings 0 and 1, sign-extends them to i32, accumulates products with vector.contract (kind = add), | |
| // and stores the i32 result into the 1024x1024 tensor at binding 2. | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x4xi8> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %cst_0 = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| // Bindings: %0 and %1 are the read-only i8 inputs, %2 is the write-only i32 output. | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| // Row loop: starts at workgroup_id_y * 8 and advances by workgroup_count_y * 8 up to 1024. | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| // Column loop: starts at workgroup_id_x * 32 and advances by workgroup_count_x * 32 up to 1024. | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| // %7: 8x32 output tile at [%arg0, %arg1]; every 4x4 sub-tile of it is overwritten with zeros below before accumulation. | |
| // %8: 8x1024 slice of binding 0 starting at row %arg0; %9: 1024x32 slice of binding 1 starting at column %arg1. | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| // Walk the 8x32 tile in 4x4 sub-tiles: %arg2 steps rows 0..8 by 4, %arg4 steps columns 0..32 by 4. | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| // Zero-fill the 4x4 sub-tile one 1x4 row at a time, using rows sliced from the all-zero constant %cst_0. | |
| %12 = vector.extract_strided_slice %cst_0 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %14 = vector.extract_strided_slice %cst_0 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %16 = vector.extract_strided_slice %cst_0 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %18 = vector.extract_strided_slice %cst_0 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| // Reduction loop: %arg6 steps 0..1024 by 4, carrying the 4x4 i32 accumulator tile in %arg7. | |
| %20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) { | |
| // Read four 1x4 i8 rows of %8 (rows %arg2 .. %arg2 + 3, columns starting at %arg6). | |
| %21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| // Read four 1x4 i8 rows of %9 (rows %arg6 .. %arg6 + 3, columns starting at %arg4). | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| // Read the four 1x4 i32 rows of the carried accumulator tile. | |
| %35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| // Sign-extend the four rows read from %8 from i8 to i32. | |
| %39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32> | |
| %40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32> | |
| %41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32> | |
| %42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32> | |
| // Assemble the four rows read from %9 into a single 4x4 i8 tile (%46). | |
| %43 = vector.insert_strided_slice %28, %cst {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %44 = vector.insert_strided_slice %30, %43 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %45 = vector.insert_strided_slice %32, %44 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %46 = vector.insert_strided_slice %34, %45 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| // Sixteen 1x4 by 4x1 contractions follow, one per output element (i, j). Each one slices column j of %46 | |
| // as a 4x1 vector, sign-extends it to i32, slices accumulator element (i, j) as a 1x1 vector, and contracts. | |
| // The column slice and its extension are recomputed for every row (e.g. %47, %63, %79 and %95 are the same slice). | |
| // Output row 0: left operand %39, accumulator row %35. | |
| %47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %48 = arith.extsi %47 : vector<4x1xi8> to vector<4x1xi32> | |
| %49 = vector.extract_strided_slice %35 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %48, %49 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %51 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %52 = arith.extsi %51 : vector<4x1xi8> to vector<4x1xi32> | |
| %53 = vector.extract_strided_slice %35 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %52, %53 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %55 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %56 = arith.extsi %55 : vector<4x1xi8> to vector<4x1xi32> | |
| %57 = vector.extract_strided_slice %35 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %56, %57 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %59 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %60 = arith.extsi %59 : vector<4x1xi8> to vector<4x1xi32> | |
| %61 = vector.extract_strided_slice %35 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %60, %61 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| // Output row 1: left operand %40, accumulator row %36. | |
| %63 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract_strided_slice %36 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %64, %65 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %67 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32> | |
| %69 = vector.extract_strided_slice %36 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %68, %69 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %71 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %72 = arith.extsi %71 : vector<4x1xi8> to vector<4x1xi32> | |
| %73 = vector.extract_strided_slice %36 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %74 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %72, %73 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %75 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %76 = arith.extsi %75 : vector<4x1xi8> to vector<4x1xi32> | |
| %77 = vector.extract_strided_slice %36 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %76, %77 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| // Output row 2: left operand %41, accumulator row %37. | |
| %79 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %80 = arith.extsi %79 : vector<4x1xi8> to vector<4x1xi32> | |
| %81 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %82 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %80, %81 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %83 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %84 = arith.extsi %83 : vector<4x1xi8> to vector<4x1xi32> | |
| %85 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %86 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %84, %85 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %87 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %88 = arith.extsi %87 : vector<4x1xi8> to vector<4x1xi32> | |
| %89 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %90 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %88, %89 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %91 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %92 = arith.extsi %91 : vector<4x1xi8> to vector<4x1xi32> | |
| %93 = vector.extract_strided_slice %37 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %94 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %92, %93 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| // Output row 3: left operand %42, accumulator row %38. | |
| %95 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %96 = arith.extsi %95 : vector<4x1xi8> to vector<4x1xi32> | |
| %97 = vector.extract_strided_slice %38 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %98 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %96, %97 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %99 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %100 = arith.extsi %99 : vector<4x1xi8> to vector<4x1xi32> | |
| %101 = vector.extract_strided_slice %38 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %102 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %100, %101 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %103 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %104 = arith.extsi %103 : vector<4x1xi8> to vector<4x1xi32> | |
| %105 = vector.extract_strided_slice %38 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %106 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %104, %105 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| %107 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %108 = arith.extsi %107 : vector<4x1xi8> to vector<4x1xi32> | |
| %109 = vector.extract_strided_slice %38 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32> | |
| %110 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %108, %109 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32> | |
| // Reassemble the sixteen 1x1 results into one 4x4 i32 vector (%126), starting from the zero constant %cst_0. | |
| %111 = vector.insert_strided_slice %50, %cst_0 {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %112 = vector.insert_strided_slice %54, %111 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %113 = vector.insert_strided_slice %58, %112 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %114 = vector.insert_strided_slice %62, %113 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %115 = vector.insert_strided_slice %66, %114 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %116 = vector.insert_strided_slice %70, %115 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %117 = vector.insert_strided_slice %74, %116 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %118 = vector.insert_strided_slice %78, %117 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %119 = vector.insert_strided_slice %82, %118 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %120 = vector.insert_strided_slice %86, %119 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %121 = vector.insert_strided_slice %90, %120 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %122 = vector.insert_strided_slice %94, %121 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %123 = vector.insert_strided_slice %98, %122 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %124 = vector.insert_strided_slice %102, %123 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %125 = vector.insert_strided_slice %106, %124 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %126 = vector.insert_strided_slice %110, %125 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| // Write the updated 4x4 vector back into the carried tile, one 1x4 row at a time, and yield it. | |
| %127 = vector.extract_strided_slice %126 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %128 = vector.transfer_write %127, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %129 = vector.extract_strided_slice %126 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %130 = vector.transfer_write %129, %128[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %131 = vector.extract_strided_slice %126 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %132 = vector.transfer_write %131, %130[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %133 = vector.extract_strided_slice %126 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %134 = vector.transfer_write %133, %132[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| scf.yield %134 : tensor<4x4xi32> | |
| } | |
| // Place the finished 4x4 sub-tile back into the 8x32 tile at [%arg2, %arg4]. | |
| %inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| // Store the completed 8x32 tile to binding 2 at [%arg0, %arg1]. | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After lowering size-1 reduction contract ops --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<1x1xi32> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %cst_2 = arith.constant dense<0> : vector<4x4xi8> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %cst_3 = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.extract_strided_slice %cst_3 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %14 = vector.extract_strided_slice %cst_3 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %16 = vector.extract_strided_slice %cst_3 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %18 = vector.extract_strided_slice %cst_3 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) { | |
| %21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32> | |
| %40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32> | |
| %41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32> | |
| %42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32> | |
| %43 = vector.insert_strided_slice %28, %cst_2 {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %44 = vector.insert_strided_slice %30, %43 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %45 = vector.insert_strided_slice %32, %44 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %46 = vector.insert_strided_slice %34, %45 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %48 = arith.extsi %47 : vector<4x1xi8> to vector<4x1xi32> | |
| %49 = vector.extract %39[0] : vector<1x4xi32> | |
| %50 = vector.extract %48[0, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %cst_1 [0] : i32 into vector<4xi32> | |
| %52 = vector.extract %48[1, 0] : vector<4x1xi32> | |
| %53 = vector.insert %52, %51 [1] : i32 into vector<4xi32> | |
| %54 = vector.extract %48[2, 0] : vector<4x1xi32> | |
| %55 = vector.insert %54, %53 [2] : i32 into vector<4xi32> | |
| %56 = vector.extract %48[3, 0] : vector<4x1xi32> | |
| %57 = vector.insert %56, %55 [3] : i32 into vector<4xi32> | |
| %58 = vector.extract %35[0, 0] : vector<1x4xi32> | |
| %59 = arith.muli %49, %57 : vector<4xi32> | |
| %60 = vector.reduction <add>, %59, %58 : vector<4xi32> into i32 | |
| %61 = vector.insert %60, %cst_0 [0] : i32 into vector<1xi32> | |
| %62 = vector.insert %61, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %63 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %39[0] : vector<1x4xi32> | |
| %66 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %67 = vector.insert %66, %cst_1 [0] : i32 into vector<4xi32> | |
| %68 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %69 = vector.insert %68, %67 [1] : i32 into vector<4xi32> | |
| %70 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %71 = vector.insert %70, %69 [2] : i32 into vector<4xi32> | |
| %72 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %73 = vector.insert %72, %71 [3] : i32 into vector<4xi32> | |
| %74 = vector.extract %35[0, 1] : vector<1x4xi32> | |
| %75 = arith.muli %65, %73 : vector<4xi32> | |
| %76 = vector.reduction <add>, %75, %74 : vector<4xi32> into i32 | |
| %77 = vector.insert %76, %cst_0 [0] : i32 into vector<1xi32> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %79 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %80 = arith.extsi %79 : vector<4x1xi8> to vector<4x1xi32> | |
| %81 = vector.extract %39[0] : vector<1x4xi32> | |
| %82 = vector.extract %80[0, 0] : vector<4x1xi32> | |
| %83 = vector.insert %82, %cst_1 [0] : i32 into vector<4xi32> | |
| %84 = vector.extract %80[1, 0] : vector<4x1xi32> | |
| %85 = vector.insert %84, %83 [1] : i32 into vector<4xi32> | |
| %86 = vector.extract %80[2, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %85 [2] : i32 into vector<4xi32> | |
| %88 = vector.extract %80[3, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [3] : i32 into vector<4xi32> | |
| %90 = vector.extract %35[0, 2] : vector<1x4xi32> | |
| %91 = arith.muli %81, %89 : vector<4xi32> | |
| %92 = vector.reduction <add>, %91, %90 : vector<4xi32> into i32 | |
| %93 = vector.insert %92, %cst_0 [0] : i32 into vector<1xi32> | |
| %94 = vector.insert %93, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %95 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %96 = arith.extsi %95 : vector<4x1xi8> to vector<4x1xi32> | |
| %97 = vector.extract %39[0] : vector<1x4xi32> | |
| %98 = vector.extract %96[0, 0] : vector<4x1xi32> | |
| %99 = vector.insert %98, %cst_1 [0] : i32 into vector<4xi32> | |
| %100 = vector.extract %96[1, 0] : vector<4x1xi32> | |
| %101 = vector.insert %100, %99 [1] : i32 into vector<4xi32> | |
| %102 = vector.extract %96[2, 0] : vector<4x1xi32> | |
| %103 = vector.insert %102, %101 [2] : i32 into vector<4xi32> | |
| %104 = vector.extract %96[3, 0] : vector<4x1xi32> | |
| %105 = vector.insert %104, %103 [3] : i32 into vector<4xi32> | |
| %106 = vector.extract %35[0, 3] : vector<1x4xi32> | |
| %107 = arith.muli %97, %105 : vector<4xi32> | |
| %108 = vector.reduction <add>, %107, %106 : vector<4xi32> into i32 | |
| %109 = vector.insert %108, %cst_0 [0] : i32 into vector<1xi32> | |
| %110 = vector.insert %109, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %111 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %112 = arith.extsi %111 : vector<4x1xi8> to vector<4x1xi32> | |
| %113 = vector.extract %40[0] : vector<1x4xi32> | |
| %114 = vector.extract %112[0, 0] : vector<4x1xi32> | |
| %115 = vector.insert %114, %cst_1 [0] : i32 into vector<4xi32> | |
| %116 = vector.extract %112[1, 0] : vector<4x1xi32> | |
| %117 = vector.insert %116, %115 [1] : i32 into vector<4xi32> | |
| %118 = vector.extract %112[2, 0] : vector<4x1xi32> | |
| %119 = vector.insert %118, %117 [2] : i32 into vector<4xi32> | |
| %120 = vector.extract %112[3, 0] : vector<4x1xi32> | |
| %121 = vector.insert %120, %119 [3] : i32 into vector<4xi32> | |
| %122 = vector.extract %36[0, 0] : vector<1x4xi32> | |
| %123 = arith.muli %113, %121 : vector<4xi32> | |
| %124 = vector.reduction <add>, %123, %122 : vector<4xi32> into i32 | |
| %125 = vector.insert %124, %cst_0 [0] : i32 into vector<1xi32> | |
| %126 = vector.insert %125, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %127 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %128 = arith.extsi %127 : vector<4x1xi8> to vector<4x1xi32> | |
| %129 = vector.extract %40[0] : vector<1x4xi32> | |
| %130 = vector.extract %128[0, 0] : vector<4x1xi32> | |
| %131 = vector.insert %130, %cst_1 [0] : i32 into vector<4xi32> | |
| %132 = vector.extract %128[1, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %131 [1] : i32 into vector<4xi32> | |
| %134 = vector.extract %128[2, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [2] : i32 into vector<4xi32> | |
| %136 = vector.extract %128[3, 0] : vector<4x1xi32> | |
| %137 = vector.insert %136, %135 [3] : i32 into vector<4xi32> | |
| %138 = vector.extract %36[0, 1] : vector<1x4xi32> | |
| %139 = arith.muli %129, %137 : vector<4xi32> | |
| %140 = vector.reduction <add>, %139, %138 : vector<4xi32> into i32 | |
| %141 = vector.insert %140, %cst_0 [0] : i32 into vector<1xi32> | |
| %142 = vector.insert %141, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %143 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %144 = arith.extsi %143 : vector<4x1xi8> to vector<4x1xi32> | |
| %145 = vector.extract %40[0] : vector<1x4xi32> | |
| %146 = vector.extract %144[0, 0] : vector<4x1xi32> | |
| %147 = vector.insert %146, %cst_1 [0] : i32 into vector<4xi32> | |
| %148 = vector.extract %144[1, 0] : vector<4x1xi32> | |
| %149 = vector.insert %148, %147 [1] : i32 into vector<4xi32> | |
| %150 = vector.extract %144[2, 0] : vector<4x1xi32> | |
| %151 = vector.insert %150, %149 [2] : i32 into vector<4xi32> | |
| %152 = vector.extract %144[3, 0] : vector<4x1xi32> | |
| %153 = vector.insert %152, %151 [3] : i32 into vector<4xi32> | |
| %154 = vector.extract %36[0, 2] : vector<1x4xi32> | |
| %155 = arith.muli %145, %153 : vector<4xi32> | |
| %156 = vector.reduction <add>, %155, %154 : vector<4xi32> into i32 | |
| %157 = vector.insert %156, %cst_0 [0] : i32 into vector<1xi32> | |
| %158 = vector.insert %157, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %159 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %160 = arith.extsi %159 : vector<4x1xi8> to vector<4x1xi32> | |
| %161 = vector.extract %40[0] : vector<1x4xi32> | |
| %162 = vector.extract %160[0, 0] : vector<4x1xi32> | |
| %163 = vector.insert %162, %cst_1 [0] : i32 into vector<4xi32> | |
| %164 = vector.extract %160[1, 0] : vector<4x1xi32> | |
| %165 = vector.insert %164, %163 [1] : i32 into vector<4xi32> | |
| %166 = vector.extract %160[2, 0] : vector<4x1xi32> | |
| %167 = vector.insert %166, %165 [2] : i32 into vector<4xi32> | |
| %168 = vector.extract %160[3, 0] : vector<4x1xi32> | |
| %169 = vector.insert %168, %167 [3] : i32 into vector<4xi32> | |
| %170 = vector.extract %36[0, 3] : vector<1x4xi32> | |
| %171 = arith.muli %161, %169 : vector<4xi32> | |
| %172 = vector.reduction <add>, %171, %170 : vector<4xi32> into i32 | |
| %173 = vector.insert %172, %cst_0 [0] : i32 into vector<1xi32> | |
| %174 = vector.insert %173, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %175 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %176 = arith.extsi %175 : vector<4x1xi8> to vector<4x1xi32> | |
| %177 = vector.extract %41[0] : vector<1x4xi32> | |
| %178 = vector.extract %176[0, 0] : vector<4x1xi32> | |
| %179 = vector.insert %178, %cst_1 [0] : i32 into vector<4xi32> | |
| %180 = vector.extract %176[1, 0] : vector<4x1xi32> | |
| %181 = vector.insert %180, %179 [1] : i32 into vector<4xi32> | |
| %182 = vector.extract %176[2, 0] : vector<4x1xi32> | |
| %183 = vector.insert %182, %181 [2] : i32 into vector<4xi32> | |
| %184 = vector.extract %176[3, 0] : vector<4x1xi32> | |
| %185 = vector.insert %184, %183 [3] : i32 into vector<4xi32> | |
| %186 = vector.extract %37[0, 0] : vector<1x4xi32> | |
| %187 = arith.muli %177, %185 : vector<4xi32> | |
| %188 = vector.reduction <add>, %187, %186 : vector<4xi32> into i32 | |
| %189 = vector.insert %188, %cst_0 [0] : i32 into vector<1xi32> | |
| %190 = vector.insert %189, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %191 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %192 = arith.extsi %191 : vector<4x1xi8> to vector<4x1xi32> | |
| %193 = vector.extract %41[0] : vector<1x4xi32> | |
| %194 = vector.extract %192[0, 0] : vector<4x1xi32> | |
| %195 = vector.insert %194, %cst_1 [0] : i32 into vector<4xi32> | |
| %196 = vector.extract %192[1, 0] : vector<4x1xi32> | |
| %197 = vector.insert %196, %195 [1] : i32 into vector<4xi32> | |
| %198 = vector.extract %192[2, 0] : vector<4x1xi32> | |
| %199 = vector.insert %198, %197 [2] : i32 into vector<4xi32> | |
| %200 = vector.extract %192[3, 0] : vector<4x1xi32> | |
| %201 = vector.insert %200, %199 [3] : i32 into vector<4xi32> | |
| %202 = vector.extract %37[0, 1] : vector<1x4xi32> | |
| %203 = arith.muli %193, %201 : vector<4xi32> | |
| %204 = vector.reduction <add>, %203, %202 : vector<4xi32> into i32 | |
| %205 = vector.insert %204, %cst_0 [0] : i32 into vector<1xi32> | |
| %206 = vector.insert %205, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %207 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %208 = arith.extsi %207 : vector<4x1xi8> to vector<4x1xi32> | |
| %209 = vector.extract %41[0] : vector<1x4xi32> | |
| %210 = vector.extract %208[0, 0] : vector<4x1xi32> | |
| %211 = vector.insert %210, %cst_1 [0] : i32 into vector<4xi32> | |
| %212 = vector.extract %208[1, 0] : vector<4x1xi32> | |
| %213 = vector.insert %212, %211 [1] : i32 into vector<4xi32> | |
| %214 = vector.extract %208[2, 0] : vector<4x1xi32> | |
| %215 = vector.insert %214, %213 [2] : i32 into vector<4xi32> | |
| %216 = vector.extract %208[3, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %215 [3] : i32 into vector<4xi32> | |
| %218 = vector.extract %37[0, 2] : vector<1x4xi32> | |
| %219 = arith.muli %209, %217 : vector<4xi32> | |
| %220 = vector.reduction <add>, %219, %218 : vector<4xi32> into i32 | |
| %221 = vector.insert %220, %cst_0 [0] : i32 into vector<1xi32> | |
| %222 = vector.insert %221, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %223 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %224 = arith.extsi %223 : vector<4x1xi8> to vector<4x1xi32> | |
| %225 = vector.extract %41[0] : vector<1x4xi32> | |
| %226 = vector.extract %224[0, 0] : vector<4x1xi32> | |
| %227 = vector.insert %226, %cst_1 [0] : i32 into vector<4xi32> | |
| %228 = vector.extract %224[1, 0] : vector<4x1xi32> | |
| %229 = vector.insert %228, %227 [1] : i32 into vector<4xi32> | |
| %230 = vector.extract %224[2, 0] : vector<4x1xi32> | |
| %231 = vector.insert %230, %229 [2] : i32 into vector<4xi32> | |
| %232 = vector.extract %224[3, 0] : vector<4x1xi32> | |
| %233 = vector.insert %232, %231 [3] : i32 into vector<4xi32> | |
| %234 = vector.extract %37[0, 3] : vector<1x4xi32> | |
| %235 = arith.muli %225, %233 : vector<4xi32> | |
| %236 = vector.reduction <add>, %235, %234 : vector<4xi32> into i32 | |
| %237 = vector.insert %236, %cst_0 [0] : i32 into vector<1xi32> | |
| %238 = vector.insert %237, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %239 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %240 = arith.extsi %239 : vector<4x1xi8> to vector<4x1xi32> | |
| %241 = vector.extract %42[0] : vector<1x4xi32> | |
| %242 = vector.extract %240[0, 0] : vector<4x1xi32> | |
| %243 = vector.insert %242, %cst_1 [0] : i32 into vector<4xi32> | |
| %244 = vector.extract %240[1, 0] : vector<4x1xi32> | |
| %245 = vector.insert %244, %243 [1] : i32 into vector<4xi32> | |
| %246 = vector.extract %240[2, 0] : vector<4x1xi32> | |
| %247 = vector.insert %246, %245 [2] : i32 into vector<4xi32> | |
| %248 = vector.extract %240[3, 0] : vector<4x1xi32> | |
| %249 = vector.insert %248, %247 [3] : i32 into vector<4xi32> | |
| %250 = vector.extract %38[0, 0] : vector<1x4xi32> | |
| %251 = arith.muli %241, %249 : vector<4xi32> | |
| %252 = vector.reduction <add>, %251, %250 : vector<4xi32> into i32 | |
| %253 = vector.insert %252, %cst_0 [0] : i32 into vector<1xi32> | |
| %254 = vector.insert %253, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %255 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %256 = arith.extsi %255 : vector<4x1xi8> to vector<4x1xi32> | |
| %257 = vector.extract %42[0] : vector<1x4xi32> | |
| %258 = vector.extract %256[0, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32> | |
| %260 = vector.extract %256[1, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [1] : i32 into vector<4xi32> | |
| %262 = vector.extract %256[2, 0] : vector<4x1xi32> | |
| %263 = vector.insert %262, %261 [2] : i32 into vector<4xi32> | |
| %264 = vector.extract %256[3, 0] : vector<4x1xi32> | |
| %265 = vector.insert %264, %263 [3] : i32 into vector<4xi32> | |
| %266 = vector.extract %38[0, 1] : vector<1x4xi32> | |
| %267 = arith.muli %257, %265 : vector<4xi32> | |
| %268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32 | |
| %269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32> | |
| %270 = vector.insert %269, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %271 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %272 = arith.extsi %271 : vector<4x1xi8> to vector<4x1xi32> | |
| %273 = vector.extract %42[0] : vector<1x4xi32> | |
| %274 = vector.extract %272[0, 0] : vector<4x1xi32> | |
| %275 = vector.insert %274, %cst_1 [0] : i32 into vector<4xi32> | |
| %276 = vector.extract %272[1, 0] : vector<4x1xi32> | |
| %277 = vector.insert %276, %275 [1] : i32 into vector<4xi32> | |
| %278 = vector.extract %272[2, 0] : vector<4x1xi32> | |
| %279 = vector.insert %278, %277 [2] : i32 into vector<4xi32> | |
| %280 = vector.extract %272[3, 0] : vector<4x1xi32> | |
| %281 = vector.insert %280, %279 [3] : i32 into vector<4xi32> | |
| %282 = vector.extract %38[0, 2] : vector<1x4xi32> | |
| %283 = arith.muli %273, %281 : vector<4xi32> | |
| %284 = vector.reduction <add>, %283, %282 : vector<4xi32> into i32 | |
| %285 = vector.insert %284, %cst_0 [0] : i32 into vector<1xi32> | |
| %286 = vector.insert %285, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %287 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %288 = arith.extsi %287 : vector<4x1xi8> to vector<4x1xi32> | |
| %289 = vector.extract %42[0] : vector<1x4xi32> | |
| %290 = vector.extract %288[0, 0] : vector<4x1xi32> | |
| %291 = vector.insert %290, %cst_1 [0] : i32 into vector<4xi32> | |
| %292 = vector.extract %288[1, 0] : vector<4x1xi32> | |
| %293 = vector.insert %292, %291 [1] : i32 into vector<4xi32> | |
| %294 = vector.extract %288[2, 0] : vector<4x1xi32> | |
| %295 = vector.insert %294, %293 [2] : i32 into vector<4xi32> | |
| %296 = vector.extract %288[3, 0] : vector<4x1xi32> | |
| %297 = vector.insert %296, %295 [3] : i32 into vector<4xi32> | |
| %298 = vector.extract %38[0, 3] : vector<1x4xi32> | |
| %299 = arith.muli %289, %297 : vector<4xi32> | |
| %300 = vector.reduction <add>, %299, %298 : vector<4xi32> into i32 | |
| %301 = vector.insert %300, %cst_0 [0] : i32 into vector<1xi32> | |
| %302 = vector.insert %301, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %303 = vector.insert_strided_slice %62, %cst_3 {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %304 = vector.insert_strided_slice %78, %303 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %305 = vector.insert_strided_slice %94, %304 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %306 = vector.insert_strided_slice %110, %305 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %307 = vector.insert_strided_slice %126, %306 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %308 = vector.insert_strided_slice %142, %307 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %309 = vector.insert_strided_slice %158, %308 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %310 = vector.insert_strided_slice %174, %309 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %311 = vector.insert_strided_slice %190, %310 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %312 = vector.insert_strided_slice %206, %311 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %313 = vector.insert_strided_slice %222, %312 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %314 = vector.insert_strided_slice %238, %313 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %315 = vector.insert_strided_slice %254, %314 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %316 = vector.insert_strided_slice %270, %315 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %317 = vector.insert_strided_slice %286, %316 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %318 = vector.insert_strided_slice %302, %317 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %319 = vector.extract_strided_slice %318 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %320 = vector.transfer_write %319, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %321 = vector.extract_strided_slice %318 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %322 = vector.transfer_write %321, %320[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %323 = vector.extract_strided_slice %318 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %324 = vector.transfer_write %323, %322[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %325 = vector.extract_strided_slice %318 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %326 = vector.transfer_write %325, %324[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| scf.yield %326 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After lowering transpose ops --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<1x1xi32> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %cst_2 = arith.constant dense<0> : vector<4x4xi8> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %cst_3 = arith.constant dense<0> : vector<4x4xi32> | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.extract_strided_slice %cst_3 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %14 = vector.extract_strided_slice %cst_3 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %16 = vector.extract_strided_slice %cst_3 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %18 = vector.extract_strided_slice %cst_3 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) { | |
| %21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8> | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8> | |
| %35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32> | |
| %39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32> | |
| %40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32> | |
| %41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32> | |
| %42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32> | |
| %43 = vector.insert_strided_slice %28, %cst_2 {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %44 = vector.insert_strided_slice %30, %43 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %45 = vector.insert_strided_slice %32, %44 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %46 = vector.insert_strided_slice %34, %45 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8> | |
| %47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %48 = arith.extsi %47 : vector<4x1xi8> to vector<4x1xi32> | |
| %49 = vector.extract %39[0] : vector<1x4xi32> | |
| %50 = vector.extract %48[0, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %cst_1 [0] : i32 into vector<4xi32> | |
| %52 = vector.extract %48[1, 0] : vector<4x1xi32> | |
| %53 = vector.insert %52, %51 [1] : i32 into vector<4xi32> | |
| %54 = vector.extract %48[2, 0] : vector<4x1xi32> | |
| %55 = vector.insert %54, %53 [2] : i32 into vector<4xi32> | |
| %56 = vector.extract %48[3, 0] : vector<4x1xi32> | |
| %57 = vector.insert %56, %55 [3] : i32 into vector<4xi32> | |
| %58 = vector.extract %35[0, 0] : vector<1x4xi32> | |
| %59 = arith.muli %49, %57 : vector<4xi32> | |
| %60 = vector.reduction <add>, %59, %58 : vector<4xi32> into i32 | |
| %61 = vector.insert %60, %cst_0 [0] : i32 into vector<1xi32> | |
| %62 = vector.insert %61, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %63 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %39[0] : vector<1x4xi32> | |
| %66 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %67 = vector.insert %66, %cst_1 [0] : i32 into vector<4xi32> | |
| %68 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %69 = vector.insert %68, %67 [1] : i32 into vector<4xi32> | |
| %70 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %71 = vector.insert %70, %69 [2] : i32 into vector<4xi32> | |
| %72 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %73 = vector.insert %72, %71 [3] : i32 into vector<4xi32> | |
| %74 = vector.extract %35[0, 1] : vector<1x4xi32> | |
| %75 = arith.muli %65, %73 : vector<4xi32> | |
| %76 = vector.reduction <add>, %75, %74 : vector<4xi32> into i32 | |
| %77 = vector.insert %76, %cst_0 [0] : i32 into vector<1xi32> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %79 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %80 = arith.extsi %79 : vector<4x1xi8> to vector<4x1xi32> | |
| %81 = vector.extract %39[0] : vector<1x4xi32> | |
| %82 = vector.extract %80[0, 0] : vector<4x1xi32> | |
| %83 = vector.insert %82, %cst_1 [0] : i32 into vector<4xi32> | |
| %84 = vector.extract %80[1, 0] : vector<4x1xi32> | |
| %85 = vector.insert %84, %83 [1] : i32 into vector<4xi32> | |
| %86 = vector.extract %80[2, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %85 [2] : i32 into vector<4xi32> | |
| %88 = vector.extract %80[3, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [3] : i32 into vector<4xi32> | |
| %90 = vector.extract %35[0, 2] : vector<1x4xi32> | |
| %91 = arith.muli %81, %89 : vector<4xi32> | |
| %92 = vector.reduction <add>, %91, %90 : vector<4xi32> into i32 | |
| %93 = vector.insert %92, %cst_0 [0] : i32 into vector<1xi32> | |
| %94 = vector.insert %93, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %95 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %96 = arith.extsi %95 : vector<4x1xi8> to vector<4x1xi32> | |
| %97 = vector.extract %39[0] : vector<1x4xi32> | |
| %98 = vector.extract %96[0, 0] : vector<4x1xi32> | |
| %99 = vector.insert %98, %cst_1 [0] : i32 into vector<4xi32> | |
| %100 = vector.extract %96[1, 0] : vector<4x1xi32> | |
| %101 = vector.insert %100, %99 [1] : i32 into vector<4xi32> | |
| %102 = vector.extract %96[2, 0] : vector<4x1xi32> | |
| %103 = vector.insert %102, %101 [2] : i32 into vector<4xi32> | |
| %104 = vector.extract %96[3, 0] : vector<4x1xi32> | |
| %105 = vector.insert %104, %103 [3] : i32 into vector<4xi32> | |
| %106 = vector.extract %35[0, 3] : vector<1x4xi32> | |
| %107 = arith.muli %97, %105 : vector<4xi32> | |
| %108 = vector.reduction <add>, %107, %106 : vector<4xi32> into i32 | |
| %109 = vector.insert %108, %cst_0 [0] : i32 into vector<1xi32> | |
| %110 = vector.insert %109, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %111 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %112 = arith.extsi %111 : vector<4x1xi8> to vector<4x1xi32> | |
| %113 = vector.extract %40[0] : vector<1x4xi32> | |
| %114 = vector.extract %112[0, 0] : vector<4x1xi32> | |
| %115 = vector.insert %114, %cst_1 [0] : i32 into vector<4xi32> | |
| %116 = vector.extract %112[1, 0] : vector<4x1xi32> | |
| %117 = vector.insert %116, %115 [1] : i32 into vector<4xi32> | |
| %118 = vector.extract %112[2, 0] : vector<4x1xi32> | |
| %119 = vector.insert %118, %117 [2] : i32 into vector<4xi32> | |
| %120 = vector.extract %112[3, 0] : vector<4x1xi32> | |
| %121 = vector.insert %120, %119 [3] : i32 into vector<4xi32> | |
| %122 = vector.extract %36[0, 0] : vector<1x4xi32> | |
| %123 = arith.muli %113, %121 : vector<4xi32> | |
| %124 = vector.reduction <add>, %123, %122 : vector<4xi32> into i32 | |
| %125 = vector.insert %124, %cst_0 [0] : i32 into vector<1xi32> | |
| %126 = vector.insert %125, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %127 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %128 = arith.extsi %127 : vector<4x1xi8> to vector<4x1xi32> | |
| %129 = vector.extract %40[0] : vector<1x4xi32> | |
| %130 = vector.extract %128[0, 0] : vector<4x1xi32> | |
| %131 = vector.insert %130, %cst_1 [0] : i32 into vector<4xi32> | |
| %132 = vector.extract %128[1, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %131 [1] : i32 into vector<4xi32> | |
| %134 = vector.extract %128[2, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [2] : i32 into vector<4xi32> | |
| %136 = vector.extract %128[3, 0] : vector<4x1xi32> | |
| %137 = vector.insert %136, %135 [3] : i32 into vector<4xi32> | |
| %138 = vector.extract %36[0, 1] : vector<1x4xi32> | |
| %139 = arith.muli %129, %137 : vector<4xi32> | |
| %140 = vector.reduction <add>, %139, %138 : vector<4xi32> into i32 | |
| %141 = vector.insert %140, %cst_0 [0] : i32 into vector<1xi32> | |
| %142 = vector.insert %141, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %143 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %144 = arith.extsi %143 : vector<4x1xi8> to vector<4x1xi32> | |
| %145 = vector.extract %40[0] : vector<1x4xi32> | |
| %146 = vector.extract %144[0, 0] : vector<4x1xi32> | |
| %147 = vector.insert %146, %cst_1 [0] : i32 into vector<4xi32> | |
| %148 = vector.extract %144[1, 0] : vector<4x1xi32> | |
| %149 = vector.insert %148, %147 [1] : i32 into vector<4xi32> | |
| %150 = vector.extract %144[2, 0] : vector<4x1xi32> | |
| %151 = vector.insert %150, %149 [2] : i32 into vector<4xi32> | |
| %152 = vector.extract %144[3, 0] : vector<4x1xi32> | |
| %153 = vector.insert %152, %151 [3] : i32 into vector<4xi32> | |
| %154 = vector.extract %36[0, 2] : vector<1x4xi32> | |
| %155 = arith.muli %145, %153 : vector<4xi32> | |
| %156 = vector.reduction <add>, %155, %154 : vector<4xi32> into i32 | |
| %157 = vector.insert %156, %cst_0 [0] : i32 into vector<1xi32> | |
| %158 = vector.insert %157, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %159 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %160 = arith.extsi %159 : vector<4x1xi8> to vector<4x1xi32> | |
| %161 = vector.extract %40[0] : vector<1x4xi32> | |
| %162 = vector.extract %160[0, 0] : vector<4x1xi32> | |
| %163 = vector.insert %162, %cst_1 [0] : i32 into vector<4xi32> | |
| %164 = vector.extract %160[1, 0] : vector<4x1xi32> | |
| %165 = vector.insert %164, %163 [1] : i32 into vector<4xi32> | |
| %166 = vector.extract %160[2, 0] : vector<4x1xi32> | |
| %167 = vector.insert %166, %165 [2] : i32 into vector<4xi32> | |
| %168 = vector.extract %160[3, 0] : vector<4x1xi32> | |
| %169 = vector.insert %168, %167 [3] : i32 into vector<4xi32> | |
| %170 = vector.extract %36[0, 3] : vector<1x4xi32> | |
| %171 = arith.muli %161, %169 : vector<4xi32> | |
| %172 = vector.reduction <add>, %171, %170 : vector<4xi32> into i32 | |
| %173 = vector.insert %172, %cst_0 [0] : i32 into vector<1xi32> | |
| %174 = vector.insert %173, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %175 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %176 = arith.extsi %175 : vector<4x1xi8> to vector<4x1xi32> | |
| %177 = vector.extract %41[0] : vector<1x4xi32> | |
| %178 = vector.extract %176[0, 0] : vector<4x1xi32> | |
| %179 = vector.insert %178, %cst_1 [0] : i32 into vector<4xi32> | |
| %180 = vector.extract %176[1, 0] : vector<4x1xi32> | |
| %181 = vector.insert %180, %179 [1] : i32 into vector<4xi32> | |
| %182 = vector.extract %176[2, 0] : vector<4x1xi32> | |
| %183 = vector.insert %182, %181 [2] : i32 into vector<4xi32> | |
| %184 = vector.extract %176[3, 0] : vector<4x1xi32> | |
| %185 = vector.insert %184, %183 [3] : i32 into vector<4xi32> | |
| %186 = vector.extract %37[0, 0] : vector<1x4xi32> | |
| %187 = arith.muli %177, %185 : vector<4xi32> | |
| %188 = vector.reduction <add>, %187, %186 : vector<4xi32> into i32 | |
| %189 = vector.insert %188, %cst_0 [0] : i32 into vector<1xi32> | |
| %190 = vector.insert %189, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %191 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %192 = arith.extsi %191 : vector<4x1xi8> to vector<4x1xi32> | |
| %193 = vector.extract %41[0] : vector<1x4xi32> | |
| %194 = vector.extract %192[0, 0] : vector<4x1xi32> | |
| %195 = vector.insert %194, %cst_1 [0] : i32 into vector<4xi32> | |
| %196 = vector.extract %192[1, 0] : vector<4x1xi32> | |
| %197 = vector.insert %196, %195 [1] : i32 into vector<4xi32> | |
| %198 = vector.extract %192[2, 0] : vector<4x1xi32> | |
| %199 = vector.insert %198, %197 [2] : i32 into vector<4xi32> | |
| %200 = vector.extract %192[3, 0] : vector<4x1xi32> | |
| %201 = vector.insert %200, %199 [3] : i32 into vector<4xi32> | |
| %202 = vector.extract %37[0, 1] : vector<1x4xi32> | |
| %203 = arith.muli %193, %201 : vector<4xi32> | |
| %204 = vector.reduction <add>, %203, %202 : vector<4xi32> into i32 | |
| %205 = vector.insert %204, %cst_0 [0] : i32 into vector<1xi32> | |
| %206 = vector.insert %205, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %207 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %208 = arith.extsi %207 : vector<4x1xi8> to vector<4x1xi32> | |
| %209 = vector.extract %41[0] : vector<1x4xi32> | |
| %210 = vector.extract %208[0, 0] : vector<4x1xi32> | |
| %211 = vector.insert %210, %cst_1 [0] : i32 into vector<4xi32> | |
| %212 = vector.extract %208[1, 0] : vector<4x1xi32> | |
| %213 = vector.insert %212, %211 [1] : i32 into vector<4xi32> | |
| %214 = vector.extract %208[2, 0] : vector<4x1xi32> | |
| %215 = vector.insert %214, %213 [2] : i32 into vector<4xi32> | |
| %216 = vector.extract %208[3, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %215 [3] : i32 into vector<4xi32> | |
| %218 = vector.extract %37[0, 2] : vector<1x4xi32> | |
| %219 = arith.muli %209, %217 : vector<4xi32> | |
| %220 = vector.reduction <add>, %219, %218 : vector<4xi32> into i32 | |
| %221 = vector.insert %220, %cst_0 [0] : i32 into vector<1xi32> | |
| %222 = vector.insert %221, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %223 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %224 = arith.extsi %223 : vector<4x1xi8> to vector<4x1xi32> | |
| %225 = vector.extract %41[0] : vector<1x4xi32> | |
| %226 = vector.extract %224[0, 0] : vector<4x1xi32> | |
| %227 = vector.insert %226, %cst_1 [0] : i32 into vector<4xi32> | |
| %228 = vector.extract %224[1, 0] : vector<4x1xi32> | |
| %229 = vector.insert %228, %227 [1] : i32 into vector<4xi32> | |
| %230 = vector.extract %224[2, 0] : vector<4x1xi32> | |
| %231 = vector.insert %230, %229 [2] : i32 into vector<4xi32> | |
| %232 = vector.extract %224[3, 0] : vector<4x1xi32> | |
| %233 = vector.insert %232, %231 [3] : i32 into vector<4xi32> | |
| %234 = vector.extract %37[0, 3] : vector<1x4xi32> | |
| %235 = arith.muli %225, %233 : vector<4xi32> | |
| %236 = vector.reduction <add>, %235, %234 : vector<4xi32> into i32 | |
| %237 = vector.insert %236, %cst_0 [0] : i32 into vector<1xi32> | |
| %238 = vector.insert %237, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %239 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %240 = arith.extsi %239 : vector<4x1xi8> to vector<4x1xi32> | |
| %241 = vector.extract %42[0] : vector<1x4xi32> | |
| %242 = vector.extract %240[0, 0] : vector<4x1xi32> | |
| %243 = vector.insert %242, %cst_1 [0] : i32 into vector<4xi32> | |
| %244 = vector.extract %240[1, 0] : vector<4x1xi32> | |
| %245 = vector.insert %244, %243 [1] : i32 into vector<4xi32> | |
| %246 = vector.extract %240[2, 0] : vector<4x1xi32> | |
| %247 = vector.insert %246, %245 [2] : i32 into vector<4xi32> | |
| %248 = vector.extract %240[3, 0] : vector<4x1xi32> | |
| %249 = vector.insert %248, %247 [3] : i32 into vector<4xi32> | |
| %250 = vector.extract %38[0, 0] : vector<1x4xi32> | |
| %251 = arith.muli %241, %249 : vector<4xi32> | |
| %252 = vector.reduction <add>, %251, %250 : vector<4xi32> into i32 | |
| %253 = vector.insert %252, %cst_0 [0] : i32 into vector<1xi32> | |
| %254 = vector.insert %253, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %255 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %256 = arith.extsi %255 : vector<4x1xi8> to vector<4x1xi32> | |
| %257 = vector.extract %42[0] : vector<1x4xi32> | |
| %258 = vector.extract %256[0, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32> | |
| %260 = vector.extract %256[1, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [1] : i32 into vector<4xi32> | |
| %262 = vector.extract %256[2, 0] : vector<4x1xi32> | |
| %263 = vector.insert %262, %261 [2] : i32 into vector<4xi32> | |
| %264 = vector.extract %256[3, 0] : vector<4x1xi32> | |
| %265 = vector.insert %264, %263 [3] : i32 into vector<4xi32> | |
| %266 = vector.extract %38[0, 1] : vector<1x4xi32> | |
| %267 = arith.muli %257, %265 : vector<4xi32> | |
| %268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32 | |
| %269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32> | |
| %270 = vector.insert %269, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %271 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %272 = arith.extsi %271 : vector<4x1xi8> to vector<4x1xi32> | |
| %273 = vector.extract %42[0] : vector<1x4xi32> | |
| %274 = vector.extract %272[0, 0] : vector<4x1xi32> | |
| %275 = vector.insert %274, %cst_1 [0] : i32 into vector<4xi32> | |
| %276 = vector.extract %272[1, 0] : vector<4x1xi32> | |
| %277 = vector.insert %276, %275 [1] : i32 into vector<4xi32> | |
| %278 = vector.extract %272[2, 0] : vector<4x1xi32> | |
| %279 = vector.insert %278, %277 [2] : i32 into vector<4xi32> | |
| %280 = vector.extract %272[3, 0] : vector<4x1xi32> | |
| %281 = vector.insert %280, %279 [3] : i32 into vector<4xi32> | |
| %282 = vector.extract %38[0, 2] : vector<1x4xi32> | |
| %283 = arith.muli %273, %281 : vector<4xi32> | |
| %284 = vector.reduction <add>, %283, %282 : vector<4xi32> into i32 | |
| %285 = vector.insert %284, %cst_0 [0] : i32 into vector<1xi32> | |
| %286 = vector.insert %285, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %287 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8> | |
| %288 = arith.extsi %287 : vector<4x1xi8> to vector<4x1xi32> | |
| %289 = vector.extract %42[0] : vector<1x4xi32> | |
| %290 = vector.extract %288[0, 0] : vector<4x1xi32> | |
| %291 = vector.insert %290, %cst_1 [0] : i32 into vector<4xi32> | |
| %292 = vector.extract %288[1, 0] : vector<4x1xi32> | |
| %293 = vector.insert %292, %291 [1] : i32 into vector<4xi32> | |
| %294 = vector.extract %288[2, 0] : vector<4x1xi32> | |
| %295 = vector.insert %294, %293 [2] : i32 into vector<4xi32> | |
| %296 = vector.extract %288[3, 0] : vector<4x1xi32> | |
| %297 = vector.insert %296, %295 [3] : i32 into vector<4xi32> | |
| %298 = vector.extract %38[0, 3] : vector<1x4xi32> | |
| %299 = arith.muli %289, %297 : vector<4xi32> | |
| %300 = vector.reduction <add>, %299, %298 : vector<4xi32> into i32 | |
| %301 = vector.insert %300, %cst_0 [0] : i32 into vector<1xi32> | |
| %302 = vector.insert %301, %cst [0] : vector<1xi32> into vector<1x1xi32> | |
| %303 = vector.insert_strided_slice %62, %cst_3 {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %304 = vector.insert_strided_slice %78, %303 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %305 = vector.insert_strided_slice %94, %304 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %306 = vector.insert_strided_slice %110, %305 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %307 = vector.insert_strided_slice %126, %306 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %308 = vector.insert_strided_slice %142, %307 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %309 = vector.insert_strided_slice %158, %308 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %310 = vector.insert_strided_slice %174, %309 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %311 = vector.insert_strided_slice %190, %310 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %312 = vector.insert_strided_slice %206, %311 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %313 = vector.insert_strided_slice %222, %312 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %314 = vector.insert_strided_slice %238, %313 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %315 = vector.insert_strided_slice %254, %314 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %316 = vector.insert_strided_slice %270, %315 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %317 = vector.insert_strided_slice %286, %316 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %318 = vector.insert_strided_slice %302, %317 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32> | |
| %319 = vector.extract_strided_slice %318 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %320 = vector.transfer_write %319, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %321 = vector.extract_strided_slice %318 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %322 = vector.transfer_write %321, %320[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %323 = vector.extract_strided_slice %318 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %324 = vector.transfer_write %323, %322[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| %325 = vector.extract_strided_slice %318 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32> | |
| %326 = vector.transfer_write %325, %324[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32> | |
| scf.yield %326 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After trimming leading unit dims --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %c0_i32 = arith.constant 0 : i32 | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) { | |
| %11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %12 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %13 = vector.transfer_write %cst_1, %12[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %14 = vector.transfer_write %cst_1, %13[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %15 = vector.transfer_write %cst_1, %14[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %16 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %15) -> (tensor<4x4xi32>) { | |
| %17 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %18 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %19 = vector.transfer_read %8[%18, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %20 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %21 = vector.transfer_read %8[%20, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32> | |
| %32 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32> | |
| %33 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32> | |
| %34 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32> | |
| %35 = arith.extsi %17 : vector<4xi8> to vector<4xi32> | |
| %36 = arith.extsi %19 : vector<4xi8> to vector<4xi32> | |
| %37 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %38 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %39 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8> | |
| %43 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8> | |
| %45 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8> | |
| %47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32> | |
| %48 = vector.extract %47[0, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32> | |
| %50 = vector.extract %47[1, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [1] : i32 into vector<4xi32> | |
| %52 = vector.extract %47[2, 0] : vector<4x1xi32> | |
| %53 = vector.insert %52, %51 [2] : i32 into vector<4xi32> | |
| %54 = vector.extract %47[3, 0] : vector<4x1xi32> | |
| %55 = vector.insert %54, %53 [3] : i32 into vector<4xi32> | |
| %56 = vector.extract %31[0] : vector<4xi32> | |
| %57 = arith.muli %35, %55 : vector<4xi32> | |
| %58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32 | |
| %59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32> | |
| %60 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8> | |
| %64 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8> | |
| %66 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8> | |
| %68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32> | |
| %69 = vector.extract %68[0, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32> | |
| %71 = vector.extract %68[1, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [1] : i32 into vector<4xi32> | |
| %73 = vector.extract %68[2, 0] : vector<4x1xi32> | |
| %74 = vector.insert %73, %72 [2] : i32 into vector<4xi32> | |
| %75 = vector.extract %68[3, 0] : vector<4x1xi32> | |
| %76 = vector.insert %75, %74 [3] : i32 into vector<4xi32> | |
| %77 = vector.extract %31[1] : vector<4xi32> | |
| %78 = arith.muli %35, %76 : vector<4xi32> | |
| %79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32 | |
| %80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32> | |
| %81 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8> | |
| %85 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8> | |
| %87 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8> | |
| %89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32> | |
| %90 = vector.extract %89[0, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32> | |
| %92 = vector.extract %89[1, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [1] : i32 into vector<4xi32> | |
| %94 = vector.extract %89[2, 0] : vector<4x1xi32> | |
| %95 = vector.insert %94, %93 [2] : i32 into vector<4xi32> | |
| %96 = vector.extract %89[3, 0] : vector<4x1xi32> | |
| %97 = vector.insert %96, %95 [3] : i32 into vector<4xi32> | |
| %98 = vector.extract %31[2] : vector<4xi32> | |
| %99 = arith.muli %35, %97 : vector<4xi32> | |
| %100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32 | |
| %101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32> | |
| %102 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8> | |
| %106 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8> | |
| %108 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8> | |
| %110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32> | |
| %111 = vector.extract %110[0, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32> | |
| %113 = vector.extract %110[1, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [1] : i32 into vector<4xi32> | |
| %115 = vector.extract %110[2, 0] : vector<4x1xi32> | |
| %116 = vector.insert %115, %114 [2] : i32 into vector<4xi32> | |
| %117 = vector.extract %110[3, 0] : vector<4x1xi32> | |
| %118 = vector.insert %117, %116 [3] : i32 into vector<4xi32> | |
| %119 = vector.extract %31[3] : vector<4xi32> | |
| %120 = arith.muli %35, %118 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8> | |
| %127 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8> | |
| %129 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8> | |
| %131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32> | |
| %132 = vector.extract %131[0, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32> | |
| %134 = vector.extract %131[1, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [1] : i32 into vector<4xi32> | |
| %136 = vector.extract %131[2, 0] : vector<4x1xi32> | |
| %137 = vector.insert %136, %135 [2] : i32 into vector<4xi32> | |
| %138 = vector.extract %131[3, 0] : vector<4x1xi32> | |
| %139 = vector.insert %138, %137 [3] : i32 into vector<4xi32> | |
| %140 = vector.extract %32[0] : vector<4xi32> | |
| %141 = arith.muli %36, %139 : vector<4xi32> | |
| %142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32 | |
| %143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32> | |
| %144 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8> | |
| %148 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8> | |
| %150 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8> | |
| %152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32> | |
| %153 = vector.extract %152[0, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32> | |
| %155 = vector.extract %152[1, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [1] : i32 into vector<4xi32> | |
| %157 = vector.extract %152[2, 0] : vector<4x1xi32> | |
| %158 = vector.insert %157, %156 [2] : i32 into vector<4xi32> | |
| %159 = vector.extract %152[3, 0] : vector<4x1xi32> | |
| %160 = vector.insert %159, %158 [3] : i32 into vector<4xi32> | |
| %161 = vector.extract %32[1] : vector<4xi32> | |
| %162 = arith.muli %36, %160 : vector<4xi32> | |
| %163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32 | |
| %164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32> | |
| %165 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8> | |
| %169 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8> | |
| %171 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8> | |
| %173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32> | |
| %174 = vector.extract %173[0, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32> | |
| %176 = vector.extract %173[1, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [1] : i32 into vector<4xi32> | |
| %178 = vector.extract %173[2, 0] : vector<4x1xi32> | |
| %179 = vector.insert %178, %177 [2] : i32 into vector<4xi32> | |
| %180 = vector.extract %173[3, 0] : vector<4x1xi32> | |
| %181 = vector.insert %180, %179 [3] : i32 into vector<4xi32> | |
| %182 = vector.extract %32[2] : vector<4xi32> | |
| %183 = arith.muli %36, %181 : vector<4xi32> | |
| %184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32 | |
| %185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32> | |
| %186 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8> | |
| %190 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8> | |
| %192 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8> | |
| %194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32> | |
| %195 = vector.extract %194[0, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32> | |
| %197 = vector.extract %194[1, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [1] : i32 into vector<4xi32> | |
| %199 = vector.extract %194[2, 0] : vector<4x1xi32> | |
| %200 = vector.insert %199, %198 [2] : i32 into vector<4xi32> | |
| %201 = vector.extract %194[3, 0] : vector<4x1xi32> | |
| %202 = vector.insert %201, %200 [3] : i32 into vector<4xi32> | |
| %203 = vector.extract %32[3] : vector<4xi32> | |
| %204 = arith.muli %36, %202 : vector<4xi32> | |
| %205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32 | |
| %206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32> | |
| %207 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8> | |
| %211 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8> | |
| %213 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8> | |
| %215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32> | |
| %216 = vector.extract %215[0, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32> | |
| %218 = vector.extract %215[1, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [1] : i32 into vector<4xi32> | |
| %220 = vector.extract %215[2, 0] : vector<4x1xi32> | |
| %221 = vector.insert %220, %219 [2] : i32 into vector<4xi32> | |
| %222 = vector.extract %215[3, 0] : vector<4x1xi32> | |
| %223 = vector.insert %222, %221 [3] : i32 into vector<4xi32> | |
| %224 = vector.extract %33[0] : vector<4xi32> | |
| %225 = arith.muli %37, %223 : vector<4xi32> | |
| %226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32 | |
| %227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32> | |
| %228 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8> | |
| %232 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8> | |
| %234 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8> | |
| %236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32> | |
| %237 = vector.extract %236[0, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32> | |
| %239 = vector.extract %236[1, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [1] : i32 into vector<4xi32> | |
| %241 = vector.extract %236[2, 0] : vector<4x1xi32> | |
| %242 = vector.insert %241, %240 [2] : i32 into vector<4xi32> | |
| %243 = vector.extract %236[3, 0] : vector<4x1xi32> | |
| %244 = vector.insert %243, %242 [3] : i32 into vector<4xi32> | |
| %245 = vector.extract %33[1] : vector<4xi32> | |
| %246 = arith.muli %37, %244 : vector<4xi32> | |
| %247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32 | |
| %248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32> | |
| %249 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8> | |
| %253 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8> | |
| %255 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8> | |
| %257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32> | |
| %258 = vector.extract %257[0, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32> | |
| %260 = vector.extract %257[1, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [1] : i32 into vector<4xi32> | |
| %262 = vector.extract %257[2, 0] : vector<4x1xi32> | |
| %263 = vector.insert %262, %261 [2] : i32 into vector<4xi32> | |
| %264 = vector.extract %257[3, 0] : vector<4x1xi32> | |
| %265 = vector.insert %264, %263 [3] : i32 into vector<4xi32> | |
| %266 = vector.extract %33[2] : vector<4xi32> | |
| %267 = arith.muli %37, %265 : vector<4xi32> | |
| %268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32 | |
| %269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32> | |
| %270 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8> | |
| %274 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8> | |
| %276 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8> | |
| %278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32> | |
| %279 = vector.extract %278[0, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32> | |
| %281 = vector.extract %278[1, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [1] : i32 into vector<4xi32> | |
| %283 = vector.extract %278[2, 0] : vector<4x1xi32> | |
| %284 = vector.insert %283, %282 [2] : i32 into vector<4xi32> | |
| %285 = vector.extract %278[3, 0] : vector<4x1xi32> | |
| %286 = vector.insert %285, %284 [3] : i32 into vector<4xi32> | |
| %287 = vector.extract %33[3] : vector<4xi32> | |
| %288 = arith.muli %37, %286 : vector<4xi32> | |
| %289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32 | |
| %290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32> | |
| %291 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8> | |
| %295 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8> | |
| %297 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8> | |
| %299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32> | |
| %300 = vector.extract %299[0, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32> | |
| %302 = vector.extract %299[1, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [1] : i32 into vector<4xi32> | |
| %304 = vector.extract %299[2, 0] : vector<4x1xi32> | |
| %305 = vector.insert %304, %303 [2] : i32 into vector<4xi32> | |
| %306 = vector.extract %299[3, 0] : vector<4x1xi32> | |
| %307 = vector.insert %306, %305 [3] : i32 into vector<4xi32> | |
| %308 = vector.extract %34[0] : vector<4xi32> | |
| %309 = arith.muli %38, %307 : vector<4xi32> | |
| %310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32 | |
| %311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32> | |
| %312 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8> | |
| %316 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8> | |
| %318 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8> | |
| %320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32> | |
| %321 = vector.extract %320[0, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32> | |
| %323 = vector.extract %320[1, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [1] : i32 into vector<4xi32> | |
| %325 = vector.extract %320[2, 0] : vector<4x1xi32> | |
| %326 = vector.insert %325, %324 [2] : i32 into vector<4xi32> | |
| %327 = vector.extract %320[3, 0] : vector<4x1xi32> | |
| %328 = vector.insert %327, %326 [3] : i32 into vector<4xi32> | |
| %329 = vector.extract %34[1] : vector<4xi32> | |
| %330 = arith.muli %38, %328 : vector<4xi32> | |
| %331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32 | |
| %332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32> | |
| %333 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8> | |
| %337 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8> | |
| %339 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8> | |
| %341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32> | |
| %342 = vector.extract %341[0, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32> | |
| %344 = vector.extract %341[1, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [1] : i32 into vector<4xi32> | |
| %346 = vector.extract %341[2, 0] : vector<4x1xi32> | |
| %347 = vector.insert %346, %345 [2] : i32 into vector<4xi32> | |
| %348 = vector.extract %341[3, 0] : vector<4x1xi32> | |
| %349 = vector.insert %348, %347 [3] : i32 into vector<4xi32> | |
| %350 = vector.extract %34[2] : vector<4xi32> | |
| %351 = arith.muli %38, %349 : vector<4xi32> | |
| %352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32 | |
| %353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32> | |
| %354 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8> | |
| %358 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8> | |
| %360 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8> | |
| %362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32> | |
| %363 = vector.extract %362[0, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32> | |
| %365 = vector.extract %362[1, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [1] : i32 into vector<4xi32> | |
| %367 = vector.extract %362[2, 0] : vector<4x1xi32> | |
| %368 = vector.insert %367, %366 [2] : i32 into vector<4xi32> | |
| %369 = vector.extract %362[3, 0] : vector<4x1xi32> | |
| %370 = vector.insert %369, %368 [3] : i32 into vector<4xi32> | |
| %371 = vector.extract %34[3] : vector<4xi32> | |
| %372 = arith.muli %38, %370 : vector<4xi32> | |
| %373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32 | |
| %374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32> | |
| %375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %391 = vector.transfer_write %378, %arg7[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %392 = vector.transfer_write %382, %391[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %393 = vector.transfer_write %386, %392[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %394 = vector.transfer_write %390, %393[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| scf.yield %394 : tensor<4x4xi32> | |
| } | |
| %inserted_slice = tensor.insert_slice %16 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %11 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After hoisting transfers --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %16 = vector.transfer_write %cst_1, %15[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %cst_1, %16[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %cst_1, %17[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %24 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %25 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %26 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %27 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %35 = arith.extsi %24 : vector<4xi8> to vector<4xi32> | |
| %36 = arith.extsi %25 : vector<4xi8> to vector<4xi32> | |
| %37 = arith.extsi %26 : vector<4xi8> to vector<4xi32> | |
| %38 = arith.extsi %27 : vector<4xi8> to vector<4xi32> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8> | |
| %43 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8> | |
| %45 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8> | |
| %47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32> | |
| %48 = vector.extract %47[0, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32> | |
| %50 = vector.extract %47[1, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [1] : i32 into vector<4xi32> | |
| %52 = vector.extract %47[2, 0] : vector<4x1xi32> | |
| %53 = vector.insert %52, %51 [2] : i32 into vector<4xi32> | |
| %54 = vector.extract %47[3, 0] : vector<4x1xi32> | |
| %55 = vector.insert %54, %53 [3] : i32 into vector<4xi32> | |
| %56 = vector.extract %arg10[0] : vector<4xi32> | |
| %57 = arith.muli %35, %55 : vector<4xi32> | |
| %58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32 | |
| %59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8> | |
| %64 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8> | |
| %66 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8> | |
| %68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32> | |
| %69 = vector.extract %68[0, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32> | |
| %71 = vector.extract %68[1, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [1] : i32 into vector<4xi32> | |
| %73 = vector.extract %68[2, 0] : vector<4x1xi32> | |
| %74 = vector.insert %73, %72 [2] : i32 into vector<4xi32> | |
| %75 = vector.extract %68[3, 0] : vector<4x1xi32> | |
| %76 = vector.insert %75, %74 [3] : i32 into vector<4xi32> | |
| %77 = vector.extract %arg10[1] : vector<4xi32> | |
| %78 = arith.muli %35, %76 : vector<4xi32> | |
| %79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32 | |
| %80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8> | |
| %85 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8> | |
| %87 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8> | |
| %89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32> | |
| %90 = vector.extract %89[0, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32> | |
| %92 = vector.extract %89[1, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [1] : i32 into vector<4xi32> | |
| %94 = vector.extract %89[2, 0] : vector<4x1xi32> | |
| %95 = vector.insert %94, %93 [2] : i32 into vector<4xi32> | |
| %96 = vector.extract %89[3, 0] : vector<4x1xi32> | |
| %97 = vector.insert %96, %95 [3] : i32 into vector<4xi32> | |
| %98 = vector.extract %arg10[2] : vector<4xi32> | |
| %99 = arith.muli %35, %97 : vector<4xi32> | |
| %100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32 | |
| %101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8> | |
| %106 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8> | |
| %108 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8> | |
| %110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32> | |
| %111 = vector.extract %110[0, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32> | |
| %113 = vector.extract %110[1, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [1] : i32 into vector<4xi32> | |
| %115 = vector.extract %110[2, 0] : vector<4x1xi32> | |
| %116 = vector.insert %115, %114 [2] : i32 into vector<4xi32> | |
| %117 = vector.extract %110[3, 0] : vector<4x1xi32> | |
| %118 = vector.insert %117, %116 [3] : i32 into vector<4xi32> | |
| %119 = vector.extract %arg10[3] : vector<4xi32> | |
| %120 = arith.muli %35, %118 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8> | |
| %127 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8> | |
| %129 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8> | |
| %131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32> | |
| %132 = vector.extract %131[0, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32> | |
| %134 = vector.extract %131[1, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [1] : i32 into vector<4xi32> | |
| %136 = vector.extract %131[2, 0] : vector<4x1xi32> | |
| %137 = vector.insert %136, %135 [2] : i32 into vector<4xi32> | |
| %138 = vector.extract %131[3, 0] : vector<4x1xi32> | |
| %139 = vector.insert %138, %137 [3] : i32 into vector<4xi32> | |
| %140 = vector.extract %arg9[0] : vector<4xi32> | |
| %141 = arith.muli %36, %139 : vector<4xi32> | |
| %142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32 | |
| %143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32> | |
| %144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8> | |
| %148 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8> | |
| %150 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8> | |
| %152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32> | |
| %153 = vector.extract %152[0, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32> | |
| %155 = vector.extract %152[1, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [1] : i32 into vector<4xi32> | |
| %157 = vector.extract %152[2, 0] : vector<4x1xi32> | |
| %158 = vector.insert %157, %156 [2] : i32 into vector<4xi32> | |
| %159 = vector.extract %152[3, 0] : vector<4x1xi32> | |
| %160 = vector.insert %159, %158 [3] : i32 into vector<4xi32> | |
| %161 = vector.extract %arg9[1] : vector<4xi32> | |
| %162 = arith.muli %36, %160 : vector<4xi32> | |
| %163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32 | |
| %164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32> | |
| %165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8> | |
| %169 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8> | |
| %171 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8> | |
| %173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32> | |
| %174 = vector.extract %173[0, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32> | |
| %176 = vector.extract %173[1, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [1] : i32 into vector<4xi32> | |
| %178 = vector.extract %173[2, 0] : vector<4x1xi32> | |
| %179 = vector.insert %178, %177 [2] : i32 into vector<4xi32> | |
| %180 = vector.extract %173[3, 0] : vector<4x1xi32> | |
| %181 = vector.insert %180, %179 [3] : i32 into vector<4xi32> | |
| %182 = vector.extract %arg9[2] : vector<4xi32> | |
| %183 = arith.muli %36, %181 : vector<4xi32> | |
| %184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32 | |
| %185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32> | |
| %186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8> | |
| %190 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8> | |
| %192 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8> | |
| %194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32> | |
| %195 = vector.extract %194[0, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32> | |
| %197 = vector.extract %194[1, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [1] : i32 into vector<4xi32> | |
| %199 = vector.extract %194[2, 0] : vector<4x1xi32> | |
| %200 = vector.insert %199, %198 [2] : i32 into vector<4xi32> | |
| %201 = vector.extract %194[3, 0] : vector<4x1xi32> | |
| %202 = vector.insert %201, %200 [3] : i32 into vector<4xi32> | |
| %203 = vector.extract %arg9[3] : vector<4xi32> | |
| %204 = arith.muli %36, %202 : vector<4xi32> | |
| %205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32 | |
| %206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32> | |
| %207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8> | |
| %211 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8> | |
| %213 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8> | |
| %215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32> | |
| %216 = vector.extract %215[0, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32> | |
| %218 = vector.extract %215[1, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [1] : i32 into vector<4xi32> | |
| %220 = vector.extract %215[2, 0] : vector<4x1xi32> | |
| %221 = vector.insert %220, %219 [2] : i32 into vector<4xi32> | |
| %222 = vector.extract %215[3, 0] : vector<4x1xi32> | |
| %223 = vector.insert %222, %221 [3] : i32 into vector<4xi32> | |
| %224 = vector.extract %arg8[0] : vector<4xi32> | |
| %225 = arith.muli %37, %223 : vector<4xi32> | |
| %226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32 | |
| %227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32> | |
| %228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8> | |
| %232 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8> | |
| %234 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8> | |
| %236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32> | |
| %237 = vector.extract %236[0, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32> | |
| %239 = vector.extract %236[1, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [1] : i32 into vector<4xi32> | |
| %241 = vector.extract %236[2, 0] : vector<4x1xi32> | |
| %242 = vector.insert %241, %240 [2] : i32 into vector<4xi32> | |
| %243 = vector.extract %236[3, 0] : vector<4x1xi32> | |
| %244 = vector.insert %243, %242 [3] : i32 into vector<4xi32> | |
| %245 = vector.extract %arg8[1] : vector<4xi32> | |
| %246 = arith.muli %37, %244 : vector<4xi32> | |
| %247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32 | |
| %248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32> | |
| %249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8> | |
| %253 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8> | |
| %255 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8> | |
| %257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32> | |
| %258 = vector.extract %257[0, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32> | |
| %260 = vector.extract %257[1, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [1] : i32 into vector<4xi32> | |
| %262 = vector.extract %257[2, 0] : vector<4x1xi32> | |
| %263 = vector.insert %262, %261 [2] : i32 into vector<4xi32> | |
| %264 = vector.extract %257[3, 0] : vector<4x1xi32> | |
| %265 = vector.insert %264, %263 [3] : i32 into vector<4xi32> | |
| %266 = vector.extract %arg8[2] : vector<4xi32> | |
| %267 = arith.muli %37, %265 : vector<4xi32> | |
| %268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32 | |
| %269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32> | |
| %270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8> | |
| %274 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8> | |
| %276 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8> | |
| %278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32> | |
| %279 = vector.extract %278[0, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32> | |
| %281 = vector.extract %278[1, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [1] : i32 into vector<4xi32> | |
| %283 = vector.extract %278[2, 0] : vector<4x1xi32> | |
| %284 = vector.insert %283, %282 [2] : i32 into vector<4xi32> | |
| %285 = vector.extract %278[3, 0] : vector<4x1xi32> | |
| %286 = vector.insert %285, %284 [3] : i32 into vector<4xi32> | |
| %287 = vector.extract %arg8[3] : vector<4xi32> | |
| %288 = arith.muli %37, %286 : vector<4xi32> | |
| %289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32 | |
| %290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32> | |
| %291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8> | |
| %295 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8> | |
| %297 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8> | |
| %299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32> | |
| %300 = vector.extract %299[0, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32> | |
| %302 = vector.extract %299[1, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [1] : i32 into vector<4xi32> | |
| %304 = vector.extract %299[2, 0] : vector<4x1xi32> | |
| %305 = vector.insert %304, %303 [2] : i32 into vector<4xi32> | |
| %306 = vector.extract %299[3, 0] : vector<4x1xi32> | |
| %307 = vector.insert %306, %305 [3] : i32 into vector<4xi32> | |
| %308 = vector.extract %arg7[0] : vector<4xi32> | |
| %309 = arith.muli %38, %307 : vector<4xi32> | |
| %310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32 | |
| %311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32> | |
| %312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8> | |
| %316 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8> | |
| %318 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8> | |
| %320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32> | |
| %321 = vector.extract %320[0, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32> | |
| %323 = vector.extract %320[1, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [1] : i32 into vector<4xi32> | |
| %325 = vector.extract %320[2, 0] : vector<4x1xi32> | |
| %326 = vector.insert %325, %324 [2] : i32 into vector<4xi32> | |
| %327 = vector.extract %320[3, 0] : vector<4x1xi32> | |
| %328 = vector.insert %327, %326 [3] : i32 into vector<4xi32> | |
| %329 = vector.extract %arg7[1] : vector<4xi32> | |
| %330 = arith.muli %38, %328 : vector<4xi32> | |
| %331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32 | |
| %332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32> | |
| %333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8> | |
| %337 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8> | |
| %339 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8> | |
| %341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32> | |
| %342 = vector.extract %341[0, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32> | |
| %344 = vector.extract %341[1, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [1] : i32 into vector<4xi32> | |
| %346 = vector.extract %341[2, 0] : vector<4x1xi32> | |
| %347 = vector.insert %346, %345 [2] : i32 into vector<4xi32> | |
| %348 = vector.extract %341[3, 0] : vector<4x1xi32> | |
| %349 = vector.insert %348, %347 [3] : i32 into vector<4xi32> | |
| %350 = vector.extract %arg7[2] : vector<4xi32> | |
| %351 = arith.muli %38, %349 : vector<4xi32> | |
| %352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32 | |
| %353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32> | |
| %354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8> | |
| %358 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8> | |
| %360 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8> | |
| %362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32> | |
| %363 = vector.extract %362[0, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32> | |
| %365 = vector.extract %362[1, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [1] : i32 into vector<4xi32> | |
| %367 = vector.extract %362[2, 0] : vector<4x1xi32> | |
| %368 = vector.insert %367, %366 [2] : i32 into vector<4xi32> | |
| %369 = vector.extract %362[3, 0] : vector<4x1xi32> | |
| %370 = vector.insert %369, %368 [3] : i32 into vector<4xi32> | |
| %371 = vector.extract %arg7[3] : vector<4xi32> | |
| %372 = arith.muli %38, %370 : vector<4xi32> | |
| %373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32 | |
| %374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32> | |
| %375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %390, %386, %382, %378 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %20 = vector.transfer_write %19#3, %18[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %21 = vector.transfer_write %19#2, %20[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %22 = vector.transfer_write %19#1, %21[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %23 = vector.transfer_write %19#0, %22[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After lowering transfer ops --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %16 = vector.transfer_write %cst_1, %15[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %cst_1, %16[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %cst_1, %17[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %24 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %25 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %26 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %27 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %35 = arith.extsi %24 : vector<4xi8> to vector<4xi32> | |
| %36 = arith.extsi %25 : vector<4xi8> to vector<4xi32> | |
| %37 = arith.extsi %26 : vector<4xi8> to vector<4xi32> | |
| %38 = arith.extsi %27 : vector<4xi8> to vector<4xi32> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8> | |
| %43 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8> | |
| %45 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8> | |
| %47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32> | |
| %48 = vector.extract %47[0, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32> | |
| %50 = vector.extract %47[1, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [1] : i32 into vector<4xi32> | |
| %52 = vector.extract %47[2, 0] : vector<4x1xi32> | |
| %53 = vector.insert %52, %51 [2] : i32 into vector<4xi32> | |
| %54 = vector.extract %47[3, 0] : vector<4x1xi32> | |
| %55 = vector.insert %54, %53 [3] : i32 into vector<4xi32> | |
| %56 = vector.extract %arg10[0] : vector<4xi32> | |
| %57 = arith.muli %35, %55 : vector<4xi32> | |
| %58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32 | |
| %59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8> | |
| %64 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8> | |
| %66 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8> | |
| %68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32> | |
| %69 = vector.extract %68[0, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32> | |
| %71 = vector.extract %68[1, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [1] : i32 into vector<4xi32> | |
| %73 = vector.extract %68[2, 0] : vector<4x1xi32> | |
| %74 = vector.insert %73, %72 [2] : i32 into vector<4xi32> | |
| %75 = vector.extract %68[3, 0] : vector<4x1xi32> | |
| %76 = vector.insert %75, %74 [3] : i32 into vector<4xi32> | |
| %77 = vector.extract %arg10[1] : vector<4xi32> | |
| %78 = arith.muli %35, %76 : vector<4xi32> | |
| %79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32 | |
| %80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8> | |
| %85 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8> | |
| %87 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8> | |
| %89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32> | |
| %90 = vector.extract %89[0, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32> | |
| %92 = vector.extract %89[1, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [1] : i32 into vector<4xi32> | |
| %94 = vector.extract %89[2, 0] : vector<4x1xi32> | |
| %95 = vector.insert %94, %93 [2] : i32 into vector<4xi32> | |
| %96 = vector.extract %89[3, 0] : vector<4x1xi32> | |
| %97 = vector.insert %96, %95 [3] : i32 into vector<4xi32> | |
| %98 = vector.extract %arg10[2] : vector<4xi32> | |
| %99 = arith.muli %35, %97 : vector<4xi32> | |
| %100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32 | |
| %101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8> | |
| %106 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8> | |
| %108 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8> | |
| %110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32> | |
| %111 = vector.extract %110[0, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32> | |
| %113 = vector.extract %110[1, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [1] : i32 into vector<4xi32> | |
| %115 = vector.extract %110[2, 0] : vector<4x1xi32> | |
| %116 = vector.insert %115, %114 [2] : i32 into vector<4xi32> | |
| %117 = vector.extract %110[3, 0] : vector<4x1xi32> | |
| %118 = vector.insert %117, %116 [3] : i32 into vector<4xi32> | |
| %119 = vector.extract %arg10[3] : vector<4xi32> | |
| %120 = arith.muli %35, %118 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8> | |
| %127 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8> | |
| %129 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8> | |
| %131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32> | |
| %132 = vector.extract %131[0, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32> | |
| %134 = vector.extract %131[1, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [1] : i32 into vector<4xi32> | |
| %136 = vector.extract %131[2, 0] : vector<4x1xi32> | |
| %137 = vector.insert %136, %135 [2] : i32 into vector<4xi32> | |
| %138 = vector.extract %131[3, 0] : vector<4x1xi32> | |
| %139 = vector.insert %138, %137 [3] : i32 into vector<4xi32> | |
| %140 = vector.extract %arg9[0] : vector<4xi32> | |
| %141 = arith.muli %36, %139 : vector<4xi32> | |
| %142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32 | |
| %143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32> | |
| %144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8> | |
| %148 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8> | |
| %150 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8> | |
| %152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32> | |
| %153 = vector.extract %152[0, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32> | |
| %155 = vector.extract %152[1, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [1] : i32 into vector<4xi32> | |
| %157 = vector.extract %152[2, 0] : vector<4x1xi32> | |
| %158 = vector.insert %157, %156 [2] : i32 into vector<4xi32> | |
| %159 = vector.extract %152[3, 0] : vector<4x1xi32> | |
| %160 = vector.insert %159, %158 [3] : i32 into vector<4xi32> | |
| %161 = vector.extract %arg9[1] : vector<4xi32> | |
| %162 = arith.muli %36, %160 : vector<4xi32> | |
| %163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32 | |
| %164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32> | |
| %165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8> | |
| %169 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8> | |
| %171 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8> | |
| %173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32> | |
| %174 = vector.extract %173[0, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32> | |
| %176 = vector.extract %173[1, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [1] : i32 into vector<4xi32> | |
| %178 = vector.extract %173[2, 0] : vector<4x1xi32> | |
| %179 = vector.insert %178, %177 [2] : i32 into vector<4xi32> | |
| %180 = vector.extract %173[3, 0] : vector<4x1xi32> | |
| %181 = vector.insert %180, %179 [3] : i32 into vector<4xi32> | |
| %182 = vector.extract %arg9[2] : vector<4xi32> | |
| %183 = arith.muli %36, %181 : vector<4xi32> | |
| %184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32 | |
| %185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32> | |
| %186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8> | |
| %190 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8> | |
| %192 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8> | |
| %194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32> | |
| %195 = vector.extract %194[0, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32> | |
| %197 = vector.extract %194[1, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [1] : i32 into vector<4xi32> | |
| %199 = vector.extract %194[2, 0] : vector<4x1xi32> | |
| %200 = vector.insert %199, %198 [2] : i32 into vector<4xi32> | |
| %201 = vector.extract %194[3, 0] : vector<4x1xi32> | |
| %202 = vector.insert %201, %200 [3] : i32 into vector<4xi32> | |
| %203 = vector.extract %arg9[3] : vector<4xi32> | |
| %204 = arith.muli %36, %202 : vector<4xi32> | |
| %205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32 | |
| %206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32> | |
| %207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8> | |
| %211 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8> | |
| %213 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8> | |
| %215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32> | |
| %216 = vector.extract %215[0, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32> | |
| %218 = vector.extract %215[1, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [1] : i32 into vector<4xi32> | |
| %220 = vector.extract %215[2, 0] : vector<4x1xi32> | |
| %221 = vector.insert %220, %219 [2] : i32 into vector<4xi32> | |
| %222 = vector.extract %215[3, 0] : vector<4x1xi32> | |
| %223 = vector.insert %222, %221 [3] : i32 into vector<4xi32> | |
| %224 = vector.extract %arg8[0] : vector<4xi32> | |
| %225 = arith.muli %37, %223 : vector<4xi32> | |
| %226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32 | |
| %227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32> | |
| %228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8> | |
| %232 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8> | |
| %234 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8> | |
| %236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32> | |
| %237 = vector.extract %236[0, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32> | |
| %239 = vector.extract %236[1, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [1] : i32 into vector<4xi32> | |
| %241 = vector.extract %236[2, 0] : vector<4x1xi32> | |
| %242 = vector.insert %241, %240 [2] : i32 into vector<4xi32> | |
| %243 = vector.extract %236[3, 0] : vector<4x1xi32> | |
| %244 = vector.insert %243, %242 [3] : i32 into vector<4xi32> | |
| %245 = vector.extract %arg8[1] : vector<4xi32> | |
| %246 = arith.muli %37, %244 : vector<4xi32> | |
| %247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32 | |
| %248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32> | |
| %249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8> | |
| %253 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8> | |
| %255 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8> | |
| %257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32> | |
| %258 = vector.extract %257[0, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32> | |
| %260 = vector.extract %257[1, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [1] : i32 into vector<4xi32> | |
| %262 = vector.extract %257[2, 0] : vector<4x1xi32> | |
| %263 = vector.insert %262, %261 [2] : i32 into vector<4xi32> | |
| %264 = vector.extract %257[3, 0] : vector<4x1xi32> | |
| %265 = vector.insert %264, %263 [3] : i32 into vector<4xi32> | |
| %266 = vector.extract %arg8[2] : vector<4xi32> | |
| %267 = arith.muli %37, %265 : vector<4xi32> | |
| %268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32 | |
| %269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32> | |
| %270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8> | |
| %274 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8> | |
| %276 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8> | |
| %278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32> | |
| %279 = vector.extract %278[0, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32> | |
| %281 = vector.extract %278[1, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [1] : i32 into vector<4xi32> | |
| %283 = vector.extract %278[2, 0] : vector<4x1xi32> | |
| %284 = vector.insert %283, %282 [2] : i32 into vector<4xi32> | |
| %285 = vector.extract %278[3, 0] : vector<4x1xi32> | |
| %286 = vector.insert %285, %284 [3] : i32 into vector<4xi32> | |
| %287 = vector.extract %arg8[3] : vector<4xi32> | |
| %288 = arith.muli %37, %286 : vector<4xi32> | |
| %289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32 | |
| %290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32> | |
| %291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8> | |
| %295 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8> | |
| %297 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8> | |
| %299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32> | |
| %300 = vector.extract %299[0, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32> | |
| %302 = vector.extract %299[1, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [1] : i32 into vector<4xi32> | |
| %304 = vector.extract %299[2, 0] : vector<4x1xi32> | |
| %305 = vector.insert %304, %303 [2] : i32 into vector<4xi32> | |
| %306 = vector.extract %299[3, 0] : vector<4x1xi32> | |
| %307 = vector.insert %306, %305 [3] : i32 into vector<4xi32> | |
| %308 = vector.extract %arg7[0] : vector<4xi32> | |
| %309 = arith.muli %38, %307 : vector<4xi32> | |
| %310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32 | |
| %311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32> | |
| %312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8> | |
| %316 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8> | |
| %318 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8> | |
| %320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32> | |
| %321 = vector.extract %320[0, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32> | |
| %323 = vector.extract %320[1, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [1] : i32 into vector<4xi32> | |
| %325 = vector.extract %320[2, 0] : vector<4x1xi32> | |
| %326 = vector.insert %325, %324 [2] : i32 into vector<4xi32> | |
| %327 = vector.extract %320[3, 0] : vector<4x1xi32> | |
| %328 = vector.insert %327, %326 [3] : i32 into vector<4xi32> | |
| %329 = vector.extract %arg7[1] : vector<4xi32> | |
| %330 = arith.muli %38, %328 : vector<4xi32> | |
| %331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32 | |
| %332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32> | |
| %333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8> | |
| %337 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8> | |
| %339 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8> | |
| %341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32> | |
| %342 = vector.extract %341[0, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32> | |
| %344 = vector.extract %341[1, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [1] : i32 into vector<4xi32> | |
| %346 = vector.extract %341[2, 0] : vector<4x1xi32> | |
| %347 = vector.insert %346, %345 [2] : i32 into vector<4xi32> | |
| %348 = vector.extract %341[3, 0] : vector<4x1xi32> | |
| %349 = vector.insert %348, %347 [3] : i32 into vector<4xi32> | |
| %350 = vector.extract %arg7[2] : vector<4xi32> | |
| %351 = arith.muli %38, %349 : vector<4xi32> | |
| %352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32 | |
| %353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32> | |
| %354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8> | |
| %358 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8> | |
| %360 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8> | |
| %362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32> | |
| %363 = vector.extract %362[0, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32> | |
| %365 = vector.extract %362[1, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [1] : i32 into vector<4xi32> | |
| %367 = vector.extract %362[2, 0] : vector<4x1xi32> | |
| %368 = vector.insert %367, %366 [2] : i32 into vector<4xi32> | |
| %369 = vector.extract %362[3, 0] : vector<4x1xi32> | |
| %370 = vector.insert %369, %368 [3] : i32 into vector<4xi32> | |
| %371 = vector.extract %arg7[3] : vector<4xi32> | |
| %372 = arith.muli %38, %370 : vector<4xi32> | |
| %373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32 | |
| %374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32> | |
| %375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %390, %386, %382, %378 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %20 = vector.transfer_write %19#3, %18[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %21 = vector.transfer_write %19#2, %20[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %22 = vector.transfer_write %19#1, %21[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %23 = vector.transfer_write %19#0, %22[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| --- After lowering various vector ops --- | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %16 = vector.transfer_write %cst_1, %15[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %cst_1, %16[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %cst_1, %17[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %24 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %25 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %26 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %27 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %35 = arith.extsi %24 : vector<4xi8> to vector<4xi32> | |
| %36 = arith.extsi %25 : vector<4xi8> to vector<4xi32> | |
| %37 = arith.extsi %26 : vector<4xi8> to vector<4xi32> | |
| %38 = arith.extsi %27 : vector<4xi8> to vector<4xi32> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8> | |
| %43 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8> | |
| %45 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8> | |
| %47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32> | |
| %48 = vector.extract %47[0, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32> | |
| %50 = vector.extract %47[1, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [1] : i32 into vector<4xi32> | |
| %52 = vector.extract %47[2, 0] : vector<4x1xi32> | |
| %53 = vector.insert %52, %51 [2] : i32 into vector<4xi32> | |
| %54 = vector.extract %47[3, 0] : vector<4x1xi32> | |
| %55 = vector.insert %54, %53 [3] : i32 into vector<4xi32> | |
| %56 = vector.extract %arg10[0] : vector<4xi32> | |
| %57 = arith.muli %35, %55 : vector<4xi32> | |
| %58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32 | |
| %59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8> | |
| %64 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8> | |
| %66 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8> | |
| %68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32> | |
| %69 = vector.extract %68[0, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32> | |
| %71 = vector.extract %68[1, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [1] : i32 into vector<4xi32> | |
| %73 = vector.extract %68[2, 0] : vector<4x1xi32> | |
| %74 = vector.insert %73, %72 [2] : i32 into vector<4xi32> | |
| %75 = vector.extract %68[3, 0] : vector<4x1xi32> | |
| %76 = vector.insert %75, %74 [3] : i32 into vector<4xi32> | |
| %77 = vector.extract %arg10[1] : vector<4xi32> | |
| %78 = arith.muli %35, %76 : vector<4xi32> | |
| %79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32 | |
| %80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8> | |
| %85 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8> | |
| %87 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8> | |
| %89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32> | |
| %90 = vector.extract %89[0, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32> | |
| %92 = vector.extract %89[1, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [1] : i32 into vector<4xi32> | |
| %94 = vector.extract %89[2, 0] : vector<4x1xi32> | |
| %95 = vector.insert %94, %93 [2] : i32 into vector<4xi32> | |
| %96 = vector.extract %89[3, 0] : vector<4x1xi32> | |
| %97 = vector.insert %96, %95 [3] : i32 into vector<4xi32> | |
| %98 = vector.extract %arg10[2] : vector<4xi32> | |
| %99 = arith.muli %35, %97 : vector<4xi32> | |
| %100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32 | |
| %101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8> | |
| %106 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8> | |
| %108 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8> | |
| %110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32> | |
| %111 = vector.extract %110[0, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32> | |
| %113 = vector.extract %110[1, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [1] : i32 into vector<4xi32> | |
| %115 = vector.extract %110[2, 0] : vector<4x1xi32> | |
| %116 = vector.insert %115, %114 [2] : i32 into vector<4xi32> | |
| %117 = vector.extract %110[3, 0] : vector<4x1xi32> | |
| %118 = vector.insert %117, %116 [3] : i32 into vector<4xi32> | |
| %119 = vector.extract %arg10[3] : vector<4xi32> | |
| %120 = arith.muli %35, %118 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8> | |
| %127 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8> | |
| %129 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8> | |
| %131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32> | |
| %132 = vector.extract %131[0, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32> | |
| %134 = vector.extract %131[1, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [1] : i32 into vector<4xi32> | |
| %136 = vector.extract %131[2, 0] : vector<4x1xi32> | |
| %137 = vector.insert %136, %135 [2] : i32 into vector<4xi32> | |
| %138 = vector.extract %131[3, 0] : vector<4x1xi32> | |
| %139 = vector.insert %138, %137 [3] : i32 into vector<4xi32> | |
| %140 = vector.extract %arg9[0] : vector<4xi32> | |
| %141 = arith.muli %36, %139 : vector<4xi32> | |
| %142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32 | |
| %143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32> | |
| %144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8> | |
| %148 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8> | |
| %150 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8> | |
| %152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32> | |
| %153 = vector.extract %152[0, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32> | |
| %155 = vector.extract %152[1, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [1] : i32 into vector<4xi32> | |
| %157 = vector.extract %152[2, 0] : vector<4x1xi32> | |
| %158 = vector.insert %157, %156 [2] : i32 into vector<4xi32> | |
| %159 = vector.extract %152[3, 0] : vector<4x1xi32> | |
| %160 = vector.insert %159, %158 [3] : i32 into vector<4xi32> | |
| %161 = vector.extract %arg9[1] : vector<4xi32> | |
| %162 = arith.muli %36, %160 : vector<4xi32> | |
| %163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32 | |
| %164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32> | |
| %165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8> | |
| %169 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8> | |
| %171 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8> | |
| %173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32> | |
| %174 = vector.extract %173[0, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32> | |
| %176 = vector.extract %173[1, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [1] : i32 into vector<4xi32> | |
| %178 = vector.extract %173[2, 0] : vector<4x1xi32> | |
| %179 = vector.insert %178, %177 [2] : i32 into vector<4xi32> | |
| %180 = vector.extract %173[3, 0] : vector<4x1xi32> | |
| %181 = vector.insert %180, %179 [3] : i32 into vector<4xi32> | |
| %182 = vector.extract %arg9[2] : vector<4xi32> | |
| %183 = arith.muli %36, %181 : vector<4xi32> | |
| %184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32 | |
| %185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32> | |
| %186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8> | |
| %190 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8> | |
| %192 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8> | |
| %194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32> | |
| %195 = vector.extract %194[0, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32> | |
| %197 = vector.extract %194[1, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [1] : i32 into vector<4xi32> | |
| %199 = vector.extract %194[2, 0] : vector<4x1xi32> | |
| %200 = vector.insert %199, %198 [2] : i32 into vector<4xi32> | |
| %201 = vector.extract %194[3, 0] : vector<4x1xi32> | |
| %202 = vector.insert %201, %200 [3] : i32 into vector<4xi32> | |
| %203 = vector.extract %arg9[3] : vector<4xi32> | |
| %204 = arith.muli %36, %202 : vector<4xi32> | |
| %205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32 | |
| %206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32> | |
| %207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8> | |
| %211 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8> | |
| %213 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8> | |
| %215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32> | |
| %216 = vector.extract %215[0, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32> | |
| %218 = vector.extract %215[1, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [1] : i32 into vector<4xi32> | |
| %220 = vector.extract %215[2, 0] : vector<4x1xi32> | |
| %221 = vector.insert %220, %219 [2] : i32 into vector<4xi32> | |
| %222 = vector.extract %215[3, 0] : vector<4x1xi32> | |
| %223 = vector.insert %222, %221 [3] : i32 into vector<4xi32> | |
| %224 = vector.extract %arg8[0] : vector<4xi32> | |
| %225 = arith.muli %37, %223 : vector<4xi32> | |
| %226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32 | |
| %227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32> | |
| %228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8> | |
| %232 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8> | |
| %234 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8> | |
| %236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32> | |
| %237 = vector.extract %236[0, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32> | |
| %239 = vector.extract %236[1, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [1] : i32 into vector<4xi32> | |
| %241 = vector.extract %236[2, 0] : vector<4x1xi32> | |
| %242 = vector.insert %241, %240 [2] : i32 into vector<4xi32> | |
| %243 = vector.extract %236[3, 0] : vector<4x1xi32> | |
| %244 = vector.insert %243, %242 [3] : i32 into vector<4xi32> | |
| %245 = vector.extract %arg8[1] : vector<4xi32> | |
| %246 = arith.muli %37, %244 : vector<4xi32> | |
| %247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32 | |
| %248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32> | |
| %249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8> | |
| %253 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8> | |
| %255 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8> | |
| %257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32> | |
| %258 = vector.extract %257[0, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32> | |
| %260 = vector.extract %257[1, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [1] : i32 into vector<4xi32> | |
| %262 = vector.extract %257[2, 0] : vector<4x1xi32> | |
| %263 = vector.insert %262, %261 [2] : i32 into vector<4xi32> | |
| %264 = vector.extract %257[3, 0] : vector<4x1xi32> | |
| %265 = vector.insert %264, %263 [3] : i32 into vector<4xi32> | |
| %266 = vector.extract %arg8[2] : vector<4xi32> | |
| %267 = arith.muli %37, %265 : vector<4xi32> | |
| %268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32 | |
| %269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32> | |
| %270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8> | |
| %274 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8> | |
| %276 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8> | |
| %278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32> | |
| %279 = vector.extract %278[0, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32> | |
| %281 = vector.extract %278[1, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [1] : i32 into vector<4xi32> | |
| %283 = vector.extract %278[2, 0] : vector<4x1xi32> | |
| %284 = vector.insert %283, %282 [2] : i32 into vector<4xi32> | |
| %285 = vector.extract %278[3, 0] : vector<4x1xi32> | |
| %286 = vector.insert %285, %284 [3] : i32 into vector<4xi32> | |
| %287 = vector.extract %arg8[3] : vector<4xi32> | |
| %288 = arith.muli %37, %286 : vector<4xi32> | |
| %289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32 | |
| %290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32> | |
| %291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8> | |
| %295 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8> | |
| %297 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8> | |
| %299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32> | |
| %300 = vector.extract %299[0, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32> | |
| %302 = vector.extract %299[1, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [1] : i32 into vector<4xi32> | |
| %304 = vector.extract %299[2, 0] : vector<4x1xi32> | |
| %305 = vector.insert %304, %303 [2] : i32 into vector<4xi32> | |
| %306 = vector.extract %299[3, 0] : vector<4x1xi32> | |
| %307 = vector.insert %306, %305 [3] : i32 into vector<4xi32> | |
| %308 = vector.extract %arg7[0] : vector<4xi32> | |
| %309 = arith.muli %38, %307 : vector<4xi32> | |
| %310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32 | |
| %311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32> | |
| %312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8> | |
| %316 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8> | |
| %318 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8> | |
| %320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32> | |
| %321 = vector.extract %320[0, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32> | |
| %323 = vector.extract %320[1, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [1] : i32 into vector<4xi32> | |
| %325 = vector.extract %320[2, 0] : vector<4x1xi32> | |
| %326 = vector.insert %325, %324 [2] : i32 into vector<4xi32> | |
| %327 = vector.extract %320[3, 0] : vector<4x1xi32> | |
| %328 = vector.insert %327, %326 [3] : i32 into vector<4xi32> | |
| %329 = vector.extract %arg7[1] : vector<4xi32> | |
| %330 = arith.muli %38, %328 : vector<4xi32> | |
| %331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32 | |
| %332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32> | |
| %333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8> | |
| %337 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8> | |
| %339 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8> | |
| %341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32> | |
| %342 = vector.extract %341[0, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32> | |
| %344 = vector.extract %341[1, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [1] : i32 into vector<4xi32> | |
| %346 = vector.extract %341[2, 0] : vector<4x1xi32> | |
| %347 = vector.insert %346, %345 [2] : i32 into vector<4xi32> | |
| %348 = vector.extract %341[3, 0] : vector<4x1xi32> | |
| %349 = vector.insert %348, %347 [3] : i32 into vector<4xi32> | |
| %350 = vector.extract %arg7[2] : vector<4xi32> | |
| %351 = arith.muli %38, %349 : vector<4xi32> | |
| %352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32 | |
| %353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32> | |
| %354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8> | |
| %358 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8> | |
| %360 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8> | |
| %362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32> | |
| %363 = vector.extract %362[0, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32> | |
| %365 = vector.extract %362[1, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [1] : i32 into vector<4xi32> | |
| %367 = vector.extract %362[2, 0] : vector<4x1xi32> | |
| %368 = vector.insert %367, %366 [2] : i32 into vector<4xi32> | |
| %369 = vector.extract %362[3, 0] : vector<4x1xi32> | |
| %370 = vector.insert %369, %368 [3] : i32 into vector<4xi32> | |
| %371 = vector.extract %arg7[3] : vector<4xi32> | |
| %372 = arith.muli %38, %370 : vector<4xi32> | |
| %373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32 | |
| %374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32> | |
| %375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %390, %386, %382, %378 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %20 = vector.transfer_write %19#3, %18[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %21 = vector.transfer_write %19#2, %20[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %22 = vector.transfer_write %19#1, %21[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %23 = vector.transfer_write %19#0, %22[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After SPIRVVectorize (iree-spirv-vectorize) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| %119 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %120 = vector.insert %119, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %121 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %122 = vector.insert %121, %120 [1] : vector<1xi8> into vector<4x1xi8> | |
| %123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %122 [2] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [3] : vector<1xi8> into vector<4x1xi8> | |
| %127 = arith.extsi %126 : vector<4x1xi8> to vector<4x1xi32> | |
| %128 = vector.extract %127[0, 0] : vector<4x1xi32> | |
| %129 = vector.insert %128, %cst_1 [0] : i32 into vector<4xi32> | |
| %130 = vector.extract %127[1, 0] : vector<4x1xi32> | |
| %131 = vector.insert %130, %129 [1] : i32 into vector<4xi32> | |
| %132 = vector.extract %127[2, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %131 [2] : i32 into vector<4xi32> | |
| %134 = vector.extract %127[3, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [3] : i32 into vector<4xi32> | |
| %136 = vector.extract %arg9[0] : vector<4xi32> | |
| %137 = arith.muli %32, %135 : vector<4xi32> | |
| %138 = vector.reduction <add>, %137, %136 : vector<4xi32> into i32 | |
| %139 = vector.insert %138, %cst_0 [0] : i32 into vector<1xi32> | |
| %140 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %141 = vector.insert %140, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %142 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %143 = vector.insert %142, %141 [1] : vector<1xi8> into vector<4x1xi8> | |
| %144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %143 [2] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [3] : vector<1xi8> into vector<4x1xi8> | |
| %148 = arith.extsi %147 : vector<4x1xi8> to vector<4x1xi32> | |
| %149 = vector.extract %148[0, 0] : vector<4x1xi32> | |
| %150 = vector.insert %149, %cst_1 [0] : i32 into vector<4xi32> | |
| %151 = vector.extract %148[1, 0] : vector<4x1xi32> | |
| %152 = vector.insert %151, %150 [1] : i32 into vector<4xi32> | |
| %153 = vector.extract %148[2, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %152 [2] : i32 into vector<4xi32> | |
| %155 = vector.extract %148[3, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [3] : i32 into vector<4xi32> | |
| %157 = vector.extract %arg9[1] : vector<4xi32> | |
| %158 = arith.muli %32, %156 : vector<4xi32> | |
| %159 = vector.reduction <add>, %158, %157 : vector<4xi32> into i32 | |
| %160 = vector.insert %159, %cst_0 [0] : i32 into vector<1xi32> | |
| %161 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %162 = vector.insert %161, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %163 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %164 = vector.insert %163, %162 [1] : vector<1xi8> into vector<4x1xi8> | |
| %165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %164 [2] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [3] : vector<1xi8> into vector<4x1xi8> | |
| %169 = arith.extsi %168 : vector<4x1xi8> to vector<4x1xi32> | |
| %170 = vector.extract %169[0, 0] : vector<4x1xi32> | |
| %171 = vector.insert %170, %cst_1 [0] : i32 into vector<4xi32> | |
| %172 = vector.extract %169[1, 0] : vector<4x1xi32> | |
| %173 = vector.insert %172, %171 [1] : i32 into vector<4xi32> | |
| %174 = vector.extract %169[2, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %173 [2] : i32 into vector<4xi32> | |
| %176 = vector.extract %169[3, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [3] : i32 into vector<4xi32> | |
| %178 = vector.extract %arg9[2] : vector<4xi32> | |
| %179 = arith.muli %32, %177 : vector<4xi32> | |
| %180 = vector.reduction <add>, %179, %178 : vector<4xi32> into i32 | |
| %181 = vector.insert %180, %cst_0 [0] : i32 into vector<1xi32> | |
| %182 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %183 = vector.insert %182, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %184 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %185 = vector.insert %184, %183 [1] : vector<1xi8> into vector<4x1xi8> | |
| %186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %185 [2] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [3] : vector<1xi8> into vector<4x1xi8> | |
| %190 = arith.extsi %189 : vector<4x1xi8> to vector<4x1xi32> | |
| %191 = vector.extract %190[0, 0] : vector<4x1xi32> | |
| %192 = vector.insert %191, %cst_1 [0] : i32 into vector<4xi32> | |
| %193 = vector.extract %190[1, 0] : vector<4x1xi32> | |
| %194 = vector.insert %193, %192 [1] : i32 into vector<4xi32> | |
| %195 = vector.extract %190[2, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %194 [2] : i32 into vector<4xi32> | |
| %197 = vector.extract %190[3, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [3] : i32 into vector<4xi32> | |
| %199 = vector.extract %arg9[3] : vector<4xi32> | |
| %200 = arith.muli %32, %198 : vector<4xi32> | |
| %201 = vector.reduction <add>, %200, %199 : vector<4xi32> into i32 | |
| %202 = vector.insert %201, %cst_0 [0] : i32 into vector<1xi32> | |
| %203 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %204 = vector.insert %203, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %205 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %206 = vector.insert %205, %204 [1] : vector<1xi8> into vector<4x1xi8> | |
| %207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %206 [2] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [3] : vector<1xi8> into vector<4x1xi8> | |
| %211 = arith.extsi %210 : vector<4x1xi8> to vector<4x1xi32> | |
| %212 = vector.extract %211[0, 0] : vector<4x1xi32> | |
| %213 = vector.insert %212, %cst_1 [0] : i32 into vector<4xi32> | |
| %214 = vector.extract %211[1, 0] : vector<4x1xi32> | |
| %215 = vector.insert %214, %213 [1] : i32 into vector<4xi32> | |
| %216 = vector.extract %211[2, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %215 [2] : i32 into vector<4xi32> | |
| %218 = vector.extract %211[3, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [3] : i32 into vector<4xi32> | |
| %220 = vector.extract %arg8[0] : vector<4xi32> | |
| %221 = arith.muli %33, %219 : vector<4xi32> | |
| %222 = vector.reduction <add>, %221, %220 : vector<4xi32> into i32 | |
| %223 = vector.insert %222, %cst_0 [0] : i32 into vector<1xi32> | |
| %224 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %225 = vector.insert %224, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %226 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %227 = vector.insert %226, %225 [1] : vector<1xi8> into vector<4x1xi8> | |
| %228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %227 [2] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [3] : vector<1xi8> into vector<4x1xi8> | |
| %232 = arith.extsi %231 : vector<4x1xi8> to vector<4x1xi32> | |
| %233 = vector.extract %232[0, 0] : vector<4x1xi32> | |
| %234 = vector.insert %233, %cst_1 [0] : i32 into vector<4xi32> | |
| %235 = vector.extract %232[1, 0] : vector<4x1xi32> | |
| %236 = vector.insert %235, %234 [1] : i32 into vector<4xi32> | |
| %237 = vector.extract %232[2, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %236 [2] : i32 into vector<4xi32> | |
| %239 = vector.extract %232[3, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [3] : i32 into vector<4xi32> | |
| %241 = vector.extract %arg8[1] : vector<4xi32> | |
| %242 = arith.muli %33, %240 : vector<4xi32> | |
| %243 = vector.reduction <add>, %242, %241 : vector<4xi32> into i32 | |
| %244 = vector.insert %243, %cst_0 [0] : i32 into vector<1xi32> | |
| %245 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %246 = vector.insert %245, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %247 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %248 = vector.insert %247, %246 [1] : vector<1xi8> into vector<4x1xi8> | |
| %249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %248 [2] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [3] : vector<1xi8> into vector<4x1xi8> | |
| %253 = arith.extsi %252 : vector<4x1xi8> to vector<4x1xi32> | |
| %254 = vector.extract %253[0, 0] : vector<4x1xi32> | |
| %255 = vector.insert %254, %cst_1 [0] : i32 into vector<4xi32> | |
| %256 = vector.extract %253[1, 0] : vector<4x1xi32> | |
| %257 = vector.insert %256, %255 [1] : i32 into vector<4xi32> | |
| %258 = vector.extract %253[2, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %257 [2] : i32 into vector<4xi32> | |
| %260 = vector.extract %253[3, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [3] : i32 into vector<4xi32> | |
| %262 = vector.extract %arg8[2] : vector<4xi32> | |
| %263 = arith.muli %33, %261 : vector<4xi32> | |
| %264 = vector.reduction <add>, %263, %262 : vector<4xi32> into i32 | |
| %265 = vector.insert %264, %cst_0 [0] : i32 into vector<1xi32> | |
| %266 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %267 = vector.insert %266, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %268 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %269 = vector.insert %268, %267 [1] : vector<1xi8> into vector<4x1xi8> | |
| %270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %269 [2] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [3] : vector<1xi8> into vector<4x1xi8> | |
| %274 = arith.extsi %273 : vector<4x1xi8> to vector<4x1xi32> | |
| %275 = vector.extract %274[0, 0] : vector<4x1xi32> | |
| %276 = vector.insert %275, %cst_1 [0] : i32 into vector<4xi32> | |
| %277 = vector.extract %274[1, 0] : vector<4x1xi32> | |
| %278 = vector.insert %277, %276 [1] : i32 into vector<4xi32> | |
| %279 = vector.extract %274[2, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %278 [2] : i32 into vector<4xi32> | |
| %281 = vector.extract %274[3, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [3] : i32 into vector<4xi32> | |
| %283 = vector.extract %arg8[3] : vector<4xi32> | |
| %284 = arith.muli %33, %282 : vector<4xi32> | |
| %285 = vector.reduction <add>, %284, %283 : vector<4xi32> into i32 | |
| %286 = vector.insert %285, %cst_0 [0] : i32 into vector<1xi32> | |
| %287 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %288 = vector.insert %287, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %289 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %290 = vector.insert %289, %288 [1] : vector<1xi8> into vector<4x1xi8> | |
| %291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %290 [2] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [3] : vector<1xi8> into vector<4x1xi8> | |
| %295 = arith.extsi %294 : vector<4x1xi8> to vector<4x1xi32> | |
| %296 = vector.extract %295[0, 0] : vector<4x1xi32> | |
| %297 = vector.insert %296, %cst_1 [0] : i32 into vector<4xi32> | |
| %298 = vector.extract %295[1, 0] : vector<4x1xi32> | |
| %299 = vector.insert %298, %297 [1] : i32 into vector<4xi32> | |
| %300 = vector.extract %295[2, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %299 [2] : i32 into vector<4xi32> | |
| %302 = vector.extract %295[3, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [3] : i32 into vector<4xi32> | |
| %304 = vector.extract %arg7[0] : vector<4xi32> | |
| %305 = arith.muli %34, %303 : vector<4xi32> | |
| %306 = vector.reduction <add>, %305, %304 : vector<4xi32> into i32 | |
| %307 = vector.insert %306, %cst_0 [0] : i32 into vector<1xi32> | |
| %308 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %309 = vector.insert %308, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %310 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %311 = vector.insert %310, %309 [1] : vector<1xi8> into vector<4x1xi8> | |
| %312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %311 [2] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [3] : vector<1xi8> into vector<4x1xi8> | |
| %316 = arith.extsi %315 : vector<4x1xi8> to vector<4x1xi32> | |
| %317 = vector.extract %316[0, 0] : vector<4x1xi32> | |
| %318 = vector.insert %317, %cst_1 [0] : i32 into vector<4xi32> | |
| %319 = vector.extract %316[1, 0] : vector<4x1xi32> | |
| %320 = vector.insert %319, %318 [1] : i32 into vector<4xi32> | |
| %321 = vector.extract %316[2, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %320 [2] : i32 into vector<4xi32> | |
| %323 = vector.extract %316[3, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [3] : i32 into vector<4xi32> | |
| %325 = vector.extract %arg7[1] : vector<4xi32> | |
| %326 = arith.muli %34, %324 : vector<4xi32> | |
| %327 = vector.reduction <add>, %326, %325 : vector<4xi32> into i32 | |
| %328 = vector.insert %327, %cst_0 [0] : i32 into vector<1xi32> | |
| %329 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %330 = vector.insert %329, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %331 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %332 = vector.insert %331, %330 [1] : vector<1xi8> into vector<4x1xi8> | |
| %333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %332 [2] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [3] : vector<1xi8> into vector<4x1xi8> | |
| %337 = arith.extsi %336 : vector<4x1xi8> to vector<4x1xi32> | |
| %338 = vector.extract %337[0, 0] : vector<4x1xi32> | |
| %339 = vector.insert %338, %cst_1 [0] : i32 into vector<4xi32> | |
| %340 = vector.extract %337[1, 0] : vector<4x1xi32> | |
| %341 = vector.insert %340, %339 [1] : i32 into vector<4xi32> | |
| %342 = vector.extract %337[2, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %341 [2] : i32 into vector<4xi32> | |
| %344 = vector.extract %337[3, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [3] : i32 into vector<4xi32> | |
| %346 = vector.extract %arg7[2] : vector<4xi32> | |
| %347 = arith.muli %34, %345 : vector<4xi32> | |
| %348 = vector.reduction <add>, %347, %346 : vector<4xi32> into i32 | |
| %349 = vector.insert %348, %cst_0 [0] : i32 into vector<1xi32> | |
| %350 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %351 = vector.insert %350, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %352 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %353 = vector.insert %352, %351 [1] : vector<1xi8> into vector<4x1xi8> | |
| %354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %353 [2] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [3] : vector<1xi8> into vector<4x1xi8> | |
| %358 = arith.extsi %357 : vector<4x1xi8> to vector<4x1xi32> | |
| %359 = vector.extract %358[0, 0] : vector<4x1xi32> | |
| %360 = vector.insert %359, %cst_1 [0] : i32 into vector<4xi32> | |
| %361 = vector.extract %358[1, 0] : vector<4x1xi32> | |
| %362 = vector.insert %361, %360 [1] : i32 into vector<4xi32> | |
| %363 = vector.extract %358[2, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %362 [2] : i32 into vector<4xi32> | |
| %365 = vector.extract %358[3, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [3] : i32 into vector<4xi32> | |
| %367 = vector.extract %arg7[3] : vector<4xi32> | |
| %368 = arith.muli %34, %366 : vector<4xi32> | |
| %369 = vector.reduction <add>, %368, %367 : vector<4xi32> into i32 | |
| %370 = vector.insert %369, %cst_0 [0] : i32 into vector<1xi32> | |
| %371 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %372 = vector.insert_strided_slice %76, %371 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %373 = vector.insert_strided_slice %97, %372 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %374 = vector.insert_strided_slice %118, %373 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %375 = vector.insert_strided_slice %139, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %160, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %181, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %202, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %223, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %244, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %265, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %286, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %307, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %328, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %349, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %370, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %386, %382, %378, %374 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After ForOpCanonicalization (iree-codegen-canonicalize-scf-for) //----- // | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| %119 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %120 = vector.insert %119, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %121 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %122 = vector.insert %121, %120 [1] : vector<1xi8> into vector<4x1xi8> | |
| %123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %122 [2] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [3] : vector<1xi8> into vector<4x1xi8> | |
| %127 = arith.extsi %126 : vector<4x1xi8> to vector<4x1xi32> | |
| %128 = vector.extract %127[0, 0] : vector<4x1xi32> | |
| %129 = vector.insert %128, %cst_1 [0] : i32 into vector<4xi32> | |
| %130 = vector.extract %127[1, 0] : vector<4x1xi32> | |
| %131 = vector.insert %130, %129 [1] : i32 into vector<4xi32> | |
| %132 = vector.extract %127[2, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %131 [2] : i32 into vector<4xi32> | |
| %134 = vector.extract %127[3, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [3] : i32 into vector<4xi32> | |
| %136 = vector.extract %arg9[0] : vector<4xi32> | |
| %137 = arith.muli %32, %135 : vector<4xi32> | |
| %138 = vector.reduction <add>, %137, %136 : vector<4xi32> into i32 | |
| %139 = vector.insert %138, %cst_0 [0] : i32 into vector<1xi32> | |
| %140 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %141 = vector.insert %140, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %142 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %143 = vector.insert %142, %141 [1] : vector<1xi8> into vector<4x1xi8> | |
| %144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %143 [2] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [3] : vector<1xi8> into vector<4x1xi8> | |
| %148 = arith.extsi %147 : vector<4x1xi8> to vector<4x1xi32> | |
| %149 = vector.extract %148[0, 0] : vector<4x1xi32> | |
| %150 = vector.insert %149, %cst_1 [0] : i32 into vector<4xi32> | |
| %151 = vector.extract %148[1, 0] : vector<4x1xi32> | |
| %152 = vector.insert %151, %150 [1] : i32 into vector<4xi32> | |
| %153 = vector.extract %148[2, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %152 [2] : i32 into vector<4xi32> | |
| %155 = vector.extract %148[3, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [3] : i32 into vector<4xi32> | |
| %157 = vector.extract %arg9[1] : vector<4xi32> | |
| %158 = arith.muli %32, %156 : vector<4xi32> | |
| %159 = vector.reduction <add>, %158, %157 : vector<4xi32> into i32 | |
| %160 = vector.insert %159, %cst_0 [0] : i32 into vector<1xi32> | |
| %161 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %162 = vector.insert %161, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %163 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %164 = vector.insert %163, %162 [1] : vector<1xi8> into vector<4x1xi8> | |
| %165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %164 [2] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [3] : vector<1xi8> into vector<4x1xi8> | |
| %169 = arith.extsi %168 : vector<4x1xi8> to vector<4x1xi32> | |
| %170 = vector.extract %169[0, 0] : vector<4x1xi32> | |
| %171 = vector.insert %170, %cst_1 [0] : i32 into vector<4xi32> | |
| %172 = vector.extract %169[1, 0] : vector<4x1xi32> | |
| %173 = vector.insert %172, %171 [1] : i32 into vector<4xi32> | |
| %174 = vector.extract %169[2, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %173 [2] : i32 into vector<4xi32> | |
| %176 = vector.extract %169[3, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [3] : i32 into vector<4xi32> | |
| %178 = vector.extract %arg9[2] : vector<4xi32> | |
| %179 = arith.muli %32, %177 : vector<4xi32> | |
| %180 = vector.reduction <add>, %179, %178 : vector<4xi32> into i32 | |
| %181 = vector.insert %180, %cst_0 [0] : i32 into vector<1xi32> | |
| %182 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %183 = vector.insert %182, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %184 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %185 = vector.insert %184, %183 [1] : vector<1xi8> into vector<4x1xi8> | |
| %186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %185 [2] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [3] : vector<1xi8> into vector<4x1xi8> | |
| %190 = arith.extsi %189 : vector<4x1xi8> to vector<4x1xi32> | |
| %191 = vector.extract %190[0, 0] : vector<4x1xi32> | |
| %192 = vector.insert %191, %cst_1 [0] : i32 into vector<4xi32> | |
| %193 = vector.extract %190[1, 0] : vector<4x1xi32> | |
| %194 = vector.insert %193, %192 [1] : i32 into vector<4xi32> | |
| %195 = vector.extract %190[2, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %194 [2] : i32 into vector<4xi32> | |
| %197 = vector.extract %190[3, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [3] : i32 into vector<4xi32> | |
| %199 = vector.extract %arg9[3] : vector<4xi32> | |
| %200 = arith.muli %32, %198 : vector<4xi32> | |
| %201 = vector.reduction <add>, %200, %199 : vector<4xi32> into i32 | |
| %202 = vector.insert %201, %cst_0 [0] : i32 into vector<1xi32> | |
| %203 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %204 = vector.insert %203, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %205 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %206 = vector.insert %205, %204 [1] : vector<1xi8> into vector<4x1xi8> | |
| %207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %206 [2] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [3] : vector<1xi8> into vector<4x1xi8> | |
| %211 = arith.extsi %210 : vector<4x1xi8> to vector<4x1xi32> | |
| %212 = vector.extract %211[0, 0] : vector<4x1xi32> | |
| %213 = vector.insert %212, %cst_1 [0] : i32 into vector<4xi32> | |
| %214 = vector.extract %211[1, 0] : vector<4x1xi32> | |
| %215 = vector.insert %214, %213 [1] : i32 into vector<4xi32> | |
| %216 = vector.extract %211[2, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %215 [2] : i32 into vector<4xi32> | |
| %218 = vector.extract %211[3, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [3] : i32 into vector<4xi32> | |
| %220 = vector.extract %arg8[0] : vector<4xi32> | |
| %221 = arith.muli %33, %219 : vector<4xi32> | |
| %222 = vector.reduction <add>, %221, %220 : vector<4xi32> into i32 | |
| %223 = vector.insert %222, %cst_0 [0] : i32 into vector<1xi32> | |
| %224 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %225 = vector.insert %224, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %226 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %227 = vector.insert %226, %225 [1] : vector<1xi8> into vector<4x1xi8> | |
| %228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %227 [2] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [3] : vector<1xi8> into vector<4x1xi8> | |
| %232 = arith.extsi %231 : vector<4x1xi8> to vector<4x1xi32> | |
| %233 = vector.extract %232[0, 0] : vector<4x1xi32> | |
| %234 = vector.insert %233, %cst_1 [0] : i32 into vector<4xi32> | |
| %235 = vector.extract %232[1, 0] : vector<4x1xi32> | |
| %236 = vector.insert %235, %234 [1] : i32 into vector<4xi32> | |
| %237 = vector.extract %232[2, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %236 [2] : i32 into vector<4xi32> | |
| %239 = vector.extract %232[3, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [3] : i32 into vector<4xi32> | |
| %241 = vector.extract %arg8[1] : vector<4xi32> | |
| %242 = arith.muli %33, %240 : vector<4xi32> | |
| %243 = vector.reduction <add>, %242, %241 : vector<4xi32> into i32 | |
| %244 = vector.insert %243, %cst_0 [0] : i32 into vector<1xi32> | |
| %245 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %246 = vector.insert %245, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %247 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %248 = vector.insert %247, %246 [1] : vector<1xi8> into vector<4x1xi8> | |
| %249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %248 [2] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [3] : vector<1xi8> into vector<4x1xi8> | |
| %253 = arith.extsi %252 : vector<4x1xi8> to vector<4x1xi32> | |
| %254 = vector.extract %253[0, 0] : vector<4x1xi32> | |
| %255 = vector.insert %254, %cst_1 [0] : i32 into vector<4xi32> | |
| %256 = vector.extract %253[1, 0] : vector<4x1xi32> | |
| %257 = vector.insert %256, %255 [1] : i32 into vector<4xi32> | |
| %258 = vector.extract %253[2, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %257 [2] : i32 into vector<4xi32> | |
| %260 = vector.extract %253[3, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [3] : i32 into vector<4xi32> | |
| %262 = vector.extract %arg8[2] : vector<4xi32> | |
| %263 = arith.muli %33, %261 : vector<4xi32> | |
| %264 = vector.reduction <add>, %263, %262 : vector<4xi32> into i32 | |
| %265 = vector.insert %264, %cst_0 [0] : i32 into vector<1xi32> | |
| %266 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %267 = vector.insert %266, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %268 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %269 = vector.insert %268, %267 [1] : vector<1xi8> into vector<4x1xi8> | |
| %270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %269 [2] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [3] : vector<1xi8> into vector<4x1xi8> | |
| %274 = arith.extsi %273 : vector<4x1xi8> to vector<4x1xi32> | |
| %275 = vector.extract %274[0, 0] : vector<4x1xi32> | |
| %276 = vector.insert %275, %cst_1 [0] : i32 into vector<4xi32> | |
| %277 = vector.extract %274[1, 0] : vector<4x1xi32> | |
| %278 = vector.insert %277, %276 [1] : i32 into vector<4xi32> | |
| %279 = vector.extract %274[2, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %278 [2] : i32 into vector<4xi32> | |
| %281 = vector.extract %274[3, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [3] : i32 into vector<4xi32> | |
| %283 = vector.extract %arg8[3] : vector<4xi32> | |
| %284 = arith.muli %33, %282 : vector<4xi32> | |
| %285 = vector.reduction <add>, %284, %283 : vector<4xi32> into i32 | |
| %286 = vector.insert %285, %cst_0 [0] : i32 into vector<1xi32> | |
| %287 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %288 = vector.insert %287, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %289 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %290 = vector.insert %289, %288 [1] : vector<1xi8> into vector<4x1xi8> | |
| %291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %290 [2] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [3] : vector<1xi8> into vector<4x1xi8> | |
| %295 = arith.extsi %294 : vector<4x1xi8> to vector<4x1xi32> | |
| %296 = vector.extract %295[0, 0] : vector<4x1xi32> | |
| %297 = vector.insert %296, %cst_1 [0] : i32 into vector<4xi32> | |
| %298 = vector.extract %295[1, 0] : vector<4x1xi32> | |
| %299 = vector.insert %298, %297 [1] : i32 into vector<4xi32> | |
| %300 = vector.extract %295[2, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %299 [2] : i32 into vector<4xi32> | |
| %302 = vector.extract %295[3, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [3] : i32 into vector<4xi32> | |
| %304 = vector.extract %arg7[0] : vector<4xi32> | |
| %305 = arith.muli %34, %303 : vector<4xi32> | |
| %306 = vector.reduction <add>, %305, %304 : vector<4xi32> into i32 | |
| %307 = vector.insert %306, %cst_0 [0] : i32 into vector<1xi32> | |
| %308 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %309 = vector.insert %308, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %310 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %311 = vector.insert %310, %309 [1] : vector<1xi8> into vector<4x1xi8> | |
| %312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %311 [2] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [3] : vector<1xi8> into vector<4x1xi8> | |
| %316 = arith.extsi %315 : vector<4x1xi8> to vector<4x1xi32> | |
| %317 = vector.extract %316[0, 0] : vector<4x1xi32> | |
| %318 = vector.insert %317, %cst_1 [0] : i32 into vector<4xi32> | |
| %319 = vector.extract %316[1, 0] : vector<4x1xi32> | |
| %320 = vector.insert %319, %318 [1] : i32 into vector<4xi32> | |
| %321 = vector.extract %316[2, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %320 [2] : i32 into vector<4xi32> | |
| %323 = vector.extract %316[3, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [3] : i32 into vector<4xi32> | |
| %325 = vector.extract %arg7[1] : vector<4xi32> | |
| %326 = arith.muli %34, %324 : vector<4xi32> | |
| %327 = vector.reduction <add>, %326, %325 : vector<4xi32> into i32 | |
| %328 = vector.insert %327, %cst_0 [0] : i32 into vector<1xi32> | |
| %329 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %330 = vector.insert %329, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %331 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %332 = vector.insert %331, %330 [1] : vector<1xi8> into vector<4x1xi8> | |
| %333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %332 [2] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [3] : vector<1xi8> into vector<4x1xi8> | |
| %337 = arith.extsi %336 : vector<4x1xi8> to vector<4x1xi32> | |
| %338 = vector.extract %337[0, 0] : vector<4x1xi32> | |
| %339 = vector.insert %338, %cst_1 [0] : i32 into vector<4xi32> | |
| %340 = vector.extract %337[1, 0] : vector<4x1xi32> | |
| %341 = vector.insert %340, %339 [1] : i32 into vector<4xi32> | |
| %342 = vector.extract %337[2, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %341 [2] : i32 into vector<4xi32> | |
| %344 = vector.extract %337[3, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [3] : i32 into vector<4xi32> | |
| %346 = vector.extract %arg7[2] : vector<4xi32> | |
| %347 = arith.muli %34, %345 : vector<4xi32> | |
| %348 = vector.reduction <add>, %347, %346 : vector<4xi32> into i32 | |
| %349 = vector.insert %348, %cst_0 [0] : i32 into vector<1xi32> | |
| %350 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %351 = vector.insert %350, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %352 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %353 = vector.insert %352, %351 [1] : vector<1xi8> into vector<4x1xi8> | |
| %354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %353 [2] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [3] : vector<1xi8> into vector<4x1xi8> | |
| %358 = arith.extsi %357 : vector<4x1xi8> to vector<4x1xi32> | |
| %359 = vector.extract %358[0, 0] : vector<4x1xi32> | |
| %360 = vector.insert %359, %cst_1 [0] : i32 into vector<4xi32> | |
| %361 = vector.extract %358[1, 0] : vector<4x1xi32> | |
| %362 = vector.insert %361, %360 [1] : i32 into vector<4xi32> | |
| %363 = vector.extract %358[2, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %362 [2] : i32 into vector<4xi32> | |
| %365 = vector.extract %358[3, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [3] : i32 into vector<4xi32> | |
| %367 = vector.extract %arg7[3] : vector<4xi32> | |
| %368 = arith.muli %34, %366 : vector<4xi32> | |
| %369 = vector.reduction <add>, %368, %367 : vector<4xi32> into i32 | |
| %370 = vector.insert %369, %cst_0 [0] : i32 into vector<1xi32> | |
| %371 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %372 = vector.insert_strided_slice %76, %371 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %373 = vector.insert_strided_slice %97, %372 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %374 = vector.insert_strided_slice %118, %373 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %375 = vector.insert_strided_slice %139, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %160, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %181, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %202, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %223, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %244, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %265, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %286, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %307, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %328, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %349, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %370, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %386, %382, %378, %374 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| // -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| %119 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %120 = vector.insert %119, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %121 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %122 = vector.insert %121, %120 [1] : vector<1xi8> into vector<4x1xi8> | |
| %123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %124 = vector.insert %123, %122 [2] : vector<1xi8> into vector<4x1xi8> | |
| %125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %126 = vector.insert %125, %124 [3] : vector<1xi8> into vector<4x1xi8> | |
| %127 = arith.extsi %126 : vector<4x1xi8> to vector<4x1xi32> | |
| %128 = vector.extract %127[0, 0] : vector<4x1xi32> | |
| %129 = vector.insert %128, %cst_1 [0] : i32 into vector<4xi32> | |
| %130 = vector.extract %127[1, 0] : vector<4x1xi32> | |
| %131 = vector.insert %130, %129 [1] : i32 into vector<4xi32> | |
| %132 = vector.extract %127[2, 0] : vector<4x1xi32> | |
| %133 = vector.insert %132, %131 [2] : i32 into vector<4xi32> | |
| %134 = vector.extract %127[3, 0] : vector<4x1xi32> | |
| %135 = vector.insert %134, %133 [3] : i32 into vector<4xi32> | |
| %136 = vector.extract %arg9[0] : vector<4xi32> | |
| %137 = arith.muli %32, %135 : vector<4xi32> | |
| %138 = vector.reduction <add>, %137, %136 : vector<4xi32> into i32 | |
| %139 = vector.insert %138, %cst_0 [0] : i32 into vector<1xi32> | |
| %140 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %141 = vector.insert %140, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %142 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %143 = vector.insert %142, %141 [1] : vector<1xi8> into vector<4x1xi8> | |
| %144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %145 = vector.insert %144, %143 [2] : vector<1xi8> into vector<4x1xi8> | |
| %146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %147 = vector.insert %146, %145 [3] : vector<1xi8> into vector<4x1xi8> | |
| %148 = arith.extsi %147 : vector<4x1xi8> to vector<4x1xi32> | |
| %149 = vector.extract %148[0, 0] : vector<4x1xi32> | |
| %150 = vector.insert %149, %cst_1 [0] : i32 into vector<4xi32> | |
| %151 = vector.extract %148[1, 0] : vector<4x1xi32> | |
| %152 = vector.insert %151, %150 [1] : i32 into vector<4xi32> | |
| %153 = vector.extract %148[2, 0] : vector<4x1xi32> | |
| %154 = vector.insert %153, %152 [2] : i32 into vector<4xi32> | |
| %155 = vector.extract %148[3, 0] : vector<4x1xi32> | |
| %156 = vector.insert %155, %154 [3] : i32 into vector<4xi32> | |
| %157 = vector.extract %arg9[1] : vector<4xi32> | |
| %158 = arith.muli %32, %156 : vector<4xi32> | |
| %159 = vector.reduction <add>, %158, %157 : vector<4xi32> into i32 | |
| %160 = vector.insert %159, %cst_0 [0] : i32 into vector<1xi32> | |
| %161 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %162 = vector.insert %161, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %163 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %164 = vector.insert %163, %162 [1] : vector<1xi8> into vector<4x1xi8> | |
| %165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %166 = vector.insert %165, %164 [2] : vector<1xi8> into vector<4x1xi8> | |
| %167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %168 = vector.insert %167, %166 [3] : vector<1xi8> into vector<4x1xi8> | |
| %169 = arith.extsi %168 : vector<4x1xi8> to vector<4x1xi32> | |
| %170 = vector.extract %169[0, 0] : vector<4x1xi32> | |
| %171 = vector.insert %170, %cst_1 [0] : i32 into vector<4xi32> | |
| %172 = vector.extract %169[1, 0] : vector<4x1xi32> | |
| %173 = vector.insert %172, %171 [1] : i32 into vector<4xi32> | |
| %174 = vector.extract %169[2, 0] : vector<4x1xi32> | |
| %175 = vector.insert %174, %173 [2] : i32 into vector<4xi32> | |
| %176 = vector.extract %169[3, 0] : vector<4x1xi32> | |
| %177 = vector.insert %176, %175 [3] : i32 into vector<4xi32> | |
| %178 = vector.extract %arg9[2] : vector<4xi32> | |
| %179 = arith.muli %32, %177 : vector<4xi32> | |
| %180 = vector.reduction <add>, %179, %178 : vector<4xi32> into i32 | |
| %181 = vector.insert %180, %cst_0 [0] : i32 into vector<1xi32> | |
| %182 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %183 = vector.insert %182, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %184 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %185 = vector.insert %184, %183 [1] : vector<1xi8> into vector<4x1xi8> | |
| %186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %187 = vector.insert %186, %185 [2] : vector<1xi8> into vector<4x1xi8> | |
| %188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %189 = vector.insert %188, %187 [3] : vector<1xi8> into vector<4x1xi8> | |
| %190 = arith.extsi %189 : vector<4x1xi8> to vector<4x1xi32> | |
| %191 = vector.extract %190[0, 0] : vector<4x1xi32> | |
| %192 = vector.insert %191, %cst_1 [0] : i32 into vector<4xi32> | |
| %193 = vector.extract %190[1, 0] : vector<4x1xi32> | |
| %194 = vector.insert %193, %192 [1] : i32 into vector<4xi32> | |
| %195 = vector.extract %190[2, 0] : vector<4x1xi32> | |
| %196 = vector.insert %195, %194 [2] : i32 into vector<4xi32> | |
| %197 = vector.extract %190[3, 0] : vector<4x1xi32> | |
| %198 = vector.insert %197, %196 [3] : i32 into vector<4xi32> | |
| %199 = vector.extract %arg9[3] : vector<4xi32> | |
| %200 = arith.muli %32, %198 : vector<4xi32> | |
| %201 = vector.reduction <add>, %200, %199 : vector<4xi32> into i32 | |
| %202 = vector.insert %201, %cst_0 [0] : i32 into vector<1xi32> | |
| %203 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %204 = vector.insert %203, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %205 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %206 = vector.insert %205, %204 [1] : vector<1xi8> into vector<4x1xi8> | |
| %207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %208 = vector.insert %207, %206 [2] : vector<1xi8> into vector<4x1xi8> | |
| %209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %210 = vector.insert %209, %208 [3] : vector<1xi8> into vector<4x1xi8> | |
| %211 = arith.extsi %210 : vector<4x1xi8> to vector<4x1xi32> | |
| %212 = vector.extract %211[0, 0] : vector<4x1xi32> | |
| %213 = vector.insert %212, %cst_1 [0] : i32 into vector<4xi32> | |
| %214 = vector.extract %211[1, 0] : vector<4x1xi32> | |
| %215 = vector.insert %214, %213 [1] : i32 into vector<4xi32> | |
| %216 = vector.extract %211[2, 0] : vector<4x1xi32> | |
| %217 = vector.insert %216, %215 [2] : i32 into vector<4xi32> | |
| %218 = vector.extract %211[3, 0] : vector<4x1xi32> | |
| %219 = vector.insert %218, %217 [3] : i32 into vector<4xi32> | |
| %220 = vector.extract %arg8[0] : vector<4xi32> | |
| %221 = arith.muli %33, %219 : vector<4xi32> | |
| %222 = vector.reduction <add>, %221, %220 : vector<4xi32> into i32 | |
| %223 = vector.insert %222, %cst_0 [0] : i32 into vector<1xi32> | |
| %224 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %225 = vector.insert %224, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %226 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %227 = vector.insert %226, %225 [1] : vector<1xi8> into vector<4x1xi8> | |
| %228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %229 = vector.insert %228, %227 [2] : vector<1xi8> into vector<4x1xi8> | |
| %230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %231 = vector.insert %230, %229 [3] : vector<1xi8> into vector<4x1xi8> | |
| %232 = arith.extsi %231 : vector<4x1xi8> to vector<4x1xi32> | |
| %233 = vector.extract %232[0, 0] : vector<4x1xi32> | |
| %234 = vector.insert %233, %cst_1 [0] : i32 into vector<4xi32> | |
| %235 = vector.extract %232[1, 0] : vector<4x1xi32> | |
| %236 = vector.insert %235, %234 [1] : i32 into vector<4xi32> | |
| %237 = vector.extract %232[2, 0] : vector<4x1xi32> | |
| %238 = vector.insert %237, %236 [2] : i32 into vector<4xi32> | |
| %239 = vector.extract %232[3, 0] : vector<4x1xi32> | |
| %240 = vector.insert %239, %238 [3] : i32 into vector<4xi32> | |
| %241 = vector.extract %arg8[1] : vector<4xi32> | |
| %242 = arith.muli %33, %240 : vector<4xi32> | |
| %243 = vector.reduction <add>, %242, %241 : vector<4xi32> into i32 | |
| %244 = vector.insert %243, %cst_0 [0] : i32 into vector<1xi32> | |
| %245 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %246 = vector.insert %245, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %247 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %248 = vector.insert %247, %246 [1] : vector<1xi8> into vector<4x1xi8> | |
| %249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %250 = vector.insert %249, %248 [2] : vector<1xi8> into vector<4x1xi8> | |
| %251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %252 = vector.insert %251, %250 [3] : vector<1xi8> into vector<4x1xi8> | |
| %253 = arith.extsi %252 : vector<4x1xi8> to vector<4x1xi32> | |
| %254 = vector.extract %253[0, 0] : vector<4x1xi32> | |
| %255 = vector.insert %254, %cst_1 [0] : i32 into vector<4xi32> | |
| %256 = vector.extract %253[1, 0] : vector<4x1xi32> | |
| %257 = vector.insert %256, %255 [1] : i32 into vector<4xi32> | |
| %258 = vector.extract %253[2, 0] : vector<4x1xi32> | |
| %259 = vector.insert %258, %257 [2] : i32 into vector<4xi32> | |
| %260 = vector.extract %253[3, 0] : vector<4x1xi32> | |
| %261 = vector.insert %260, %259 [3] : i32 into vector<4xi32> | |
| %262 = vector.extract %arg8[2] : vector<4xi32> | |
| %263 = arith.muli %33, %261 : vector<4xi32> | |
| %264 = vector.reduction <add>, %263, %262 : vector<4xi32> into i32 | |
| %265 = vector.insert %264, %cst_0 [0] : i32 into vector<1xi32> | |
| %266 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %267 = vector.insert %266, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %268 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %269 = vector.insert %268, %267 [1] : vector<1xi8> into vector<4x1xi8> | |
| %270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %271 = vector.insert %270, %269 [2] : vector<1xi8> into vector<4x1xi8> | |
| %272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %273 = vector.insert %272, %271 [3] : vector<1xi8> into vector<4x1xi8> | |
| %274 = arith.extsi %273 : vector<4x1xi8> to vector<4x1xi32> | |
| %275 = vector.extract %274[0, 0] : vector<4x1xi32> | |
| %276 = vector.insert %275, %cst_1 [0] : i32 into vector<4xi32> | |
| %277 = vector.extract %274[1, 0] : vector<4x1xi32> | |
| %278 = vector.insert %277, %276 [1] : i32 into vector<4xi32> | |
| %279 = vector.extract %274[2, 0] : vector<4x1xi32> | |
| %280 = vector.insert %279, %278 [2] : i32 into vector<4xi32> | |
| %281 = vector.extract %274[3, 0] : vector<4x1xi32> | |
| %282 = vector.insert %281, %280 [3] : i32 into vector<4xi32> | |
| %283 = vector.extract %arg8[3] : vector<4xi32> | |
| %284 = arith.muli %33, %282 : vector<4xi32> | |
| %285 = vector.reduction <add>, %284, %283 : vector<4xi32> into i32 | |
| %286 = vector.insert %285, %cst_0 [0] : i32 into vector<1xi32> | |
| %287 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %288 = vector.insert %287, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %289 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %290 = vector.insert %289, %288 [1] : vector<1xi8> into vector<4x1xi8> | |
| %291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %292 = vector.insert %291, %290 [2] : vector<1xi8> into vector<4x1xi8> | |
| %293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %294 = vector.insert %293, %292 [3] : vector<1xi8> into vector<4x1xi8> | |
| %295 = arith.extsi %294 : vector<4x1xi8> to vector<4x1xi32> | |
| %296 = vector.extract %295[0, 0] : vector<4x1xi32> | |
| %297 = vector.insert %296, %cst_1 [0] : i32 into vector<4xi32> | |
| %298 = vector.extract %295[1, 0] : vector<4x1xi32> | |
| %299 = vector.insert %298, %297 [1] : i32 into vector<4xi32> | |
| %300 = vector.extract %295[2, 0] : vector<4x1xi32> | |
| %301 = vector.insert %300, %299 [2] : i32 into vector<4xi32> | |
| %302 = vector.extract %295[3, 0] : vector<4x1xi32> | |
| %303 = vector.insert %302, %301 [3] : i32 into vector<4xi32> | |
| %304 = vector.extract %arg7[0] : vector<4xi32> | |
| %305 = arith.muli %34, %303 : vector<4xi32> | |
| %306 = vector.reduction <add>, %305, %304 : vector<4xi32> into i32 | |
| %307 = vector.insert %306, %cst_0 [0] : i32 into vector<1xi32> | |
| %308 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %309 = vector.insert %308, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %310 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %311 = vector.insert %310, %309 [1] : vector<1xi8> into vector<4x1xi8> | |
| %312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %313 = vector.insert %312, %311 [2] : vector<1xi8> into vector<4x1xi8> | |
| %314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %315 = vector.insert %314, %313 [3] : vector<1xi8> into vector<4x1xi8> | |
| %316 = arith.extsi %315 : vector<4x1xi8> to vector<4x1xi32> | |
| %317 = vector.extract %316[0, 0] : vector<4x1xi32> | |
| %318 = vector.insert %317, %cst_1 [0] : i32 into vector<4xi32> | |
| %319 = vector.extract %316[1, 0] : vector<4x1xi32> | |
| %320 = vector.insert %319, %318 [1] : i32 into vector<4xi32> | |
| %321 = vector.extract %316[2, 0] : vector<4x1xi32> | |
| %322 = vector.insert %321, %320 [2] : i32 into vector<4xi32> | |
| %323 = vector.extract %316[3, 0] : vector<4x1xi32> | |
| %324 = vector.insert %323, %322 [3] : i32 into vector<4xi32> | |
| %325 = vector.extract %arg7[1] : vector<4xi32> | |
| %326 = arith.muli %34, %324 : vector<4xi32> | |
| %327 = vector.reduction <add>, %326, %325 : vector<4xi32> into i32 | |
| %328 = vector.insert %327, %cst_0 [0] : i32 into vector<1xi32> | |
| %329 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %330 = vector.insert %329, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %331 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %332 = vector.insert %331, %330 [1] : vector<1xi8> into vector<4x1xi8> | |
| %333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %334 = vector.insert %333, %332 [2] : vector<1xi8> into vector<4x1xi8> | |
| %335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %336 = vector.insert %335, %334 [3] : vector<1xi8> into vector<4x1xi8> | |
| %337 = arith.extsi %336 : vector<4x1xi8> to vector<4x1xi32> | |
| %338 = vector.extract %337[0, 0] : vector<4x1xi32> | |
| %339 = vector.insert %338, %cst_1 [0] : i32 into vector<4xi32> | |
| %340 = vector.extract %337[1, 0] : vector<4x1xi32> | |
| %341 = vector.insert %340, %339 [1] : i32 into vector<4xi32> | |
| %342 = vector.extract %337[2, 0] : vector<4x1xi32> | |
| %343 = vector.insert %342, %341 [2] : i32 into vector<4xi32> | |
| %344 = vector.extract %337[3, 0] : vector<4x1xi32> | |
| %345 = vector.insert %344, %343 [3] : i32 into vector<4xi32> | |
| %346 = vector.extract %arg7[2] : vector<4xi32> | |
| %347 = arith.muli %34, %345 : vector<4xi32> | |
| %348 = vector.reduction <add>, %347, %346 : vector<4xi32> into i32 | |
| %349 = vector.insert %348, %cst_0 [0] : i32 into vector<1xi32> | |
| %350 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %351 = vector.insert %350, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %352 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %353 = vector.insert %352, %351 [1] : vector<1xi8> into vector<4x1xi8> | |
| %354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %355 = vector.insert %354, %353 [2] : vector<1xi8> into vector<4x1xi8> | |
| %356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %357 = vector.insert %356, %355 [3] : vector<1xi8> into vector<4x1xi8> | |
| %358 = arith.extsi %357 : vector<4x1xi8> to vector<4x1xi32> | |
| %359 = vector.extract %358[0, 0] : vector<4x1xi32> | |
| %360 = vector.insert %359, %cst_1 [0] : i32 into vector<4xi32> | |
| %361 = vector.extract %358[1, 0] : vector<4x1xi32> | |
| %362 = vector.insert %361, %360 [1] : i32 into vector<4xi32> | |
| %363 = vector.extract %358[2, 0] : vector<4x1xi32> | |
| %364 = vector.insert %363, %362 [2] : i32 into vector<4xi32> | |
| %365 = vector.extract %358[3, 0] : vector<4x1xi32> | |
| %366 = vector.insert %365, %364 [3] : i32 into vector<4xi32> | |
| %367 = vector.extract %arg7[3] : vector<4xi32> | |
| %368 = arith.muli %34, %366 : vector<4xi32> | |
| %369 = vector.reduction <add>, %368, %367 : vector<4xi32> into i32 | |
| %370 = vector.insert %369, %cst_0 [0] : i32 into vector<1xi32> | |
| %371 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %372 = vector.insert_strided_slice %76, %371 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %373 = vector.insert_strided_slice %97, %372 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %374 = vector.insert_strided_slice %118, %373 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %375 = vector.insert_strided_slice %139, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %376 = vector.insert_strided_slice %160, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %377 = vector.insert_strided_slice %181, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %378 = vector.insert_strided_slice %202, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %379 = vector.insert_strided_slice %223, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %380 = vector.insert_strided_slice %244, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %381 = vector.insert_strided_slice %265, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %382 = vector.insert_strided_slice %286, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %383 = vector.insert_strided_slice %307, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %384 = vector.insert_strided_slice %328, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %385 = vector.insert_strided_slice %349, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %386 = vector.insert_strided_slice %370, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %386, %382, %378, %374 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After CSE (cse) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| %119 = vector.extract %arg9[0] : vector<4xi32> | |
| %120 = arith.muli %32, %51 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract %arg9[1] : vector<4xi32> | |
| %124 = arith.muli %32, %72 : vector<4xi32> | |
| %125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32 | |
| %126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32> | |
| %127 = vector.extract %arg9[2] : vector<4xi32> | |
| %128 = arith.muli %32, %93 : vector<4xi32> | |
| %129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32 | |
| %130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32> | |
| %131 = vector.extract %arg9[3] : vector<4xi32> | |
| %132 = arith.muli %32, %114 : vector<4xi32> | |
| %133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32 | |
| %134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32> | |
| %135 = vector.extract %arg8[0] : vector<4xi32> | |
| %136 = arith.muli %33, %51 : vector<4xi32> | |
| %137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32 | |
| %138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32> | |
| %139 = vector.extract %arg8[1] : vector<4xi32> | |
| %140 = arith.muli %33, %72 : vector<4xi32> | |
| %141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32 | |
| %142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32> | |
| %143 = vector.extract %arg8[2] : vector<4xi32> | |
| %144 = arith.muli %33, %93 : vector<4xi32> | |
| %145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32 | |
| %146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32> | |
| %147 = vector.extract %arg8[3] : vector<4xi32> | |
| %148 = arith.muli %33, %114 : vector<4xi32> | |
| %149 = vector.reduction <add>, %148, %147 : vector<4xi32> into i32 | |
| %150 = vector.insert %149, %cst_0 [0] : i32 into vector<1xi32> | |
| %151 = vector.extract %arg7[0] : vector<4xi32> | |
| %152 = arith.muli %34, %51 : vector<4xi32> | |
| %153 = vector.reduction <add>, %152, %151 : vector<4xi32> into i32 | |
| %154 = vector.insert %153, %cst_0 [0] : i32 into vector<1xi32> | |
| %155 = vector.extract %arg7[1] : vector<4xi32> | |
| %156 = arith.muli %34, %72 : vector<4xi32> | |
| %157 = vector.reduction <add>, %156, %155 : vector<4xi32> into i32 | |
| %158 = vector.insert %157, %cst_0 [0] : i32 into vector<1xi32> | |
| %159 = vector.extract %arg7[2] : vector<4xi32> | |
| %160 = arith.muli %34, %93 : vector<4xi32> | |
| %161 = vector.reduction <add>, %160, %159 : vector<4xi32> into i32 | |
| %162 = vector.insert %161, %cst_0 [0] : i32 into vector<1xi32> | |
| %163 = vector.extract %arg7[3] : vector<4xi32> | |
| %164 = arith.muli %34, %114 : vector<4xi32> | |
| %165 = vector.reduction <add>, %164, %163 : vector<4xi32> into i32 | |
| %166 = vector.insert %165, %cst_0 [0] : i32 into vector<1xi32> | |
| %167 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %168 = vector.insert_strided_slice %76, %167 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %169 = vector.insert_strided_slice %97, %168 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %170 = vector.insert_strided_slice %118, %169 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %171 = vector.insert_strided_slice %122, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %172 = vector.insert_strided_slice %126, %171 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %173 = vector.insert_strided_slice %130, %172 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %174 = vector.insert_strided_slice %134, %173 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %175 = vector.insert_strided_slice %138, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %176 = vector.insert_strided_slice %142, %175 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %177 = vector.insert_strided_slice %146, %176 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %178 = vector.insert_strided_slice %150, %177 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %179 = vector.insert_strided_slice %154, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %180 = vector.insert_strided_slice %158, %179 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %181 = vector.insert_strided_slice %162, %180 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %182 = vector.insert_strided_slice %166, %181 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %182, %178, %174, %170 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
| // Matmul dispatch body: reads two 1024x1024xi8 bindings and writes a 1024x1024xi32 binding. | |
| // Work is split into 8x32 output tiles (two outer loops), 4x4 sub-tiles (two middle loops) and | |
| // an innermost reduction loop that consumes 4 elements of the shared dimension per iteration. | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| // Bindings 0 and 1 are the read-only i8 operands; binding 2 is the write-only i32 result. | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| // Loop start = workgroup id * tile size, loop step = workgroup count * tile size | |
| // (8 rows along y, 32 columns along x). | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| // 8 full rows of binding 0 starting at row %arg0. | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| // 8x32 destination tile, loaded from the write-only binding. It only serves as the tensor the | |
| // results are inserted into: every 4x4 slice of it is overwritten below and the accumulators | |
| // start from %cst_1 (zeros). | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| // 32 full columns of binding 1 starting at column %arg1. | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| // Row indices %arg2+1 .. %arg2+3 of the current 4-row group. | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| // Reduction loop. %arg10, %arg9, %arg8, %arg7 carry the running sums for rows 0, 1, 2, 3 of | |
| // the 4x4 sub-tile; all of them start at zero. | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| // Four rows of the first operand, 4 consecutive i8 values each, starting at column %arg6. | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| // Rows %arg6 .. %arg6+3 of the second operand, 4 consecutive i8 values each, starting at | |
| // column %arg4. | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| // Sign-extend the first-operand rows to i32. | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| // Gather element 0 of each second-operand row into a 4x1 column, sign-extend it and | |
| // flatten it into vector<4xi32> (%51). | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| // Sub-tile element (0, 0): %arg10[0] + sum(%31 * %51). | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| // Same gather for element 1 of each second-operand row (%72), then sub-tile element (0, 1). | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| // Same gather for element 2 of each second-operand row (%93), then sub-tile element (0, 2). | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| // Same gather for element 3 of each second-operand row (%114), then sub-tile element (0, 3). | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| // Rows 1 to 3 reuse the gathered columns %51, %72, %93, %114 with the first-operand rows | |
| // %32, %33, %34 and the accumulators %arg9, %arg8, %arg7 respectively. | |
| %119 = vector.extract %arg9[0] : vector<4xi32> | |
| %120 = arith.muli %32, %51 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract %arg9[1] : vector<4xi32> | |
| %124 = arith.muli %32, %72 : vector<4xi32> | |
| %125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32 | |
| %126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32> | |
| %127 = vector.extract %arg9[2] : vector<4xi32> | |
| %128 = arith.muli %32, %93 : vector<4xi32> | |
| %129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32 | |
| %130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32> | |
| %131 = vector.extract %arg9[3] : vector<4xi32> | |
| %132 = arith.muli %32, %114 : vector<4xi32> | |
| %133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32 | |
| %134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32> | |
| %135 = vector.extract %arg8[0] : vector<4xi32> | |
| %136 = arith.muli %33, %51 : vector<4xi32> | |
| %137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32 | |
| %138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32> | |
| %139 = vector.extract %arg8[1] : vector<4xi32> | |
| %140 = arith.muli %33, %72 : vector<4xi32> | |
| %141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32 | |
| %142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32> | |
| %143 = vector.extract %arg8[2] : vector<4xi32> | |
| %144 = arith.muli %33, %93 : vector<4xi32> | |
| %145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32 | |
| %146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32> | |
| %147 = vector.extract %arg8[3] : vector<4xi32> | |
| %148 = arith.muli %33, %114 : vector<4xi32> | |
| %149 = vector.reduction <add>, %148, %147 : vector<4xi32> into i32 | |
| %150 = vector.insert %149, %cst_0 [0] : i32 into vector<1xi32> | |
| %151 = vector.extract %arg7[0] : vector<4xi32> | |
| %152 = arith.muli %34, %51 : vector<4xi32> | |
| %153 = vector.reduction <add>, %152, %151 : vector<4xi32> into i32 | |
| %154 = vector.insert %153, %cst_0 [0] : i32 into vector<1xi32> | |
| %155 = vector.extract %arg7[1] : vector<4xi32> | |
| %156 = arith.muli %34, %72 : vector<4xi32> | |
| %157 = vector.reduction <add>, %156, %155 : vector<4xi32> into i32 | |
| %158 = vector.insert %157, %cst_0 [0] : i32 into vector<1xi32> | |
| %159 = vector.extract %arg7[2] : vector<4xi32> | |
| %160 = arith.muli %34, %93 : vector<4xi32> | |
| %161 = vector.reduction <add>, %160, %159 : vector<4xi32> into i32 | |
| %162 = vector.insert %161, %cst_0 [0] : i32 into vector<1xi32> | |
| %163 = vector.extract %arg7[3] : vector<4xi32> | |
| %164 = arith.muli %34, %114 : vector<4xi32> | |
| %165 = vector.reduction <add>, %164, %163 : vector<4xi32> into i32 | |
| %166 = vector.insert %165, %cst_0 [0] : i32 into vector<1xi32> | |
| // Pack the sixteen scalar sums back into one vector<4xi32> per sub-tile row | |
| // (%170 = row 0, %174 = row 1, %178 = row 2, %182 = row 3). | |
| %167 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %168 = vector.insert_strided_slice %76, %167 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %169 = vector.insert_strided_slice %97, %168 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %170 = vector.insert_strided_slice %118, %169 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %171 = vector.insert_strided_slice %122, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %172 = vector.insert_strided_slice %126, %171 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %173 = vector.insert_strided_slice %130, %172 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %174 = vector.insert_strided_slice %134, %173 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %175 = vector.insert_strided_slice %138, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %176 = vector.insert_strided_slice %142, %175 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %177 = vector.insert_strided_slice %146, %176 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %178 = vector.insert_strided_slice %150, %177 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %179 = vector.insert_strided_slice %154, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %180 = vector.insert_strided_slice %158, %179 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %181 = vector.insert_strided_slice %162, %180 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %182 = vector.insert_strided_slice %166, %181 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| // Yield order matches the iter_args order: row 3, row 2, row 1, row 0. | |
| scf.yield %182, %178, %174, %170 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| // Write result rows 0..3 (%15#3 down to %15#0) into the 4x4 slice, then put the slice back | |
| // into the 8x32 tile at [%arg2, %arg4]. | |
| %16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| // Store the finished 8x32 tile to the output binding at [%arg0, %arg1]. | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| %119 = vector.extract %arg9[0] : vector<4xi32> | |
| %120 = arith.muli %32, %51 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract %arg9[1] : vector<4xi32> | |
| %124 = arith.muli %32, %72 : vector<4xi32> | |
| %125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32 | |
| %126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32> | |
| %127 = vector.extract %arg9[2] : vector<4xi32> | |
| %128 = arith.muli %32, %93 : vector<4xi32> | |
| %129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32 | |
| %130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32> | |
| %131 = vector.extract %arg9[3] : vector<4xi32> | |
| %132 = arith.muli %32, %114 : vector<4xi32> | |
| %133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32 | |
| %134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32> | |
| %135 = vector.extract %arg8[0] : vector<4xi32> | |
| %136 = arith.muli %33, %51 : vector<4xi32> | |
| %137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32 | |
| %138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32> | |
| %139 = vector.extract %arg8[1] : vector<4xi32> | |
| %140 = arith.muli %33, %72 : vector<4xi32> | |
| %141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32 | |
| %142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32> | |
| %143 = vector.extract %arg8[2] : vector<4xi32> | |
| %144 = arith.muli %33, %93 : vector<4xi32> | |
| %145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32 | |
| %146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32> | |
| %147 = vector.extract %arg8[3] : vector<4xi32> | |
| %148 = arith.muli %33, %114 : vector<4xi32> | |
| %149 = vector.reduction <add>, %148, %147 : vector<4xi32> into i32 | |
| %150 = vector.insert %149, %cst_0 [0] : i32 into vector<1xi32> | |
| %151 = vector.extract %arg7[0] : vector<4xi32> | |
| %152 = arith.muli %34, %51 : vector<4xi32> | |
| %153 = vector.reduction <add>, %152, %151 : vector<4xi32> into i32 | |
| %154 = vector.insert %153, %cst_0 [0] : i32 into vector<1xi32> | |
| %155 = vector.extract %arg7[1] : vector<4xi32> | |
| %156 = arith.muli %34, %72 : vector<4xi32> | |
| %157 = vector.reduction <add>, %156, %155 : vector<4xi32> into i32 | |
| %158 = vector.insert %157, %cst_0 [0] : i32 into vector<1xi32> | |
| %159 = vector.extract %arg7[2] : vector<4xi32> | |
| %160 = arith.muli %34, %93 : vector<4xi32> | |
| %161 = vector.reduction <add>, %160, %159 : vector<4xi32> into i32 | |
| %162 = vector.insert %161, %cst_0 [0] : i32 into vector<1xi32> | |
| %163 = vector.extract %arg7[3] : vector<4xi32> | |
| %164 = arith.muli %34, %114 : vector<4xi32> | |
| %165 = vector.reduction <add>, %164, %163 : vector<4xi32> into i32 | |
| %166 = vector.insert %165, %cst_0 [0] : i32 into vector<1xi32> | |
| %167 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %168 = vector.insert_strided_slice %76, %167 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %169 = vector.insert_strided_slice %97, %168 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %170 = vector.insert_strided_slice %118, %169 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %171 = vector.insert_strided_slice %122, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %172 = vector.insert_strided_slice %126, %171 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %173 = vector.insert_strided_slice %130, %172 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %174 = vector.insert_strided_slice %134, %173 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %175 = vector.insert_strided_slice %138, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %176 = vector.insert_strided_slice %142, %175 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %177 = vector.insert_strided_slice %146, %176 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %178 = vector.insert_strided_slice %150, %177 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %179 = vector.insert_strided_slice %154, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %180 = vector.insert_strided_slice %158, %179 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %181 = vector.insert_strided_slice %162, %180 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| %182 = vector.insert_strided_slice %166, %181 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> | |
| scf.yield %182, %178, %174, %170 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32> | |
| } | |
| %16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32> | |
| %inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32> | |
| scf.yield %inserted_slice : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 0 : index} | |
| scf.yield %14 : tensor<8x32xi32> | |
| } {iree.spirv.distribute_dim = 1 : index} | |
| flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| } | |
| } | |
| return | |
| } | |
| } | |
| // -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
| module { | |
| func.func @_main_dispatch_0_matmul_1024x1024x1024() { | |
| %cst = arith.constant dense<0> : vector<4x1xi8> | |
| %cst_0 = arith.constant dense<0> : vector<1xi32> | |
| %cst_1 = arith.constant dense<0> : vector<4xi32> | |
| %c1 = arith.constant 1 : index | |
| %c2 = arith.constant 2 : index | |
| %c3 = arith.constant 3 : index | |
| %c0_i8 = arith.constant 0 : i8 | |
| %c32 = arith.constant 32 : index | |
| %c8 = arith.constant 8 : index | |
| %c4 = arith.constant 4 : index | |
| %c1024 = arith.constant 1024 : index | |
| %c0 = arith.constant 0 : index | |
| %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> | |
| %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> | |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index | |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index | |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index | |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index | |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
| scf.for %arg0 = %3 to %c1024 step %4 { | |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8> | |
| scf.for %arg1 = %5 to %c1024 step %6 { | |
| %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32> | |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8> | |
| %10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) { | |
| %11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
| %12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2) | |
| %13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2) | |
| %14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) { | |
| %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32> | |
| %15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) { | |
| %20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8> | |
| %24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6) | |
| %26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6) | |
| %28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6) | |
| %30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8> | |
| %31 = arith.extsi %20 : vector<4xi8> to vector<4xi32> | |
| %32 = arith.extsi %21 : vector<4xi8> to vector<4xi32> | |
| %33 = arith.extsi %22 : vector<4xi8> to vector<4xi32> | |
| %34 = arith.extsi %23 : vector<4xi8> to vector<4xi32> | |
| %35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8> | |
| %39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8> | |
| %41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8> | |
| %43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32> | |
| %44 = vector.extract %43[0, 0] : vector<4x1xi32> | |
| %45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32> | |
| %46 = vector.extract %43[1, 0] : vector<4x1xi32> | |
| %47 = vector.insert %46, %45 [1] : i32 into vector<4xi32> | |
| %48 = vector.extract %43[2, 0] : vector<4x1xi32> | |
| %49 = vector.insert %48, %47 [2] : i32 into vector<4xi32> | |
| %50 = vector.extract %43[3, 0] : vector<4x1xi32> | |
| %51 = vector.insert %50, %49 [3] : i32 into vector<4xi32> | |
| %52 = vector.extract %arg10[0] : vector<4xi32> | |
| %53 = arith.muli %31, %51 : vector<4xi32> | |
| %54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32 | |
| %55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32> | |
| %56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8> | |
| %60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8> | |
| %62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8> | |
| %64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32> | |
| %65 = vector.extract %64[0, 0] : vector<4x1xi32> | |
| %66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32> | |
| %67 = vector.extract %64[1, 0] : vector<4x1xi32> | |
| %68 = vector.insert %67, %66 [1] : i32 into vector<4xi32> | |
| %69 = vector.extract %64[2, 0] : vector<4x1xi32> | |
| %70 = vector.insert %69, %68 [2] : i32 into vector<4xi32> | |
| %71 = vector.extract %64[3, 0] : vector<4x1xi32> | |
| %72 = vector.insert %71, %70 [3] : i32 into vector<4xi32> | |
| %73 = vector.extract %arg10[1] : vector<4xi32> | |
| %74 = arith.muli %31, %72 : vector<4xi32> | |
| %75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32 | |
| %76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32> | |
| %77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8> | |
| %81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8> | |
| %83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8> | |
| %85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32> | |
| %86 = vector.extract %85[0, 0] : vector<4x1xi32> | |
| %87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32> | |
| %88 = vector.extract %85[1, 0] : vector<4x1xi32> | |
| %89 = vector.insert %88, %87 [1] : i32 into vector<4xi32> | |
| %90 = vector.extract %85[2, 0] : vector<4x1xi32> | |
| %91 = vector.insert %90, %89 [2] : i32 into vector<4xi32> | |
| %92 = vector.extract %85[3, 0] : vector<4x1xi32> | |
| %93 = vector.insert %92, %91 [3] : i32 into vector<4xi32> | |
| %94 = vector.extract %arg10[2] : vector<4xi32> | |
| %95 = arith.muli %31, %93 : vector<4xi32> | |
| %96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32 | |
| %97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32> | |
| %98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8> | |
| %100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8> | |
| %102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8> | |
| %104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8> | |
| %105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8> | |
| %106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32> | |
| %107 = vector.extract %106[0, 0] : vector<4x1xi32> | |
| %108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32> | |
| %109 = vector.extract %106[1, 0] : vector<4x1xi32> | |
| %110 = vector.insert %109, %108 [1] : i32 into vector<4xi32> | |
| %111 = vector.extract %106[2, 0] : vector<4x1xi32> | |
| %112 = vector.insert %111, %110 [2] : i32 into vector<4xi32> | |
| %113 = vector.extract %106[3, 0] : vector<4x1xi32> | |
| %114 = vector.insert %113, %112 [3] : i32 into vector<4xi32> | |
| %115 = vector.extract %arg10[3] : vector<4xi32> | |
| %116 = arith.muli %31, %114 : vector<4xi32> | |
| %117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32 | |
| %118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32> | |
| %119 = vector.extract %arg9[0] : vector<4xi32> | |
| %120 = arith.muli %32, %51 : vector<4xi32> | |
| %121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32 | |
| %122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32> | |
| %123 = vector.extract %arg9[1] : vector<4xi32> | |
| %124 = arith.muli %32, %72 : vector<4xi32> | |
| %125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32 | |
| %126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32> | |
| %127 = vector.extract %arg9[2] : vector<4xi32> | |
| %128 = arith.muli %32, %93 : vector<4xi32> | |
| %129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32 | |
| %130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32> | |
| %131 = vector.extract %arg9[3] : vector<4xi32> | |
| %132 = arith.muli %32, %114 : vector<4xi32> | |
| %133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32 | |
| %134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32> | |
| %135 = vector.extract %arg8[0] : vector<4xi32> | |
| %136 = arith.muli %33, %51 : vector<4xi32> | |
| %137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32 | |
| %138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32> | |
| %139 = vector.extract %arg8[1] : vector<4xi32> | |
| %140 = arith.muli %33, %72 : vector<4xi32> | |
| %141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32 | |
| %142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32> | |
| %143 = vector.extract %arg8[2] : vector<4xi32> | |
| %144 = arith.muli %33, %93 : vector<4xi32> | |
| %145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32 | |
| %146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32> | |
| %147 = vector.extract %arg8[3] : vector<4xi32> | |
| %148 = |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment