Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created January 22, 2026 23:50
Show Gist options
  • Select an option

  • Save bjacob/6e6259f24ac05011ba9603c29d8ee18b to your computer and use it in GitHub Desktop.

Select an option

Save bjacob/6e6259f24ac05011ba9603c29d8ee18b to your computer and use it in GitHub Desktop.
diff --git a/tmp/log-good.mlir b/tmp/log-bad.mlir
index 0408554..f00f377 100644
--- a/tmp/log-good.mlir
+++ b/tmp/log-bad.mlir
@@ -400,7 +400,7 @@ module {
%11 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [48, 768, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<48x768x16x1xf16>> -> tensor<48x768x16x1xf16>
%12 = tensor.empty() : tensor<4x48x16x16xf32>
%13 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%12 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
- %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%13 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
+ %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%13 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
%15 = tensor.empty() : tensor<64x768xf32>
%unpack = linalg.unpack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1]>} : tensor<4x48x16x16xf32> -> tensor<64x768xf32>
iree_tensor_ext.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [64, 768], strides = [1, 1] : tensor<64x768xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<64x768xf32>>
@@ -667,7 +667,7 @@ module {
%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
%20 = tensor.empty() : tensor<4x48x16x16xf32>
%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
%23 = tensor.empty() : tensor<4x48x16x16xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%23 : tensor<4x48x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
@@ -773,7 +773,7 @@ module {
%15 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0], sizes = [192, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<192x16xf16>> -> tensor<192x16xf16>
%16 = tensor.empty() : tensor<4x192x16x16xf32>
%17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst_1 : f32) outs(%16 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
- %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x768x16x1xf16>, tensor<192x768x16x1xf16>) outs(%17 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
+ %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 192, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x768x16x1xf16>, tensor<192x768x16x1xf16>) outs(%17 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
%19 = tensor.empty() : tensor<4x192x16x16xf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %15 : tensor<4x192x16x16xf32>, tensor<192x16xf16>) outs(%19 : tensor<4x192x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f32, %in_2: f16, %out: f16):
@@ -830,7 +830,7 @@ module {
%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
%20 = tensor.empty() : tensor<4x48x16x16xf32>
%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
%23 = tensor.empty() : tensor<4x48x16x16xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%23 : tensor<4x48x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
@@ -875,7 +875,7 @@ module {
%15 = iree_tensor_ext.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
%16 = tensor.empty() : tensor<4x48x16x16xf32>
%17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%16 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
- %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%12, %13 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%17 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
+ %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%12, %13 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%17 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
%19 = tensor.empty() : tensor<4x48x16x16xf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %18, %14 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%19 : tensor<4x48x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
@@ -1040,7 +1040,7 @@ module {
%11 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [80, 1280, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<80x1280x16x1xf16>> -> tensor<80x1280x16x1xf16>
%12 = tensor.empty() : tensor<4x80x16x16xf32>
%13 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%12 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
- %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%13 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
+ %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%13 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
%15 = tensor.empty() : tensor<64x1280xf32>
%unpack = linalg.unpack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1]>} : tensor<4x80x16x16xf32> -> tensor<64x1280xf32>
iree_tensor_ext.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [64, 1280], strides = [1, 1] : tensor<64x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<64x1280xf32>>
@@ -1307,7 +1307,7 @@ module {
%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 80, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x80x16x16xf16>> -> tensor<4x80x16x16xf16>
%20 = tensor.empty() : tensor<4x80x16x16xf32>
%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
%23 = tensor.empty() : tensor<4x80x16x16xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x80x16x16xf16>, tensor<4x80x16x16xf32>, tensor<80x16xf16>) outs(%23 : tensor<4x80x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
@@ -1414,7 +1414,7 @@ module {
%15 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0], sizes = [320, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x16xf16>> -> tensor<320x16xf16>
%16 = tensor.empty() : tensor<4x320x16x16xf32>
%17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst_2 : f32) outs(%16 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
- %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x1280x16x1xf16>, tensor<320x1280x16x1xf16>) outs(%17 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
+ %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x1280x16x1xf16>, tensor<320x1280x16x1xf16>) outs(%17 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
%19 = tensor.empty() : tensor<4x320x16x16xf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %15 : tensor<4x320x16x16xf32>, tensor<320x16xf16>) outs(%19 : tensor<4x320x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f32, %in_3: f16, %out: f16):
@@ -1470,7 +1470,7 @@ module {
%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 80, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x80x16x16xf16>> -> tensor<4x80x16x16xf16>
%20 = tensor.empty() : tensor<4x80x16x16xf32>
%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x5120x16x1xf16>, tensor<80x5120x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 20, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x5120x16x1xf16>, tensor<80x5120x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
%23 = tensor.empty() : tensor<4x80x16x16xf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x80x16x16xf16>, tensor<4x80x16x16xf32>, tensor<80x16xf16>) outs(%23 : tensor<4x80x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
@@ -1604,7 +1604,7 @@ module {
%9 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [80, 1280, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<80x1280x16x1xf16>> -> tensor<80x1280x16x1xf16>
%10 = tensor.empty() : tensor<1x80x1x16xf32>
%11 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 1, 16]>} ins(%cst : f32) outs(%10 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
- %12 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 1, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%8, %9 : tensor<1x1280x1x1xf16>, tensor<80x1280x16x1xf16>) outs(%11 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
+ %12 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 1, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%8, %9 : tensor<1x1280x1x1xf16>, tensor<80x1280x16x1xf16>) outs(%11 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
%13 = tensor.empty() : tensor<1x80x1x16xf16>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<1x80x1x16xf32>) outs(%13 : tensor<1x80x1x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 1, 16]>} {
^bb0(%in: f32, %out: f16):
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment