bjacob · January 22, 2026 23:50
diff --git a/a.diff b/a.diff
 diff --git a/tmp/log-good.mlir b/tmp/log-bad.mlir
 index 0408554..f00f377 100644
 --- a/tmp/log-good.mlir
 +++ b/tmp/log-bad.mlir
 @@ -400,7 +400,7 @@ module {
     %11 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [48, 768, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<48x768x16x1xf16>> -> tensor<48x768x16x1xf16>
     %12 = tensor.empty() : tensor<4x48x16x16xf32>
     %13 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%12 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 -    %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%13 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 +    %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%13 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
     %15 = tensor.empty() : tensor<64x768xf32>
     %unpack = linalg.unpack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1]>} : tensor<4x48x16x16xf32> -> tensor<64x768xf32>
     iree_tensor_ext.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [64, 768], strides = [1, 1] : tensor<64x768xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<64x768xf32>>
 @@ -667,7 +667,7 @@ module {
     %19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
     %20 = tensor.empty() : tensor<4x48x16x16xf32>
     %21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 -    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 +    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
     %23 = tensor.empty() : tensor<4x48x16x16xf16>
     %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%23 : tensor<4x48x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
 @@ -773,7 +773,7 @@ module {
     %15 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0], sizes = [192, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<192x16xf16>> -> tensor<192x16xf16>
     %16 = tensor.empty() : tensor<4x192x16x16xf32>
     %17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst_1 : f32) outs(%16 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
 -    %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x768x16x1xf16>, tensor<192x768x16x1xf16>) outs(%17 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
 +    %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 192, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x768x16x1xf16>, tensor<192x768x16x1xf16>) outs(%17 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
     %19 = tensor.empty() : tensor<4x192x16x16xf16>
     %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %15 : tensor<4x192x16x16xf32>, tensor<192x16xf16>) outs(%19 : tensor<4x192x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f32, %in_2: f16, %out: f16):
 @@ -830,7 +830,7 @@ module {
     %19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
     %20 = tensor.empty() : tensor<4x48x16x16xf32>
     %21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 -    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 +    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
     %23 = tensor.empty() : tensor<4x48x16x16xf16>
     %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%23 : tensor<4x48x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
 @@ -875,7 +875,7 @@ module {
     %15 = iree_tensor_ext.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
     %16 = tensor.empty() : tensor<4x48x16x16xf32>
     %17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%16 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 -    %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%12, %13 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%17 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
 +    %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%12, %13 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%17 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
     %19 = tensor.empty() : tensor<4x48x16x16xf16>
     %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %18, %14 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%19 : tensor<4x48x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
 @@ -1040,7 +1040,7 @@ module {
     %11 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [80, 1280, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<80x1280x16x1xf16>> -> tensor<80x1280x16x1xf16>
     %12 = tensor.empty() : tensor<4x80x16x16xf32>
     %13 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%12 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
 -    %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%13 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
 +    %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%13 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
     %15 = tensor.empty() : tensor<64x1280xf32>
     %unpack = linalg.unpack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1]>} : tensor<4x80x16x16xf32> -> tensor<64x1280xf32>
     iree_tensor_ext.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [64, 1280], strides = [1, 1] : tensor<64x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<64x1280xf32>>
 @@ -1307,7 +1307,7 @@ module {
     %19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 80, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x80x16x16xf16>> -> tensor<4x80x16x16xf16>
     %20 = tensor.empty() : tensor<4x80x16x16xf32>
     %21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
 -    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
 +    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
     %23 = tensor.empty() : tensor<4x80x16x16xf16>
     %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x80x16x16xf16>, tensor<4x80x16x16xf32>, tensor<80x16xf16>) outs(%23 : tensor<4x80x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
 @@ -1414,7 +1414,7 @@ module {
     %15 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0], sizes = [320, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x16xf16>> -> tensor<320x16xf16>
     %16 = tensor.empty() : tensor<4x320x16x16xf32>
     %17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst_2 : f32) outs(%16 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
 -    %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x1280x16x1xf16>, tensor<320x1280x16x1xf16>) outs(%17 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
 +    %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x1280x16x1xf16>, tensor<320x1280x16x1xf16>) outs(%17 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
     %19 = tensor.empty() : tensor<4x320x16x16xf16>
     %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %15 : tensor<4x320x16x16xf32>, tensor<320x16xf16>) outs(%19 : tensor<4x320x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f32, %in_3: f16, %out: f16):
 @@ -1470,7 +1470,7 @@ module {
     %19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 80, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x80x16x16xf16>> -> tensor<4x80x16x16xf16>
     %20 = tensor.empty() : tensor<4x80x16x16xf32>
     %21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
 -    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x5120x16x1xf16>, tensor<80x5120x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
 +    %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 20, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x5120x16x1xf16>, tensor<80x5120x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
     %23 = tensor.empty() : tensor<4x80x16x16xf16>
     %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x80x16x16xf16>, tensor<4x80x16x16xf32>, tensor<80x16xf16>) outs(%23 : tensor<4x80x16x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
     ^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
 @@ -1604,7 +1604,7 @@ module {
     %9 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [80, 1280, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<80x1280x16x1xf16>> -> tensor<80x1280x16x1xf16>
     %10 = tensor.empty() : tensor<1x80x1x16xf32>
     %11 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 1, 16]>} ins(%cst : f32) outs(%10 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
 -    %12 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 1, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%8, %9 : tensor<1x1280x1x1xf16>, tensor<80x1280x16x1xf16>) outs(%11 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
 +    %12 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 1, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%8, %9 : tensor<1x1280x1x1xf16>, tensor<80x1280x16x1xf16>) outs(%11 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
     %13 = tensor.empty() : tensor<1x80x1x16xf16>
     %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<1x80x1x16xf32>) outs(%13 : tensor<1x80x1x16xf16>) attrs =  {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 1, 16]>} {
     ^bb0(%in: f32, %out: f16):
	diff --git a/tmp/log-good.mlir b/tmp/log-bad.mlir
	index 0408554..f00f377 100644
	--- a/tmp/log-good.mlir
	+++ b/tmp/log-bad.mlir
	@@ -400,7 +400,7 @@ module {
	%11 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [48, 768, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<48x768x16x1xf16>> -> tensor<48x768x16x1xf16>
	%12 = tensor.empty() : tensor<4x48x16x16xf32>
	%13 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%12 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	- %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%13 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	+ %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%13 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	%15 = tensor.empty() : tensor<64x768xf32>
	%unpack = linalg.unpack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1]>} : tensor<4x48x16x16xf32> -> tensor<64x768xf32>
	iree_tensor_ext.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [64, 768], strides = [1, 1] : tensor<64x768xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<64x768xf32>>
	@@ -667,7 +667,7 @@ module {
	%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
	%20 = tensor.empty() : tensor<4x48x16x16xf32>
	%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x768x16x1xf16>, tensor<48x768x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	%23 = tensor.empty() : tensor<4x48x16x16xf16>
	%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%23 : tensor<4x48x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
	@@ -773,7 +773,7 @@ module {
	%15 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0], sizes = [192, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<192x16xf16>> -> tensor<192x16xf16>
	%16 = tensor.empty() : tensor<4x192x16x16xf32>
	%17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst_1 : f32) outs(%16 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
	- %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 4, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x768x16x1xf16>, tensor<192x768x16x1xf16>) outs(%17 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
	+ %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 192, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x768x16x1xf16>, tensor<192x768x16x1xf16>) outs(%17 : tensor<4x192x16x16xf32>) -> tensor<4x192x16x16xf32>
	%19 = tensor.empty() : tensor<4x192x16x16xf16>
	%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %15 : tensor<4x192x16x16xf32>, tensor<192x16xf16>) outs(%19 : tensor<4x192x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f32, %in_2: f16, %out: f16):
	@@ -830,7 +830,7 @@ module {
	%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
	%20 = tensor.empty() : tensor<4x48x16x16xf32>
	%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%21 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	%23 = tensor.empty() : tensor<4x48x16x16xf16>
	%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%23 : tensor<4x48x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
	@@ -875,7 +875,7 @@ module {
	%15 = iree_tensor_ext.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [4, 48, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x48x16x16xf16>> -> tensor<4x48x16x16xf16>
	%16 = tensor.empty() : tensor<4x48x16x16xf32>
	%17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%16 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	- %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%12, %13 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%17 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	+ %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 48, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%12, %13 : tensor<4x3072x16x1xf16>, tensor<48x3072x16x1xf16>) outs(%17 : tensor<4x48x16x16xf32>) -> tensor<4x48x16x16xf32>
	%19 = tensor.empty() : tensor<4x48x16x16xf16>
	%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %18, %14 : tensor<4x48x16x16xf16>, tensor<4x48x16x16xf32>, tensor<48x16xf16>) outs(%19 : tensor<4x48x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
	@@ -1040,7 +1040,7 @@ module {
	%11 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [80, 1280, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<80x1280x16x1xf16>> -> tensor<80x1280x16x1xf16>
	%12 = tensor.empty() : tensor<4x80x16x16xf32>
	%13 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%12 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	- %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%13 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	+ %14 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%10, %11 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%13 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	%15 = tensor.empty() : tensor<64x1280xf32>
	%unpack = linalg.unpack %14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1]>} : tensor<4x80x16x16xf32> -> tensor<64x1280xf32>
	iree_tensor_ext.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [64, 1280], strides = [1, 1] : tensor<64x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<64x1280xf32>>
	@@ -1307,7 +1307,7 @@ module {
	%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 80, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x80x16x16xf16>> -> tensor<4x80x16x16xf16>
	%20 = tensor.empty() : tensor<4x80x16x16xf32>
	%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x1280x16x1xf16>, tensor<80x1280x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	%23 = tensor.empty() : tensor<4x80x16x16xf16>
	%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x80x16x16xf16>, tensor<4x80x16x16xf32>, tensor<80x16xf16>) outs(%23 : tensor<4x80x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
	@@ -1414,7 +1414,7 @@ module {
	%15 = iree_tensor_ext.dispatch.tensor.load %11, offsets = [0, 0], sizes = [320, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<320x16xf16>> -> tensor<320x16xf16>
	%16 = tensor.empty() : tensor<4x320x16x16xf32>
	%17 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst_2 : f32) outs(%16 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
	- %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [2, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x1280x16x1xf16>, tensor<320x1280x16x1xf16>) outs(%17 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
	+ %18 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%13, %14 : tensor<4x1280x16x1xf16>, tensor<320x1280x16x1xf16>) outs(%17 : tensor<4x320x16x16xf32>) -> tensor<4x320x16x16xf32>
	%19 = tensor.empty() : tensor<4x320x16x16xf16>
	%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %15 : tensor<4x320x16x16xf32>, tensor<320x16xf16>) outs(%19 : tensor<4x320x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f32, %in_3: f16, %out: f16):
	@@ -1470,7 +1470,7 @@ module {
	%19 = iree_tensor_ext.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [4, 80, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x80x16x16xf16>> -> tensor<4x80x16x16xf16>
	%20 = tensor.empty() : tensor<4x80x16x16xf32>
	%21 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} ins(%cst : f32) outs(%20 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	- %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 1, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x5120x16x1xf16>, tensor<80x5120x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	+ %22 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [4, 20, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 16, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%16, %17 : tensor<4x5120x16x1xf16>, tensor<80x5120x16x1xf16>) outs(%21 : tensor<4x80x16x16xf32>) -> tensor<4x80x16x16xf32>
	%23 = tensor.empty() : tensor<4x80x16x16xf16>
	%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %22, %18 : tensor<4x80x16x16xf16>, tensor<4x80x16x16xf32>, tensor<80x16xf16>) outs(%23 : tensor<4x80x16x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 16, 16]>} {
	^bb0(%in: f16, %in_0: f32, %in_1: f16, %out: f16):
	@@ -1604,7 +1604,7 @@ module {
	%9 = iree_tensor_ext.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [80, 1280, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<80x1280x16x1xf16>> -> tensor<80x1280x16x1xf16>
	%10 = tensor.empty() : tensor<1x80x1x16xf32>
	%11 = linalg.fill {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 1, 16]>} ins(%cst : f32) outs(%10 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
	- %12 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 2, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 1, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%8, %9 : tensor<1x1280x1x1xf16>, tensor<80x1280x16x1xf16>) outs(%11 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
	+ %12 = linalg.mmt4d {lowering_config = #iree_cpu.lowering_config<distribution = [1, 80, 0, 0, 0, 0], vector_common_parallel = [1, 1, 0, 1, 16, 0], vector_reduction = [0, 0, 1, 0, 0, 1]>} ins(%8, %9 : tensor<1x1280x1x1xf16>, tensor<80x1280x16x1xf16>) outs(%11 : tensor<1x80x1x16xf32>) -> tensor<1x80x1x16xf32>
	%13 = tensor.empty() : tensor<1x80x1x16xf16>
	%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<1x80x1x16xf32>) outs(%13 : tensor<1x80x1x16xf16>) attrs = {lowering_config = #iree_cpu.lowering_config<vector_common_parallel = [1, 1, 1, 16]>} {
	^bb0(%in: f32, %out: f16):
No results found