awni · October 9, 2025 19:19
diff --git a/tiled_matmul.py b/tiled_matmul.py
 import mlx.core as mx

 # Possible tile size for tensor cores
 TS = 32

 # Matrix dimension (M = N = K = D)
 D = 2048

 # Number of tiles along each matrix dimension. The reshape below only works
 # when TS divides D evenly.
 NUM_TILES = D // TS

 A = mx.random.uniform(shape=(D, D))
 B = mx.random.uniform(shape=(D, D))

 # Reshape and transpose so a tile is in the last two dimensions:
 #   (D, D) -> (NUM_TILES, TS, NUM_TILES, TS) -> (NUM_TILES, NUM_TILES, TS, TS)
 # After the swap, X_tiled[r, c] is the TS x TS tile at block-row r and
 # block-column c. The shape is derived from D rather than a literal so that
 # changing D keeps the reshape valid.
 A_tiled = A.reshape((NUM_TILES, TS, NUM_TILES, TS)).swapaxes(1, 2)
 B_tiled = B.reshape((NUM_TILES, TS, NUM_TILES, TS)).swapaxes(1, 2)

 # Each thread group computes one tile of the output:
 # tile (i, j) of C is the sum over k of A-tile (i, k) times B-tile (k, j).
 i = 1
 j = 1
 C_ij = sum(A_tiled[i, k] @ B_tiled[k, j] for k in range(NUM_TILES))


 # Reference result from a single full matmul
 C = A @ B

 # Get the `i, j` tile of the output
 C_ij_expected = C[i * TS:(i+1) * TS, j * TS: (j+1) * TS]

 # NOTE(review): this relies on allclose's default tolerances even though the
 # tiled and full paths accumulate in a different order - confirm they are
 # loose enough on the target backend.
 assert mx.allclose(C_ij, C_ij_expected)
	import mlx.core as mx

	# Possible tile size for tensor cores
	TS = 32

	# Matrix dimension (M = N = K = D)
	D = 2048

	# Number of tiles along each matrix dimension. The reshape below only works
	# when TS divides D evenly.
	NUM_TILES = D // TS

	A = mx.random.uniform(shape=(D, D))
	B = mx.random.uniform(shape=(D, D))

	# Reshape and transpose so a tile is in the last two dimensions:
	#   (D, D) -> (NUM_TILES, TS, NUM_TILES, TS) -> (NUM_TILES, NUM_TILES, TS, TS)
	# After the swap, X_tiled[r, c] is the TS x TS tile at block-row r and
	# block-column c. The shape is derived from D rather than a literal so that
	# changing D keeps the reshape valid.
	A_tiled = A.reshape((NUM_TILES, TS, NUM_TILES, TS)).swapaxes(1, 2)
	B_tiled = B.reshape((NUM_TILES, TS, NUM_TILES, TS)).swapaxes(1, 2)

	# Each thread group computes one tile of the output:
	# tile (i, j) of C is the sum over k of A-tile (i, k) times B-tile (k, j).
	i = 1
	j = 1
	C_ij = sum(A_tiled[i, k] @ B_tiled[k, j] for k in range(NUM_TILES))


	# Reference result from a single full matmul
	C = A @ B

	# Get the `i, j` tile of the output
	C_ij_expected = C[i * TS:(i+1) * TS, j * TS: (j+1) * TS]

	# NOTE(review): this relies on allclose's default tolerances even though the
	# tiled and full paths accumulate in a different order - confirm they are
	# loose enough on the target backend.
	assert mx.allclose(C_ij, C_ij_expected)
No results found