@interestingLSY · Created August 15, 2025 08:08
Meaning of "elementStride" in TMA descriptor
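
This gist probes what the elementStrides argument of cuTensorMapEncodeTiled actually means. It fills a 16x16 shared-memory tile with -1 as a sentinel, issues one SM90 TMA 2D load described by a tensor map with box_size = {16, 16} and elem_stride = {1, 8}, and then prints the tile: every slot still holding -1 was never written by TMA, so the printout shows exactly which elements a strided box copy fetches and where it places them.
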
#include <cstdio>
#include <cassert>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cute/arch/copy_sm90_tma.hpp>
#include "utils.h" // provides CHECK_CUDA, transac_bar_t, fence_view_async_shared (not shown)

__global__ void test_kernel(const __grid_constant__ CUtensorMap tensor_map) {
    __shared__ int data[16][16];
    for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            data[i][j] = -1; // -1 marks elements that TMA has not touched
        }
    }

    __shared__ uint64_t bar_buf;
    transac_bar_t* bar = reinterpret_cast<transac_bar_t*>(&bar_buf);
    bar->init(1);
    fence_view_async_shared();

    // Issue a single 2D TMA load of the box at coordinate (0, 0) into shared memory.
    cute::SM90_TMA_LOAD_2D::copy(
        &tensor_map,
        (uint64_t*)bar,
        (uint64_t)cute::TMA::CacheHintSm90::EVICT_NORMAL,
        (void*)(data),
        0, 0
    );

    // Crude synchronization: sleep for 1 ms instead of waiting on the mbarrier,
    // which is long enough for the copy to land before we print.
    __nanosleep(1000 * 1000);

    for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j)
            printf("%4d ", data[i][j]);
        printf("\n");
    }
}
int main() {
    /*
    Build & run:
    nvcc csrc/test-tensormap-elemstride.cu -gencode arch=compute_90a,code=sm_90a -o /tmp/a -lcuda -Icsrc/cutlass/include -std=c++17 --run
    */
    // A 32x32 row-major int tensor in global memory, filled with 0..1023.
    int* h_data = new int[32 * 32];
    for (int i = 0; i < 32 * 32; ++i) {
        h_data[i] = i;
    }
    int* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, 32 * 32 * sizeof(int)));
    CHECK_CUDA(cudaMemcpy(d_data, h_data, 32 * 32 * sizeof(int), cudaMemcpyHostToDevice));

    CUtensorMap tensor_map{};
    constexpr uint32_t rank = 2;
    uint64_t size[rank] = {32, 32};                 // tensor extents, in elements
    uint64_t stride[rank - 1] = {32 * sizeof(int)}; // stride of dim 1, in bytes
    uint32_t box_size[rank] = {16, 16};             // box extents, in elements
    uint32_t elem_stride[rank] = {1, 8};            // traversal strides -- the subject of this experiment
    CUresult res = cuTensorMapEncodeTiled(
        &tensor_map,
        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
        rank,
        d_data,
        size,
        stride,
        box_size,
        elem_stride,
        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    );
    assert(res == CUDA_SUCCESS);

    test_kernel<<<1, 1>>>(tensor_map);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    return 0;
}
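
For comparison, below is a plain-CPU sketch of one plausible reading of elementStrides (the driver API documents it as a traversal stride). Everything in it is an assumption to be tested against the kernel's printout, not documented behavior: it assumes the box still spans box_size elements per dimension in global memory, that only every elem_stride-th element is fetched, and that fetched elements are packed densely at the front of the shared-memory tile. The names SIZE, BOX, and ES are illustrative only.

#include <cstdio>

// Hypothetical CPU model of the strided TMA box load above. Dimensions mirror
// the gist: 32x32 int tensor, 16x16 box, elem_stride = {1, 8}. Dim 0 is the
// contiguous (innermost) dimension, as in cuTensorMapEncodeTiled.
int main() {
    const int SIZE = 32, BOX = 16;
    const int ES[2] = {1, 8};              // elem_stride = {1, 8}

    int g[SIZE][SIZE];                     // g[dim1][dim0], row-major
    for (int i = 0; i < SIZE; ++i)
        for (int j = 0; j < SIZE; ++j)
            g[i][j] = i * SIZE + j;        // same fill as h_data

    int smem[BOX][BOX];
    for (int i = 0; i < BOX; ++i)
        for (int j = 0; j < BOX; ++j)
            smem[i][j] = -1;               // -1 = untouched, as in the kernel

    // Assumed model: the box spans BOX elements per dimension in global memory,
    // every ES[k]-th element is fetched, and fetched elements are packed
    // densely from the start of the shared-memory tile.
    for (int i = 0; i * ES[1] < BOX; ++i)      // dim 1: rows 0 and 8
        for (int j = 0; j * ES[0] < BOX; ++j)  // dim 0: all 16 columns
            smem[i][j] = g[i * ES[1]][j * ES[0]];

    for (int i = 0; i < BOX; ++i) {
        for (int j = 0; j < BOX; ++j)
            printf("%4d ", smem[i][j]);
        printf("\n");
    }
    return 0;
}

If the kernel's output differs, e.g. the fetched rows sit 8 rows apart in shared memory instead of being packed together, then the dense-packing assumption is the part of this model that is wrong.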