Created
August 15, 2025 08:08
-
-
Save interestingLSY/35c26212a9f12beec8ef8aa9db2a3dc4 to your computer and use it in GitHub Desktop.
Meaning of "elementStride" in TMA descriptor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <cstdio> | |
| #include <cuda_runtime.h> | |
| #include <cuda.h> | |
| #include <cute/arch/copy_sm90_tma.hpp> | |
| #include "utils.h" | |
| __global__ void test_kernel(const __grid_constant__ CUtensorMap tensor_map) { | |
| __shared__ int data[16][16]; | |
| for (int i = 0; i < 16; ++i) { | |
| for (int j = 0; j < 16; ++j) { | |
| data[i][j] = -1; // -1 means that TMA has not touched this element | |
| } | |
| } | |
| __shared__ uint64_t bar_buf; | |
| transac_bar_t* bar = reinterpret_cast<transac_bar_t*>(&bar_buf); | |
| bar->init(1); | |
| fence_view_async_shared(); | |
| cute::SM90_TMA_LOAD_2D::copy( | |
| &tensor_map, | |
| (uint64_t*)bar, | |
| (uint64_t)cute::TMA::CacheHintSm90::EVICT_NORMAL, | |
| (void*)(data), | |
| 0, 0 | |
| ); | |
| __nanosleep(1000 * 1000); | |
| for (int i = 0; i < 16; ++i) { | |
| for (int j = 0; j < 16; ++j) | |
| printf("%4d ", data[i][j]); | |
| printf("\n"); | |
| } | |
| } | |
| int main() { | |
| /* | |
| nvcc csrc/test-tensormap-elemstride.cu -gencode arch=compute_90a,code=sm_90a -o /tmp/a -lcuda -Icsrc/cutlass/include -std=c++17 --run | |
| */ | |
| int* h_data = new int[32*32]; | |
| for (int i = 0; i < 32 * 32; ++i) { | |
| h_data[i] = i; | |
| } | |
| int* d_data; | |
| CHECK_CUDA(cudaMalloc(&d_data, 32 * 32 * sizeof(int))); | |
| CHECK_CUDA(cudaMemcpy(d_data, h_data, 32 * 32 * sizeof(int), cudaMemcpyHostToDevice)); | |
| CUtensorMap tensor_map{}; | |
| constexpr uint32_t rank = 2; | |
| uint64_t size[rank] = {32, 32}; | |
| uint64_t stride[rank-1] = {32*sizeof(int)}; | |
| uint32_t box_size[rank] = {16, 16}; | |
| uint32_t elem_stride[rank] = {1, 8}; | |
| CUresult res = cuTensorMapEncodeTiled( | |
| &tensor_map, | |
| CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32, | |
| rank, | |
| d_data, | |
| size, | |
| stride, | |
| box_size, | |
| elem_stride, | |
| CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE, | |
| CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE, | |
| CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE, | |
| CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE | |
| ); | |
| assert(res == CUDA_SUCCESS); | |
| test_kernel<<<1, 1>>>(tensor_map); | |
| CHECK_CUDA(cudaGetLastError()); | |
| CHECK_CUDA(cudaDeviceSynchronize()); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment