@interestingLSY · Created August 15, 2025 08:08
Meaning of "elementStride" in TMA descriptor
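
This gist probes what the elementStrides argument of cuTensorMapEncodeTiled actually means. It fills a 16x16 shared-memory tile with -1 as a sentinel, issues one SM90 TMA 2D load described by a tensor map with box_size = {16, 16} and elem_stride = {1, 8}, and then prints the tile: every slot still holding -1 was never written by TMA, so the printout shows exactly which elements a strided box copy fetches and where it places them.
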
#include <cstdio>
#include <cassert>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cute/arch/copy_sm90_tma.hpp>
#include "utils.h" // provides CHECK_CUDA, transac_bar_t, fence_view_async_shared (not shown)

__global__ void test_kernel(const __grid_constant__ CUtensorMap tensor_map) {
    __shared__ int data[16][16];
    for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            data[i][j] = -1; // -1 marks elements that TMA has not touched
        }
    }

    __shared__ uint64_t bar_buf;
    transac_bar_t* bar = reinterpret_cast<transac_bar_t*>(&bar_buf);
    bar->init(1);
    fence_view_async_shared();

    // Issue a single 2D TMA load of the box at coordinate (0, 0) into shared memory.
    cute::SM90_TMA_LOAD_2D::copy(
        &tensor_map,
        (uint64_t*)bar,
        (uint64_t)cute::TMA::CacheHintSm90::EVICT_NORMAL,
        (void*)(data),
        0, 0
    );

    // Crude synchronization: sleep for 1 ms instead of waiting on the mbarrier,
    // which is long enough for the copy to land before we print.
    __nanosleep(1000 * 1000);

    for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j)
            printf("%4d ", data[i][j]);
        printf("\n");
    }
}
int main() {
    /*
    Build & run:
    nvcc csrc/test-tensormap-elemstride.cu -gencode arch=compute_90a,code=sm_90a -o /tmp/a -lcuda -Icsrc/cutlass/include -std=c++17 --run
    */
    // A 32x32 row-major int tensor in global memory, filled with 0..1023.
    int* h_data = new int[32 * 32];
    for (int i = 0; i < 32 * 32; ++i) {
        h_data[i] = i;
    }
    int* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, 32 * 32 * sizeof(int)));
    CHECK_CUDA(cudaMemcpy(d_data, h_data, 32 * 32 * sizeof(int), cudaMemcpyHostToDevice));

    CUtensorMap tensor_map{};
    constexpr uint32_t rank = 2;
    uint64_t size[rank] = {32, 32};                 // tensor extents, in elements
    uint64_t stride[rank - 1] = {32 * sizeof(int)}; // stride of dim 1, in bytes
    uint32_t box_size[rank] = {16, 16};             // box extents, in elements
    uint32_t elem_stride[rank] = {1, 8};            // traversal strides -- the subject of this experiment
    CUresult res = cuTensorMapEncodeTiled(
        &tensor_map,
        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
        rank,
        d_data,
        size,
        stride,
        box_size,
        elem_stride,
        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    );
    assert(res == CUDA_SUCCESS);

    test_kernel<<<1, 1>>>(tensor_map);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    return 0;
}
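
For comparison, below is a plain-CPU sketch of one plausible reading of elementStrides (the driver API documents it as a traversal stride). Everything in it is an assumption to be tested against the kernel's printout, not documented behavior: it assumes the box still spans box_size elements per dimension in global memory, that only every elem_stride-th element is fetched, and that fetched elements are packed densely at the front of the shared-memory tile. The names SIZE, BOX, and ES are illustrative only.

#include <cstdio>

// Hypothetical CPU model of the strided TMA box load above. Dimensions mirror
// the gist: 32x32 int tensor, 16x16 box, elem_stride = {1, 8}. Dim 0 is the
// contiguous (innermost) dimension, as in cuTensorMapEncodeTiled.
int main() {
    const int SIZE = 32, BOX = 16;
    const int ES[2] = {1, 8};              // elem_stride = {1, 8}

    int g[SIZE][SIZE];                     // g[dim1][dim0], row-major
    for (int i = 0; i < SIZE; ++i)
        for (int j = 0; j < SIZE; ++j)
            g[i][j] = i * SIZE + j;        // same fill as h_data

    int smem[BOX][BOX];
    for (int i = 0; i < BOX; ++i)
        for (int j = 0; j < BOX; ++j)
            smem[i][j] = -1;               // -1 = untouched, as in the kernel

    // Assumed model: the box spans BOX elements per dimension in global memory,
    // every ES[k]-th element is fetched, and fetched elements are packed
    // densely from the start of the shared-memory tile.
    for (int i = 0; i * ES[1] < BOX; ++i)      // dim 1: rows 0 and 8
        for (int j = 0; j * ES[0] < BOX; ++j)  // dim 0: all 16 columns
            smem[i][j] = g[i * ES[1]][j * ES[0]];

    for (int i = 0; i < BOX; ++i) {
        for (int j = 0; j < BOX; ++j)
            printf("%4d ", smem[i][j]);
        printf("\n");
    }
    return 0;
}

If the kernel's output differs, e.g. the fetched rows sit 8 rows apart in shared memory instead of being packed together, then the dense-packing assumption is the part of this model that is wrong.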