getianao · August 14, 2023 14:45
diff --git a/test_gpu_l2.cu b/test_gpu_l2.cu


 #define BLOCK_SIZE 1024
 #define STRIDE 16

 // Strided-read probe: each participating thread copies a single float from A
 // to B. Thread i reads A[i * STRIDE] and writes B[i], so consecutive threads
 // read addresses that are STRIDE * sizeof(float) bytes apart while the writes
 // are contiguous. Threads whose source index would reach BLOCK_SIZE or beyond
 // do nothing. The global index is built from the x dimension only.
 __global__ void kernel(float *A, float *B) {
   const int tid = blockIdx.x * blockDim.x + threadIdx.x;
   const int src = tid * STRIDE;  // element offset of this thread's read

   // Guard clause: skip threads that would read past the first BLOCK_SIZE floats.
   if (src >= BLOCK_SIZE) return;

   B[tid] = A[src];
 }

 // Driver for the strided-read probe: allocates two device buffers of
 // BLOCK_SIZE floats, launches `kernel` once as a single block of BLOCK_SIZE
 // threads, waits for completion, and releases the buffers.
 //
 // A is never written before the launch and B is never copied back; only the
 // memory access pattern of the kernel is exercised.
 //
 // Returns 0 when every CUDA call succeeded and 1 otherwise (the exit status is
 // the only failure signal; nothing is printed).
 int main() {
   float *A = nullptr, *B = nullptr;

   cudaError_t err = cudaMalloc(&A, BLOCK_SIZE * sizeof(float));
   if (err == cudaSuccess)
     err = cudaMalloc(&B, BLOCK_SIZE * sizeof(float));

   if (err == cudaSuccess) {
     kernel<<<1, BLOCK_SIZE>>>(A, B);
     // A launch does not return a status; configuration errors are only
     // visible through cudaGetLastError().
     err = cudaGetLastError();
   }

   // Errors raised while the kernel executes are reported by the sync.
   if (err == cudaSuccess)
     err = cudaDeviceSynchronize();

   // Release on every path; cudaFree on a null pointer performs no operation,
   // so this is safe even if an allocation failed.
   cudaFree(A);
   cudaFree(B);

   return err == cudaSuccess ? 0 : 1;
 }

 // Build: nvcc test_gpu_l2.cu -O3 -o test_gpu_l2
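 // BLOCK_SIZE = 1024 floats, STRIDE = 16: 1024/16 = 64 active threads, one 32-byte sector each (64 sectors); 64 threads / 32 per warp = 2 requests; 64 sectors * 32 bytes = 2048 bytes (from L1)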

 // https://forums.developer.nvidia.com/t/pascal-l1-cache/49571/20?u=cache
 // https://stackoverflow.com/questions/63497910/cache-behaviour-in-compute-capability-7-5


	#define BLOCK_SIZE 1024
	#define STRIDE 16

	// Strided-read probe: each participating thread copies a single float from
	// A to B. Thread i reads A[i * STRIDE] and writes B[i], so consecutive
	// threads read addresses that are STRIDE * sizeof(float) bytes apart while
	// the writes are contiguous. Threads whose source index would reach
	// BLOCK_SIZE or beyond do nothing. The global index uses the x dimension
	// only.
	//
	// A and B must be pointers: both are indexed below.
	__global__ void kernel(float *A, float *B) {
	  int idx = blockIdx.x * blockDim.x + threadIdx.x;
	  if (idx * STRIDE < BLOCK_SIZE)
	    B[idx] = A[idx * STRIDE];
	}

	// Driver: allocates two device buffers of BLOCK_SIZE floats, launches
	// `kernel` once as a single block of BLOCK_SIZE threads, waits for it to
	// finish, and releases the buffers.
	//
	// A is never written before the launch and B is never copied back; only the
	// memory access pattern of the kernel is exercised.
	//
	// Returns 0 when every CUDA call succeeded and 1 otherwise (the exit status
	// is the only failure signal; nothing is printed).
	int main() {
	  // Pointers, not plain floats: cudaMalloc stores a device address in each.
	  float *A = nullptr, *B = nullptr;

	  cudaError_t err = cudaMalloc(&A, BLOCK_SIZE * sizeof(float));
	  if (err == cudaSuccess)
	    err = cudaMalloc(&B, BLOCK_SIZE * sizeof(float));

	  if (err == cudaSuccess) {
	    kernel<<<1, BLOCK_SIZE>>>(A, B);
	    // A launch does not return a status; configuration errors are only
	    // visible through cudaGetLastError().
	    err = cudaGetLastError();
	  }

	  // Errors raised while the kernel executes are reported by the sync.
	  if (err == cudaSuccess)
	    err = cudaDeviceSynchronize();

	  // Release on every path; cudaFree on a null pointer performs no
	  // operation, so this is safe even if an allocation failed.
	  cudaFree(A);
	  cudaFree(B);

	  return err == cudaSuccess ? 0 : 1;
	}

	// Build: nvcc test_gpu_l2.cu -O3 -o test_gpu_l2
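	// BLOCK_SIZE = 1024 floats, STRIDE = 16: 1024/16 = 64 active threads, one 32-byte sector each (64 sectors); 64 threads / 32 per warp = 2 requests; 64 sectors * 32 bytes = 2048 bytes (from L1)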

	// https://forums.developer.nvidia.com/t/pascal-l1-cache/49571/20?u=cache
	// https://stackoverflow.com/questions/63497910/cache-behaviour-in-compute-capability-7-5
No results found