Skip to content

Instantly share code, notes, and snippets.

@yati-sagade
Created January 9, 2025 13:55
Show Gist options
  • Select an option

  • Save yati-sagade/b43ce57c891aff839bf5d3c4ab86c353 to your computer and use it in GitHub Desktop.

Select an option

Save yati-sagade/b43ce57c891aff839bf5d3c4ab86c353 to your computer and use it in GitHub Desktop.
cudamult
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>
/*
 * mult_each: each thread computes one element of C = A * B, where A, B and C
 * are n x n float matrices stored contiguously in row-major order.
 *
 * Launch layout: 1-D grid of 1-D blocks with at least n*n threads in total.
 * Thread `idx` produces C[idx], i.e. row idx / n, column idx % n. Surplus
 * threads (idx >= n*n) do nothing. No shared memory is used.
 *
 * Parameters:
 *   A, B  input matrices (n*n floats each), read only.
 *   C     output matrix (n*n floats), fully overwritten.
 *   n     matrix dimension; n <= 0 is treated as "no work".
 *
 * Debugging: compile with -DMULT_EACH_TRACE to re-enable the per-thread
 * printf trace. It is off by default because it prints O(n^3) lines through
 * the serialized device printf path.
 */
__global__ void mult_each(float *A, float *B, float *C, int n) {
    // Flat index and element count are computed in 64 bits so that neither
    // blockDim.x * blockIdx.x nor n * n can wrap for large matrices.
    const size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
#ifdef MULT_EACH_TRACE
    printf("Thread idx %d\n", (int)idx);
#endif
    if (n <= 0) {
        return;
    }
    const size_t dim = (size_t)n;
    if (idx >= dim * dim) {
        return;
    }

    // Integer division/modulo: the previous floor(idx / (float)n) lost
    // precision once idx exceeded float's 24-bit mantissa, yielding a wrong row.
    const size_t i = idx / dim;
    const size_t j = idx % dim;

    // Accumulate in a local variable and store once, instead of doing a
    // read-modify-write on C[idx] in global memory every iteration.
    float acc = 0.0f;
    for (int k = 0; k < n; ++k) {
        const float a = A[i * dim + (size_t)k];
        const float b = B[(size_t)k * dim + j];
#ifdef MULT_EACH_TRACE
        printf("T%d: i=%d, j=%d, k=%d, a=%f, b=%f\n",
               (int)idx, (int)i, (int)j, k, a, b);
#endif
        acc += a * b;
    }
    C[idx] = acc;
#ifdef MULT_EACH_TRACE
    printf("Thread idx %d completes with %f\n", (int)idx, acc);
#endif
}
/* Prints "<what> <error code>" and exits with status 1 if a CUDA call failed. */
static void mult_elementwise_check(cudaError_t e, const char *what) {
    if (e != cudaSuccess) {
        printf("%s %d\n", what, (int)e);
        exit(1);
    }
}

/*
 * mult_elementwise: computes h_C = h_A * h_B on the GPU for n x n row-major
 * float matrices held in host memory.
 *
 * Parameters:
 *   h_A, h_B  host input matrices, n*n floats each.
 *   h_C       host output buffer, n*n floats; overwritten with the product.
 *   n         matrix dimension. For n <= 0 the function returns immediately
 *             without touching h_C (a zero-block launch is not valid).
 *
 * Any CUDA failure prints a message with the numeric error code and
 * terminates the process with exit status 1.
 */
void mult_elementwise(float *h_A, float *h_B, float *h_C, int n) {
    if (n <= 0) {
        return;
    }

    // Element and byte counts in size_t: n * n * sizeof(float) does not fit
    // in an int for moderately large n.
    const size_t count = (size_t)n * (size_t)n;
    const size_t sz = count * sizeof(float);
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    mult_elementwise_check(cudaMalloc((void**)&d_A, sz),
                           "Error allocating d_A!");
    mult_elementwise_check(cudaMemcpy(d_A, h_A, sz, cudaMemcpyHostToDevice),
                           "Error copying h_A to device!");
    mult_elementwise_check(cudaMalloc((void**)&d_B, sz),
                           "Error allocating d_B!");
    mult_elementwise_check(cudaMemcpy(d_B, h_B, sz, cudaMemcpyHostToDevice),
                           "Error copying h_B to device!");
    mult_elementwise_check(cudaMalloc((void**)&d_C, sz),
                           "Error allocating d_C!");

    // One thread per output element; integer ceil-div so the last partial
    // block is still launched.
    const unsigned int threads = 128;
    const unsigned int blocks =
        (unsigned int)((count + threads - 1) / threads);
    mult_each<<<blocks, threads>>>(d_A, d_B, d_C, n);
    // A launch does not return a status itself; a bad configuration is only
    // visible through cudaGetLastError().
    mult_elementwise_check(cudaGetLastError(), "Error launching mult_each!");

    // Blocking copy: waits for the kernel to finish before reading d_C.
    mult_elementwise_check(cudaMemcpy(h_C, d_C, sz, cudaMemcpyDeviceToHost),
                           "Error copying back d_C!");

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
/*
 * Demo driver: hands two fixed 2x2 row-major matrices to mult_elementwise
 * and prints the resulting matrix one row per line, followed by a blank line.
 */
int main(int argc, char *argv[]) {
    enum { DIM = 2, CELLS = DIM * DIM };

    float lhs[CELLS] = {1.0f, 2.0f, 3.0f, 4.0f};
    float rhs[CELLS] = {5.0f, 7.0f, 8.0f, 9.0f};
    float product[CELLS] = {0.0f, 0.0f, 0.0f, 0.0f};

    mult_elementwise(lhs, rhs, product, DIM);

    // Walk the result as a flat array and break the line after each row.
    for (int cell = 0; cell < CELLS; ++cell) {
        printf(" %f", product[cell]);
        if (cell % DIM == DIM - 1) {
            printf("\n");
        }
    }
    printf("\n");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment