yati-sagade · January 9, 2025 13:55
diff --git a/mult.cu b/mult.cu
 #include <cuda_runtime.h>
 #include <stdio.h>
 #include <math.h>

 /*
  * Tracing kernel: each in-range thread computes one element of C = A * B for
  * square n x n matrices stored row-major in flat arrays.
  *
  * Launch layout: 1-D grid of 1-D blocks providing at least n*n threads in
  * total. Threads whose flat index is >= n*n only print their index and do no
  * other work. No shared memory is used.
  *
  * A, B : device pointers to n*n floats (only read here).
  * C    : device pointer to n*n floats; every element is overwritten.
  * n    : matrix dimension. n*n must fit in an int because the flat index and
  *        the bounds check use int arithmetic.
  *
  * The device-side printf calls are trace output for debugging; device printf
  * is serialized and slow, so this kernel is only suitable for tiny inputs.
  */
 __global__ void mult_each(float *A, float *B, float *C, int n) {
         int idx = blockDim.x * blockIdx.x + threadIdx.x;
         printf("Thread idx %d\n", idx);
         if (idx < n * n) {
                 // Plain integer division gives the exact row. Going through
                 // float division + floor can yield the wrong row for large
                 // indices, since float carries only 24 bits of mantissa.
                 int i = idx / n;
                 int j = idx % n;
                 // Accumulate in a local and store once, instead of doing a
                 // read-modify-write on global C[idx] every iteration.
                 float acc = 0.0f;
                 for (int k = 0; k < n; ++k) {
                         // Load each operand once; reused by the trace and the math.
                         float a = A[i * n + k];
                         float b = B[k * n + j];
                         printf("T%d: i=%d, j=%d, k=%d, a=%f, b=%f\n",
                                         idx, i, j, k, a, b);
                         acc += a * b;
                 }
                 C[idx] = acc;
                 printf("Thread idx %d completes with %f\n", idx, acc);
         }
 }

 /* Print a diagnostic and terminate the process if a CUDA runtime call failed. */
 static void exit_on_cuda_error(cudaError_t e, const char *what) {
         if (e != cudaSuccess) {
                 fprintf(stderr, "%s %d (%s)\n", what, (int)e, cudaGetErrorString(e));
                 exit(1);
         }
 }

 /*
  * Host wrapper: computes h_C = h_A * h_B for square n x n row-major matrices
  * by copying the inputs to the device, launching mult_each with one thread
  * per output element, and copying the result back.
  *
  * h_A, h_B : host pointers to n*n floats (inputs).
  * h_C      : host pointer to n*n floats; receives the product.
  * n        : matrix dimension. For n <= 0 there is nothing to compute and the
  *            function returns without touching h_C.
  *
  * Any CUDA failure (allocation, copy, launch, execution, free) prints a
  * message to stderr and terminates the process with exit status 1.
  */
 void mult_elementwise(float *h_A, float *h_B, float *h_C, int n) {
         if (n <= 0) {
                 // A grid with zero blocks is an invalid launch configuration.
                 return;
         }

         float *d_A, *d_B, *d_C;
         // Do the size math in size_t so n*n*sizeof(float) cannot overflow int.
         size_t count = (size_t)n * (size_t)n;
         size_t sz = count * sizeof(float);

         exit_on_cuda_error(cudaMalloc((void**)&d_A, sz),
                         "Error allocating d_A!");
         exit_on_cuda_error(cudaMemcpy(d_A, h_A, sz, cudaMemcpyHostToDevice),
                         "Error copying h_A to device!");

         exit_on_cuda_error(cudaMalloc((void**)&d_B, sz),
                         "Error allocating d_B!");
         exit_on_cuda_error(cudaMemcpy(d_B, h_B, sz, cudaMemcpyHostToDevice),
                         "Error copying h_B to device!");

         exit_on_cuda_error(cudaMalloc((void**)&d_C, sz),
                         "Error allocating d_C!");

         // Integer ceil-division: enough 128-thread blocks to cover n*n elements.
         const unsigned int threads = 128;
         const unsigned int blocks = (unsigned int)((count + threads - 1) / threads);
         mult_each<<<blocks, threads>>>(d_A, d_B, d_C, n);
         // A launch does not return an error itself: configuration problems show
         // up in cudaGetLastError(), in-kernel faults at the next synchronization.
         exit_on_cuda_error(cudaGetLastError(),
                         "Error launching mult_each!");
         exit_on_cuda_error(cudaDeviceSynchronize(),
                         "Error executing mult_each!");

         exit_on_cuda_error(cudaMemcpy(h_C, d_C, sz, cudaMemcpyDeviceToHost),
                         "Error copying back d_C!");

         exit_on_cuda_error(cudaFree(d_A), "Error freeing d_A!");
         exit_on_cuda_error(cudaFree(d_B), "Error freeing d_B!");
         exit_on_cuda_error(cudaFree(d_C), "Error freeing d_C!");
 }


 /*
  * Demo entry point: multiplies two fixed 2x2 matrices through
  * mult_elementwise and prints the product one row per line, followed by a
  * blank line. Command-line arguments are not used.
  */
 int main(int argc, char *argv[]) {
         const int dim = 2;
         float lhs[] = { 1.0, 2.0, 3.0, 4.0 };
         float rhs[] = { 5.0, 7.0, 8.0, 9.0 };
         float product[] = { 0.0, 0.0, 0.0, 0.0 };

         mult_elementwise(lhs, rhs, product, dim);

         // Walk the flat result once and break the line after each row's last column.
         for (int pos = 0; pos < dim * dim; ++pos) {
                 printf(" %f", product[pos]);
                 if (pos % dim == dim - 1) {
                         printf("\n");
                 }
         }
         printf("\n");
         return 0;
 }
	#include <cuda_runtime.h>
	#include <stdio.h>
	#include <math.h>

	/*
	 * Tracing kernel: each in-range thread computes one element of C = A * B for
	 * square n x n matrices stored row-major in flat arrays.
	 *
	 * Launch layout: 1-D grid of 1-D blocks providing at least n*n threads in
	 * total. Threads whose flat index is >= n*n only print their index and do no
	 * other work. No shared memory is used.
	 *
	 * A, B : device pointers to n*n floats (only read here).
	 * C    : device pointer to n*n floats; every element is overwritten.
	 * n    : matrix dimension. n*n must fit in an int because the flat index and
	 *        the bounds check use int arithmetic.
	 *
	 * The device-side printf calls are trace output for debugging; device printf
	 * is serialized and slow, so this kernel is only suitable for tiny inputs.
	 */
	__global__ void mult_each(float *A, float *B, float *C, int n) {
		int idx = blockDim.x * blockIdx.x + threadIdx.x;
		printf("Thread idx %d\n", idx);
		if (idx < n * n) {
			// Plain integer division gives the exact row. Going through
			// float division + floor can yield the wrong row for large
			// indices, since float carries only 24 bits of mantissa.
			int i = idx / n;
			int j = idx % n;
			// Accumulate in a local and store once, instead of doing a
			// read-modify-write on global C[idx] every iteration.
			float acc = 0.0f;
			for (int k = 0; k < n; ++k) {
				// Load each operand once; reused by the trace and the math.
				float a = A[i * n + k];
				float b = B[k * n + j];
				printf("T%d: i=%d, j=%d, k=%d, a=%f, b=%f\n",
						idx, i, j, k, a, b);
				acc += a * b;
			}
			C[idx] = acc;
			printf("Thread idx %d completes with %f\n", idx, acc);
		}
	}

	/* Print a diagnostic and terminate the process if a CUDA runtime call failed. */
	static void exit_on_cuda_error(cudaError_t e, const char *what) {
		if (e != cudaSuccess) {
			fprintf(stderr, "%s %d (%s)\n", what, (int)e, cudaGetErrorString(e));
			exit(1);
		}
	}

	/*
	 * Host wrapper: computes h_C = h_A * h_B for square n x n row-major matrices
	 * by copying the inputs to the device, launching mult_each with one thread
	 * per output element, and copying the result back.
	 *
	 * h_A, h_B : host pointers to n*n floats (inputs).
	 * h_C      : host pointer to n*n floats; receives the product.
	 * n        : matrix dimension. For n <= 0 there is nothing to compute and the
	 *            function returns without touching h_C.
	 *
	 * Any CUDA failure (allocation, copy, launch, execution, free) prints a
	 * message to stderr and terminates the process with exit status 1.
	 */
	void mult_elementwise(float *h_A, float *h_B, float *h_C, int n) {
		if (n <= 0) {
			// A grid with zero blocks is an invalid launch configuration.
			return;
		}

		float *d_A, *d_B, *d_C;
		// Do the size math in size_t so n*n*sizeof(float) cannot overflow int.
		size_t count = (size_t)n * (size_t)n;
		size_t sz = count * sizeof(float);

		exit_on_cuda_error(cudaMalloc((void**)&d_A, sz),
				"Error allocating d_A!");
		exit_on_cuda_error(cudaMemcpy(d_A, h_A, sz, cudaMemcpyHostToDevice),
				"Error copying h_A to device!");

		exit_on_cuda_error(cudaMalloc((void**)&d_B, sz),
				"Error allocating d_B!");
		exit_on_cuda_error(cudaMemcpy(d_B, h_B, sz, cudaMemcpyHostToDevice),
				"Error copying h_B to device!");

		exit_on_cuda_error(cudaMalloc((void**)&d_C, sz),
				"Error allocating d_C!");

		// Integer ceil-division: enough 128-thread blocks to cover n*n elements.
		const unsigned int threads = 128;
		const unsigned int blocks = (unsigned int)((count + threads - 1) / threads);
		mult_each<<<blocks, threads>>>(d_A, d_B, d_C, n);
		// A launch does not return an error itself: configuration problems show
		// up in cudaGetLastError(), in-kernel faults at the next synchronization.
		exit_on_cuda_error(cudaGetLastError(),
				"Error launching mult_each!");
		exit_on_cuda_error(cudaDeviceSynchronize(),
				"Error executing mult_each!");

		exit_on_cuda_error(cudaMemcpy(h_C, d_C, sz, cudaMemcpyDeviceToHost),
				"Error copying back d_C!");

		exit_on_cuda_error(cudaFree(d_A), "Error freeing d_A!");
		exit_on_cuda_error(cudaFree(d_B), "Error freeing d_B!");
		exit_on_cuda_error(cudaFree(d_C), "Error freeing d_C!");
	}


	/*
	 * Demo entry point: multiplies two fixed 2x2 matrices through
	 * mult_elementwise and prints the product one row per line, followed by a
	 * blank line. Command-line arguments are not used.
	 */
	int main(int argc, char *argv[]) {
		const int dim = 2;
		float lhs[] = { 1.0, 2.0, 3.0, 4.0 };
		float rhs[] = { 5.0, 7.0, 8.0, 9.0 };
		float product[] = { 0.0, 0.0, 0.0, 0.0 };

		mult_elementwise(lhs, rhs, product, dim);

		// Walk the flat result once and break the line after each row's last column.
		for (int pos = 0; pos < dim * dim; ++pos) {
			printf(" %f", product[pos]);
			if (pos % dim == dim - 1) {
				printf("\n");
			}
		}
		printf("\n");
		return 0;
	}
No results found