Question: Using Google Colab, write a CUDA program that multiplies two vectors (e.g., a, b) element-wise and stores the result in another vector (e.g., c). Each vector should have 2,000,000 elements; initialize every element of the first vector to 4 and of the second vector to 9. Print the last element of the product vector for verification (it should be 36).
This is the guideline code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>

#define N 2000000
#define MAX_ERR 1e-6

/* Abort with a readable message if any CUDA runtime call fails.
 * Kernel launches report errors asynchronously, so we also call
 * cudaGetLastError() right after the launch. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Element-wise product: out[i] = a[i] * b[i] for 0 <= i < n.
 * Grid-stride loop so the kernel is correct for ANY grid/block
 * configuration (including a single-thread debug launch) and for
 * n larger than one grid can cover in a single pass. */
__global__ void vector_mult(float *out, const float *a, const float *b, int n) {
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        out[i] = a[i] * b[i];
    }
}

int main(void) {
    float *a, *b, *out;          /* host buffers  */
    float *d_a, *d_b, *d_out;    /* device buffers */
    const size_t bytes = sizeof(float) * N;

    /* Allocate host memory */
    a   = (float *)malloc(bytes);
    b   = (float *)malloc(bytes);
    out = (float *)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    /* Initialize host arrays: a[i] = 4, b[i] = 9, so every product is 36 */
    for (int i = 0; i < N; i++) {
        a[i] = 4.0f;
        b[i] = 9.0f;
    }

    /* Allocate device memory */
    CUDA_CHECK(cudaMalloc((void **)&d_a,   bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b,   bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_out, bytes));

    /* Transfer input data from host to device */
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    /* Launch the kernel: 256 threads/block, ceil-div grid to cover N.
     * (The grid-stride loop would also tolerate a smaller grid.) */
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vector_mult<<<blocksPerGrid, threadsPerBlock>>>(d_out, d_a, d_b, N);
    CUDA_CHECK(cudaGetLastError());       /* catch bad launch config */

    /* Copy the result back; cudaMemcpy blocks until the kernel finishes */
    CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));

    /* Verification: every element must equal a[i]*b[i] (= 36) */
    for (int i = 0; i < N; i++) {
        assert(fabs(out[i] - a[i] * b[i]) < MAX_ERR);
    }
    printf("out[%d] = %f\n", N - 1, out[N - 1]);
    printf("PASSED\n");

    /* Deallocate device memory */
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));

    /* Deallocate host memory */
    free(a);
    free(b);
    free(out);
    return 0;
}
Step by Step Solution
There are 3 steps involved in the solution.
Get step-by-step solutions from verified subject matter experts
