Question: Modify the reduction algorithm of kernel.cu so that the maximum partial sum is achieved on the GPU. (kernel.cu below) #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <stdlib.h>

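Modify the reduction algorithm in kernel.cu so that as much of the partial-sum accumulation as possible is performed on the GPU, i.e. the number of sums accumulated on the host is minimized. (kernel.cu is listed below.)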

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

// Threads per block used for the reduction launch. Each block stages
// 2 * BLOCK_SIZE input elements in shared memory, and the kernel's halving
// loop only sums every element when the block size is a power of two.
#define BLOCK_SIZE 512

/**
 * Per-block sum reduction.
 *
 * Each block stages up to 2 * blockDim.x consecutive elements of `input`,
 * beginning at global index 2 * blockIdx.x * blockDim.x, in shared memory.
 * Positions at or beyond `len` are padded with zero. The staged values are
 * folded into a single sum that thread 0 stores in output[blockIdx.x].
 *
 * Launch requirements:
 *   - only the x dimension of the grid and block is used for indexing;
 *   - blockDim.x must be a power of two, because the stride is halved on each
 *     step and an odd stride would skip elements;
 *   - blockDim.x must not exceed BLOCK_SIZE, because the shared buffer holds
 *     exactly 2 * BLOCK_SIZE floats;
 *   - `output` must have room for one float per block.
 *
 * @param input  device array holding at least `len` elements
 * @param output device array receiving one partial sum per block
 * @param len    number of valid elements in `input`; values <= 0 are treated
 *               as an empty input
 */
__global__ void reduction(float *input, float *output, int len) {
    __shared__ float partialSum[2 * BLOCK_SIZE];

    const unsigned int t = threadIdx.x;
    const unsigned int start = 2 * blockIdx.x * blockDim.x;
    const unsigned int count = (len > 0) ? (unsigned int)len : 0u;

    // The guards must test the GLOBAL element index. Testing the block-local
    // index lets every block after the first read past the end of `input`
    // whenever len is not a multiple of 2 * blockDim.x.
    const unsigned int first = start + t;
    const unsigned int second = first + blockDim.x;
    partialSum[t] = (first < count) ? input[first] : 0.0f;
    partialSum[blockDim.x + t] = (second < count) ? input[second] : 0.0f;

    // Tree reduction: each step adds the upper half of the live range onto
    // the lower half. The barrier sits outside the `if` so that every thread
    // of the block reaches it, and it orders the previous step's writes (or
    // the initial loads) before this step's reads.
    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
        __syncthreads();
        if (t < stride) {
            partialSum[t] += partialSum[t + stride];
        }
    }

    // Thread 0 performed the final addition itself, so no extra barrier is
    // needed before it publishes the block's result.
    if (t == 0) {
        output[blockIdx.x] = partialSum[0];
    }
}

/**
 * Allocates a host vector and fills it with pseudo-random values.
 *
 * Every element is (rand() % 100) / 100.00, i.e. one of 0.00, 0.01, ..., 0.99.
 * The caller owns the buffer and releases it with free().
 *
 * @param vec_h receives the address of the new buffer; set to NULL when
 *              `size` is not positive
 * @param size  number of floats to allocate and initialise
 *
 * Terminates the process with EXIT_FAILURE when the allocation fails, rather
 * than continuing and writing through a NULL pointer.
 */
void initVector(float **vec_h, int size) {
    // A non-positive size has nothing to fill; a negative one would otherwise
    // be converted to a huge unsigned allocation request and loop bound.
    if (size <= 0) {
        *vec_h = NULL;
        return;
    }

    float *data = (float*)malloc((size_t)size * sizeof(float));
    if (data == NULL) {
        printf("Unable to allocate host");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < size; i++) {
        // The division stays in double precision and is narrowed on the store.
        data[i] = (rand() % 100) / 100.00;
    }

    *vec_h = data;
}

/**
 * Checks a reduction result against a sequential host-side sum.
 *
 * Sums `input` on the CPU in single precision and compares it with `result`
 * using a relative tolerance of 2e-5. Prints the verdict together with both
 * values. On a mismatch the process terminates with EXIT_FAILURE so that
 * callers and scripts can detect the failure from the exit status.
 *
 * @param input        host array to sum
 * @param num_elements number of elements in `input`
 * @param result       value to be verified against the host sum
 */
void verify(float* input, unsigned num_elements, float result) {
    const float relativeTolerance = 2e-5f;

    float sum = 0.0f;
    for (unsigned i = 0; i < num_elements; ++i) {
        sum += input[i];
    }

    // A zero reference sum makes the relative error undefined (division by
    // zero), so the plain difference is used in that case.
    const float error = (sum != 0.0f) ? (sum - result) / sum : (sum - result);

    // Written as a negated "within tolerance" test so that a NaN error, for
    // which every comparison is false, counts as a failure.
    const bool withinTolerance =
        (error <= relativeTolerance) && (error >= -relativeTolerance);

    if (!withinTolerance) {
        printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
        exit(EXIT_FAILURE);
    }
    printf("TEST PASSED, cpu = %0.3f, gpu = %0.3f ", sum, result);
}

/* Terminates with a diagnostic when a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Driver: builds a random input vector, reduces it on the GPU to one partial
 * sum per block, adds the partial sums on the host and verifies the total.
 *
 * Usage: program [num_elements]
 *   With no argument 1000000 elements are used. The optional argument must be
 *   a positive integer.
 *
 * @return 0 on success; EXIT_FAILURE on bad arguments or allocation failure
 */
int main(int argc, char ** argv) {
    unsigned i;
    float *hostInput = NULL;
    float *hostOutput = NULL;
    float *deviceInput = NULL;
    float *deviceOutput = NULL;
    unsigned numInputElements;
    unsigned numOutputElements;
    dim3 dimGrid, dimBlock;

    if (argc == 1) {
        numInputElements = 1000000;
    } else if (argc == 2) {
        // Zero or negative counts are rejected: they would produce a
        // zero-block launch and an out-of-range read of hostOutput[0].
        const int requested = atoi(argv[1]);
        if (requested <= 0) {
            fprintf(stderr, "Number of elements must be a positive integer\n");
            return EXIT_FAILURE;
        }
        numInputElements = (unsigned)requested;
    } else {
        // Without this branch the element count would stay uninitialised.
        fprintf(stderr, "Usage: %s [num_elements]\n", argv[0]);
        return EXIT_FAILURE;
    }

    initVector(&hostInput, numInputElements);
    if (hostInput == NULL) {
        fprintf(stderr, "Unable to allocate host input\n");
        return EXIT_FAILURE;
    }

    // One output element per block; each block consumes 2 * BLOCK_SIZE inputs.
    numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    }

    hostOutput = (float*)malloc(numOutputElements * sizeof(float));
    if (hostOutput == NULL) {
        fprintf(stderr, "Unable to allocate host output\n");
        free(hostInput);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&deviceInput, numInputElements * sizeof(float)),
              "cudaMalloc(deviceInput)");
    checkCuda(cudaMalloc((void**)&deviceOutput, numOutputElements * sizeof(float)),
              "cudaMalloc(deviceOutput)");
    checkCuda(cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float),
                         cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    dimBlock.x = BLOCK_SIZE;
    dimBlock.y = dimBlock.z = 1;
    dimGrid.x = numOutputElements;
    dimGrid.y = dimGrid.z = 1;

    reduction<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError, execution errors by the following synchronisation.
    checkCuda(cudaGetLastError(), "reduction kernel launch");
    checkCuda(cudaDeviceSynchronize(), "reduction kernel execution");

    checkCuda(cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    // Fold the per-block partial sums into hostOutput[0], printing the running
    // total after each addition.
    int count = 0;
    for (i = 1; i < numOutputElements; i++) {
        hostOutput[0] += hostOutput[i];
        count++;
        printf("%.3f ", hostOutput[0]);
    }

    // `count` is incremented once per host-side addition, so it is reported
    // as-is; the previous `count - 1` under-reported by one and printed -1
    // when no host additions were needed.
    printf("Sums accumulated on host: %i ", count);

    verify(hostInput, numInputElements, hostOutput[0]);

    checkCuda(cudaFree(deviceInput), "cudaFree(deviceInput)");
    checkCuda(cudaFree(deviceOutput), "cudaFree(deviceOutput)");
    free(hostInput);
    free(hostOutput);

    return 0;
}

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!