Question: Modify the reduction algorithm of kernel.cu so that the maximum partial sum is achieved on the GPU. (kernel.cu below) #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <stdlib.h>

Modify the reduction algorithm of kernel.cu so that the maximum partial sum is achieved on the GPU. (kernel.cu below)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 512

/*
 * Per-block sum reduction.
 *
 * Each block loads up to 2 * blockDim.x consecutive elements of `input`,
 * beginning at index 2 * blockIdx.x * blockDim.x, into shared memory, sums
 * them with a tree reduction, and writes the block's partial sum to
 * output[blockIdx.x].
 *
 * Parameters:
 *   input  - array of `len` floats to be summed
 *   output - array receiving one partial sum per block (one slot per blockIdx.x)
 *   len    - number of valid elements in `input`
 *
 * Launch preconditions:
 *   - blockDim.x must not exceed BLOCK_SIZE, because the shared array is
 *     statically sized to 2 * BLOCK_SIZE.
 *   - blockDim.x must be a power of two; the stride is halved each step, so
 *     any other size would leave some elements out of the sum.
 */
__global__ void reduction(float *input, float *output, int len) {
    __shared__ float partialSum[2 * BLOCK_SIZE];

    const unsigned int t = threadIdx.x;
    const unsigned int start = 2 * blockIdx.x * blockDim.x;
    const unsigned int n = (len > 0) ? (unsigned int)len : 0u;

    // Global indices of the two elements this thread loads.
    const unsigned int first = start + t;
    const unsigned int second = first + blockDim.x;

    // Bounds checks must use the GLOBAL index: comparing only the block-local
    // index against len lets every block after the first read past the end of
    // the input. Out-of-range slots are padded with the additive identity.
    partialSum[t] = (first < n) ? input[first] : 0.0f;
    partialSum[blockDim.x + t] = (second < n) ? input[second] : 0.0f;

    // Tree reduction. The barrier sits at the top of each iteration so the
    // loads above (and the previous step's writes) are visible before any
    // thread reads a slot written by another thread.
    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
        __syncthreads();
        if (t < stride) {
            partialSum[t] += partialSum[t + stride];
        }
    }

    // Thread 0 holds the block total after the final step.
    if (t == 0) {
        output[blockIdx.x] = partialSum[0];
    }
}

/*
 * Allocates a host array of `size` floats and fills it with pseudo-random
 * values taken from {0.00, 0.01, ..., 0.99} (rand() % 100, divided by 100).
 *
 * Parameters:
 *   vec_h - out-parameter; receives the pointer returned by malloc
 *   size  - number of elements to allocate and fill
 *
 * On a negative size or an allocation failure the function prints a message
 * and terminates the process with EXIT_FAILURE, instead of going on to write
 * through a NULL pointer.
 */
void initVector(float **vec_h, int size) {
    if (size < 0) {
        // A negative count would wrap to an enormous byte count in malloc.
        printf("Unable to allocate host");
        exit(EXIT_FAILURE);
    }

    const size_t count = (size_t)size;
    *vec_h = (float*)malloc(count * sizeof(float));

    // malloc(0) is allowed to return NULL, so only a non-empty request that
    // comes back NULL is treated as a failure.
    if (*vec_h == NULL && count > 0) {
        printf("Unable to allocate host");
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < count; i++) {
        (*vec_h)[i] = (float)((rand() % 100) / 100.00);
    }
}

/*
 * Compares a GPU-computed sum against a CPU reference sum of `input`.
 *
 * Parameters:
 *   input        - host array holding the original data
 *   num_elements - number of elements in `input`
 *   result       - the sum to be checked
 *
 * Prints "TEST PASSED" when the relative error is within 2e-5. Otherwise it
 * prints "TEST FAILED" and terminates the process with a non-zero status so
 * that scripts can detect the failure.
 */
void verify(float* input, unsigned num_elements, float result) {
    const double relativeTolerance = 2e-5;

    // Accumulate the reference in double: a sequential float sum over ~1e6
    // elements picks up rounding error comparable to the tolerance itself.
    double sum = 0.0;
    for (unsigned i = 0; i < num_elements; ++i) {
        sum += input[i];
    }

    // When the reference is exactly zero a relative error is undefined, so
    // fall back to the absolute error instead of dividing by zero.
    const double error = sum - (double)result;
    const double relativeError = (sum != 0.0) ? (error / sum) : error;

    if (relativeError > relativeTolerance || relativeError < -relativeTolerance) {
        printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result);
        exit(EXIT_FAILURE);
    }
    printf("TEST PASSED, cpu = %0.3f, gpu = %0.3f ", sum, result);
}

/* Aborts with a diagnostic if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver: builds a random input vector, reduces it on the GPU to one partial
 * sum per block, adds the partial sums on the host, and checks the total
 * against a CPU reference.
 *
 * Usage: program [numInputElements]   (defaults to 1000000)
 */
int main(int argc, char ** argv) {
    unsigned i;
    float *hostInput;
    float *hostOutput;
    float *deviceInput;
    float *deviceOutput;
    unsigned numInputElements;
    unsigned numOutputElements;
    dim3 dimGrid, dimBlock;

    if (argc == 1) {
        numInputElements = 1000000;
    } else if (argc == 2) {
        // Reject zero/negative/non-numeric counts: they would produce an
        // empty grid and an invalid launch configuration.
        int requested = atoi(argv[1]);
        if (requested <= 0) {
            fprintf(stderr, "Number of input elements must be a positive integer\n");
            return EXIT_FAILURE;
        }
        numInputElements = (unsigned)requested;
    } else {
        // Previously this path left numInputElements uninitialized.
        fprintf(stderr, "Usage: %s [numInputElements]\n", argv[0]);
        return EXIT_FAILURE;
    }

    initVector(&hostInput, numInputElements);

    // One output element per block; each block consumes 2 * BLOCK_SIZE inputs.
    numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    }

    hostOutput = (float*)malloc(numOutputElements * sizeof(float));
    if (hostOutput == NULL) {
        fprintf(stderr, "Unable to allocate host output\n");
        free(hostInput);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&deviceInput, numInputElements * sizeof(float)),
              "cudaMalloc(deviceInput)");
    checkCuda(cudaMalloc((void**)&deviceOutput, numOutputElements * sizeof(float)),
              "cudaMalloc(deviceOutput)");
    checkCuda(cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(float),
                         cudaMemcpyHostToDevice),
              "copy input to device");

    dimBlock.x = BLOCK_SIZE;
    dimBlock.y = dimBlock.z = 1;
    dimGrid.x = numOutputElements;
    dimGrid.y = dimGrid.z = 1;

    reduction<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    checkCuda(cudaGetLastError(), "reduction kernel launch");
    checkCuda(cudaDeviceSynchronize(), "reduction kernel execution");

    checkCuda(cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "copy output to host");

    // Fold the per-block partial sums into hostOutput[0] on the host,
    // printing the running total after each addition.
    int count = 0;
    for (i = 1; i < numOutputElements; i++) {
        hostOutput[0] += hostOutput[i];
        count++;
        printf("%.3f ", hostOutput[0]);
    }

    // Report the number of host-side additions actually performed
    // (the previous "count - 1" printed -1 for a single-block run).
    printf("Sums accumulated on host: %i ", count);

    verify(hostInput, numInputElements, hostOutput[0]);

    checkCuda(cudaFree(deviceInput), "cudaFree(deviceInput)");
    checkCuda(cudaFree(deviceOutput), "cudaFree(deviceOutput)");
    free(hostInput);
    free(hostOutput);

    return 0;
}

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer

Step: 1 Unlock blur-text-image

Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock

Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!

Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided) kernel.cu #define BLOCK_SIZE 512 #define SIMPLE __global__ void...

Assignment 4 Statement: Write a program in a file named Scan4.cpp that reads the inventory for a hardware store from a sequential access file named scan.txt. Each line of the input file contains a...

3. Suppose that we modify the Partition algorithm in QuickSort in such a way that on alternating levels of the recursion tree, Partition either chooses the best possible pivot or the worst possible...

MATLAB function help In this exercise you will write a function that reduces an m x n matrix A to the reduced echelon form by implementing the Row Reduction Algorithm in the way it will be outlined...

Please answer Q5, and please let me know if you need more information. Thanks. Q1.10) Min Max recursive algorithm 1) [10] Write a recursive algorithm in pseudocode, Min Max, for finding both the minimum...

Consider the straightforward (nonpolynomial-time) reduction in the proof of Theorem 34.9. Describe a circuit of size n that, when converted to a formula by this method, yields a formula whose size is...

Maxsub algorithm in the textbook is to find the subarray whose sum is the maximum. Similarly, you can modify it to Minsub algorithm to find the subarray whose sum is the minimum. By modify the...

In class, we introduced the RecursiveHuffman algorithm that given any alphabet A returns an optimal tree T (corresponding to an optimal pre-fix free code E). The same optimal tree T can also be...

1. (60) [Counting the number of shortest paths] (Based on Textbook Exercise 10 in Chapter 3.) Consider an undirected graph G=(V,E). Design an algorithm that computes the number of shortest paths from...

1. The Perceptron algorithm a. Separable case: (1) Use Python to generate a 2D (x ER2) linear separable data set with 50 positive instances and 50 negative instances and create a scatter plot to...

Perform a radix sort on the following sequence of values by using an iterative bucket sort that operates by using the characters in the Strings as the keys. You can assume that only the characters in...

Answer the following questions and explain briefly your answers using what you have learned in this class. (a) Is it possible to have the growth rate of real GDP higher than that of nominal GDP? If...

Insertion Sort Practice You are using the Insertion Sort algorithm on the following array: Draw the array after the first pass: Draw the array after the second pass: Draw the array after the third...

QUESTION ONE (20 marks) Mahogany Air is a major local carrier in Zambia, and has made a strategy of cutting fares drastically on certain routes with large effects on air traffic in those markets. For...

Question What are the requirements for a safe harbor 401(k) plan?

Question Can S corporation ownership of employer stock be used as part of a plan to convert nondeductible benefits for corporate shareholders into deductible items?

Question What is a hardship for purposes of a Section 401(k) plan withdrawal?