Question: Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided) kernel.cu #define BLOCK_SIZE 512

Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided)

kernel.cu

// Threads per block. Every block consumes 2 * BLOCK_SIZE consecutive input
// elements, so the kernel must be launched with blockDim.x == BLOCK_SIZE.
#define BLOCK_SIZE 512

// Define SIMPLE to build the interleaved-addressing reduction tree; leave it
// undefined to build the variant that adds the two halves while loading and
// then reduces with a halving stride.
#define SIMPLE

/*
 * Block-wise sum reduction.
 *
 * Each block loads a segment of up to 2 * BLOCK_SIZE elements of `in` into
 * shared memory (positions at or beyond `size` contribute 0.0f), traverses a
 * reduction tree, and writes the segment's sum to out[blockIdx.x].
 *
 * Launch layout: 1-D grid, 1-D block with blockDim.x == BLOCK_SIZE; the grid
 * needs one block per 2 * BLOCK_SIZE input elements (rounded up).
 * Shared memory: static, 2 * BLOCK_SIZE floats (SIMPLE) or BLOCK_SIZE floats.
 *
 * out  - one partial sum per block
 * in   - input vector of `size` floats
 * size - number of valid elements in `in`
 */
__global__ void reduction(float *out, float *in, unsigned size)
{
    // Index of the first of the two input elements owned by this thread.
    // Kept unsigned so the comparisons against `size` never mix signedness.
    const unsigned tid = threadIdx.x;
    const unsigned idx = 2u * blockIdx.x * BLOCK_SIZE + tid;

#ifdef SIMPLE
    __shared__ float in_s[2 * BLOCK_SIZE];

    // Load both elements; out-of-range positions are padded with the
    // additive identity so they do not affect the sum.
    in_s[tid] = (idx < size) ? in[idx] : 0.0f;
    in_s[tid + BLOCK_SIZE] = (idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f;

    for (unsigned stride = 1; stride < 2 * BLOCK_SIZE; stride <<= 1) {
        // Barrier sits outside the conditional so every thread reaches it.
        __syncthreads();
        // stride is a power of two, so the mask is equivalent to
        // (tid % stride == 0) without the integer modulo.
        if ((tid & (stride - 1)) == 0)
            in_s[2 * tid] += in_s[2 * tid + stride];
    }
#else
    __shared__ float in_s[BLOCK_SIZE];

    // Perform the first level of the tree while loading from global memory.
    in_s[tid] = ((idx < size) ? in[idx] : 0.0f)
              + ((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);

    for (unsigned stride = BLOCK_SIZE >> 1; stride > 0; stride >>= 1) {
        __syncthreads();
        if (tid < stride)
            in_s[tid] += in_s[tid + stride];
    }
#endif

    // Thread 0 performed the last addition into in_s[0], so it can publish
    // the block's result without a further barrier.
    if (tid == 0)
        out[blockIdx.x] = in_s[0];
}

main.cu

#include <stdio.h>
#include <stdlib.h>

#include "support.h"
#include "kernel.cu"

/*
 * Host driver for the block-wise sum reduction.
 *
 * Usage: ./reduction        - reduce 1,000,000 elements
 *        ./reduction <m>    - reduce m elements (m must be a positive integer)
 *
 * Builds a host vector with initVector, copies it to the device, launches
 * `reduction` with one block per 2 * BLOCK_SIZE input elements, copies the
 * per-block partial sums back, adds them up on the host and hands the total
 * to verify. Each phase is timed with the Timer helpers and reported on stdout.
 */
int main(int argc, char* argv[])
{
    Timer timer;

    // Initialize host variables ----------------------------------------------

    printf("\nSetting up the problem..."); fflush(stdout);
    startTime(&timer);

    float *in_h, *out_h;
    float *in_d, *out_d;
    unsigned in_elements, out_elements;
    cudaError_t cuda_ret;
    dim3 dim_grid, dim_block;
    unsigned i;

    // Allocate and initialize host memory
    if (argc == 1) {
        in_elements = 1000000;
    } else if (argc == 2 && atoi(argv[1]) > 0) {
        // A zero, negative or non-numeric size would lead to a launch with
        // zero blocks, so such arguments fall through to the usage message.
        in_elements = atoi(argv[1]);
    } else {
        printf("\n    Invalid input parameters!"
               "\n    Usage: ./reduction          # Input of size 1,000,000 is used"
               "\n    Usage: ./reduction <m>      # Input of size m is used"
               "\n");
        exit(0);
    }
    initVector(&in_h, in_elements);

    // One output element (partial sum) per block; each block covers
    // 2 * BLOCK_SIZE inputs, rounded up for a partial last segment.
    out_elements = in_elements / (BLOCK_SIZE << 1);
    if (in_elements % (BLOCK_SIZE << 1)) out_elements++;

    out_h = (float*)malloc(out_elements * sizeof(float));
    if (out_h == NULL) FATAL("Unable to allocate host");

    stopTime(&timer); printf("%f s\n", elapsedTime(timer));
    printf("    Input size = %u\n", in_elements);

    // Allocate device variables ----------------------------------------------

    printf("Allocating device variables..."); fflush(stdout);
    startTime(&timer);

    cuda_ret = cudaMalloc((void**)&in_d, in_elements * sizeof(float));
    if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
    cuda_ret = cudaMalloc((void**)&out_d, out_elements * sizeof(float));
    if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy host variables to device ------------------------------------------

    printf("Copying data from host to device..."); fflush(stdout);
    startTime(&timer);

    cuda_ret = cudaMemcpy(in_d, in_h, in_elements * sizeof(float),
                          cudaMemcpyHostToDevice);
    if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");

    cuda_ret = cudaMemset(out_d, 0, out_elements * sizeof(float));
    if (cuda_ret != cudaSuccess) FATAL("Unable to set device memory");

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Launch kernel ----------------------------------------------------------

    printf("Launching kernel..."); fflush(stdout);
    startTime(&timer);

    dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
    dim_grid.x = out_elements; dim_grid.y = dim_grid.z = 1;
    reduction<<<dim_grid, dim_block>>>(out_d, in_d, in_elements);

    // A bad launch configuration is only reported through cudaGetLastError;
    // faults during execution surface at the synchronizing call below.
    cuda_ret = cudaGetLastError();
    if (cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
    cuda_ret = cudaDeviceSynchronize();
    if (cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");

    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy device variables from host ----------------------------------------

    printf("Copying data from device to host..."); fflush(stdout);
    startTime(&timer);

    cuda_ret = cudaMemcpy(out_h, out_d, out_elements * sizeof(float),
                          cudaMemcpyDeviceToHost);
    if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Verify correctness -----------------------------------------------------

    printf("Verifying results..."); fflush(stdout);

    /* Accumulate partial sums on host */
    for (i = 1; i < out_elements; i++) {
        out_h[0] += out_h[i];
    }

    /* Verify the result */
    verify(in_h, in_elements, out_h[0]);

    // Free memory ------------------------------------------------------------

    cudaFree(in_d); cudaFree(out_d);
    free(in_h); free(out_h);

    return 0;
}

support.cu

#include #include

#include "support.h"

/*
 * Allocate a host vector of `size` floats and fill it with pseudo-random
 * values.
 *
 * vec_h - out-parameter; receives the pointer returned by malloc
 * size  - number of elements to allocate and initialize
 *
 * Every element is set to (rand() % 100) / 100.00, i.e. a multiple of 0.01
 * between 0.00 and 0.99. Calls FATAL if the allocation returns NULL.
 */
void initVector(float **vec_h, unsigned size)
{
    float *values = (float*)malloc(size * sizeof(float));
    *vec_h = values;

    if (values == NULL) {
        FATAL("Unable to allocate host");
    }

    for (unsigned int pos = 0; pos != size; ++pos) {
        values[pos] = (rand() % 100) / 100.00;
    }
}

/*
 * Check a reduction result against a sum computed sequentially on the host.
 *
 * input        - the num_elements values that were reduced
 * num_elements - number of entries in `input`
 * result       - the sum to be checked
 *
 * Prints "TEST PASSED" when the relative error (sum - result) / sum lies
 * within +/- 2e-5. Otherwise prints "TEST FAILED" together with both values
 * and terminates the process via exit(0).
 */
void verify(float* input, unsigned num_elements, float result)
{
    const float relativeTolerance = 2e-5f;

    // The index is unsigned to match num_elements, so the loop condition
    // never compares signed against unsigned.
    float sum = 0.0f;
    for (unsigned i = 0; i < num_elements; ++i) {
        sum += input[i];
    }

    // When the reference sum is exactly zero a relative error is undefined;
    // compare the absolute difference instead of dividing by zero.
    const float difference = sum - result;
    const float relativeError = (sum != 0.0f) ? difference / sum : difference;

    if (relativeError > relativeTolerance || relativeError < -relativeTolerance) {
        printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f\n", sum, result);
        exit(0);
    }
    printf("TEST PASSED\n");
}

/*
 * Store the current time of day, as reported by gettimeofday, in
 * timer->startTime.
 */
void startTime(Timer* timer)
{
    struct timeval *begin = &timer->startTime;
    gettimeofday(begin, NULL);
}

/*
 * Store the current time of day, as reported by gettimeofday, in
 * timer->endTime.
 */
void stopTime(Timer* timer)
{
    struct timeval *finish = &timer->endTime;
    gettimeofday(finish, NULL);
}

/*
 * Return the interval between timer.startTime and timer.endTime in seconds.
 *
 * The whole-second and microsecond differences are combined in double
 * precision and the total is narrowed to float on return.
 */
float elapsedTime(Timer timer)
{
    const double wholeSeconds = timer.endTime.tv_sec - timer.startTime.tv_sec;
    const double fraction = (timer.endTime.tv_usec - timer.startTime.tv_usec) / 1.0e6;
    return (float)(wholeSeconds + fraction);
}

support.h

#ifndef __FILEH__
#define __FILEH__

#include <stdio.h>     /* fprintf, used by FATAL */
#include <stdlib.h>    /* exit, used by FATAL */
#include <sys/time.h>  /* struct timeval */

/* Pair of timestamps delimiting a timed interval (see startTime/stopTime). */
typedef struct {
    struct timeval startTime;
    struct timeval endTime;
} Timer;

#ifdef __cplusplus
extern "C" {
#endif
/* Allocate *vec_h with `size` floats and fill it with pseudo-random values. */
void initVector(float **vec_h, unsigned size);
/* Compare `result` with the host-side sum of `input`; report pass or fail. */
void verify(float* input, unsigned num_elements, float result);
/* Record the start of a timed interval in `timer`. */
void startTime(Timer* timer);
/* Record the end of a timed interval in `timer`. */
void stopTime(Timer* timer);
/* Seconds between the recorded start and end of `timer`. */
float elapsedTime(Timer timer);
#ifdef __cplusplus
}
#endif

/*
 * Print "[file:line] <formatted message>" to stderr and exit with status -1.
 * `msg` must be a string literal because it is concatenated with the prefix.
 * The spaces around `msg` are required: in C++11 and later, a string literal
 * immediately followed by an identifier is read as a user-defined literal.
 */
#define FATAL(msg, ...) \
    do {\
        fprintf(stderr, "[%s:%d] " msg "\n", __FILE__, __LINE__, ##__VA_ARGS__);\
        exit(-1);\
    } while(0)

#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif

#endif

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer

Step: 1 Unlock blur-text-image

Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock

Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!

I want you to summarize these 7 items. Please be different from other answers! Please be a little quick! Thanks. You should summarize the 7 items in the photo. Max 125 words! reading 1....

MATLAB function help In this exercise you will write a function that reduces an m x n matrix A to the reduced echelon form by implementing the Row Reduction Algorithm in the way it will be outlined...

Consider the straightforward (nonpolynomial-time) reduction in the proof of Theorem 34.9. Describe a circuit of size n that, when converted to a formula by this method, yields a formula whose size is...

math 79 linear algebra Question 8, 1.2.43 HW Score: 0%, 0 of 15 points Save O Points: 0 of 1 A system of linear equations with more equations than unknowns is sometimes called an overdetermined...

I have this assignment for linear algebra and I am not sure how to do it. I was able to get the first 2 parts of this assignment, but the last part I am not sure how to do. Here is the function that...

Different dimension reduction techniques can have quite different computational complexity. Beyond the algorithm itself there is also the question of how exactly it is implemented. These two factors...

Theory: In this exercise, you will analyze the solution set of a homogeneous linear system $Ax = 0$, where $A$ is an $m \times n$ matrix and $x \in \mathbb{R}^n$. If the system has only the trivial solution (there are no free...

8. Your friend's preschool-age daughter Madison has recently learned to spell some simple words. To help encourage this, her parents got her a colorful set of refrigerator magnets featuring the...

Accuracy in Matlab and Condition Number For most systems of linear equations, we can get an answer with enough accuracy fast enough in a number of different ways, and we don't even bother with trying...

I. Introduction An instance of the 2048-puzzle game is played on a 4×4 grid, with numbered tiles that slide in all four directions when a player moves them. Every turn, a new tile will randomly appear...

Given the lattice network shown in fig 11.58 determine what type of filter this network represents by determining the voltage transfer function. R1 R2

Why must the user be cautious in analyzing bank holding companies?

A firm's cost of _ _ _ _ - capital reflects both its cost of _ _ _ _ _ capital and its cost of capital.

QUESTION 3 ABC Variance Analysis (10 marks) Dough-eez Donuts makes donuts for food trucks in the area. The donuts are made in standard batches of 100 donuts. To bake a batch of donuts, the production...

=+employees encounter and design a crisis management program to deal with them.

=+3 Describe the kinds of health, safety, and security problems that international

=+4 Why is designing and implementing a global human resource information system so difficult?