Question: Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided) kernel.cu #define BLOCK_SIZE 512

Optimize the reduction algorithm in kernel.cu so that the max value of the data is obtained. (other related files are provided)

kernel.cu

#define BLOCK_SIZE 512 #define SIMPLE

__global__ void reduction(float *out, float *in, unsigned size) { /******************************************************************** Load segment of input vector into shared memory Traverse reduction tree Write computed sum to output vector at correct index ********************************************************************/

// INSERT KERNEL CODE HERE

#ifdef SIMPLE __shared__ float in_s[2 * BLOCK_SIZE]; int idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;

in_s[threadIdx.x] = ((idx < size) ? in[idx] : 0.0f); in_s[threadIdx.x + BLOCK_SIZE] = ((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);

for (int stride = 1; stride < BLOCK_SIZE << 1; stride <<= 1) { __syncthreads(); if (threadIdx.x % stride == 0) in_s[2 * threadIdx.x] += in_s[2 * threadIdx.x + stride]; }

#else __shared__ float in_s[BLOCK_SIZE]; int idx = 2 * blockIdx.x * blockDim.x + threadIdx.x;

in_s[threadIdx.x] = ((idx < size) ? in[idx] : 0.0f) + ((idx + BLOCK_SIZE < size) ? in[idx + BLOCK_SIZE] : 0.0f);

for (int stride = BLOCK_SIZE >> 1; stride > 0; stride >>= 1) { __syncthreads(); if (threadIdx.x < stride) in_s[threadIdx.x] += in_s[threadIdx.x + stride]; } #endif

if (threadIdx.x == 0) out[blockIdx.x] = in_s[0]; }

main.cu

#include

#include "support.h" #include "kernel.cu"

int main(int argc, char* argv[]) { Timer timer;

// Initialize host variables ----------------------------------------------

printf(" Setting up the problem..."); fflush(stdout); startTime(&timer);

float *in_h, *out_h; float *in_d, *out_d; unsigned in_elements, out_elements; cudaError_t cuda_ret; dim3 dim_grid, dim_block; int i;

// Allocate and initialize host memory if (argc == 1) { in_elements = 1000000; } else if (argc == 2) { in_elements = atoi(argv[1]); } else { printf(" Invalid input parameters!" " Usage: ./reduction # Input of size 1,000,000 is used" " Usage: ./reduction # Input of size m is used" " "); exit(0); } initVector(&in_h, in_elements);

out_elements = in_elements / (BLOCK_SIZE << 1); if (in_elements % (BLOCK_SIZE << 1)) out_elements++;

out_h = (float*)malloc(out_elements * sizeof(float)); if (out_h == NULL) FATAL("Unable to allocate host");

stopTime(&timer); printf("%f s ", elapsedTime(timer)); printf(" Input size = %u ", in_elements);

// Allocate device variables ----------------------------------------------

printf("Allocating device variables..."); fflush(stdout); startTime(&timer);

cuda_ret = cudaMalloc((void**)&in_d, in_elements * sizeof(float)); if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory"); cuda_ret = cudaMalloc((void**)&out_d, out_elements * sizeof(float)); if (cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");

cudaDeviceSynchronize(); stopTime(&timer); printf("%f s ", elapsedTime(timer));

// Copy host variables to device ------------------------------------------

printf("Copying data from host to device..."); fflush(stdout); startTime(&timer);

cuda_ret = cudaMemcpy(in_d, in_h, in_elements * sizeof(float), cudaMemcpyHostToDevice); if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");

cuda_ret = cudaMemset(out_d, 0, out_elements * sizeof(float)); if (cuda_ret != cudaSuccess) FATAL("Unable to set device memory");

cudaDeviceSynchronize(); stopTime(&timer); printf("%f s ", elapsedTime(timer));

// Launch kernel ---------------------------------------------------------- printf("Launching kernel..."); fflush(stdout); startTime(&timer);

dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1; dim_grid.x = out_elements; dim_grid.y = dim_grid.z = 1; reduction << > >(out_d, in_d, in_elements); cuda_ret = cudaDeviceSynchronize(); if (cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");

stopTime(&timer); printf("%f s ", elapsedTime(timer));

// Copy device variables from host ----------------------------------------

printf("Copying data from device to host..."); fflush(stdout); startTime(&timer);

cuda_ret = cudaMemcpy(out_h, out_d, out_elements * sizeof(float), cudaMemcpyDeviceToHost); if (cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");

cudaDeviceSynchronize(); stopTime(&timer); printf("%f s ", elapsedTime(timer));

// Verify correctness -----------------------------------------------------

printf("Verifying results..."); fflush(stdout);

/* Accumulate partial sums on host */ for (i = 1; i out_h[0] += out_h[i]; }

/* Verify the result */ verify(in_h, in_elements, out_h[0]);

// Free memory ------------------------------------------------------------

cudaFree(in_d); cudaFree(out_d); free(in_h); free(out_h);

return 0; }

support.cu

#include #include

#include "support.h"

void initVector(float **vec_h, unsigned size) { *vec_h = (float*)malloc(size * sizeof(float));

if (*vec_h == NULL) { FATAL("Unable to allocate host"); }

for (unsigned int i = 0; i < size; i++) { (*vec_h)[i] = (rand() % 100) / 100.00; }

}

void verify(float* input, unsigned num_elements, float result) {

const float relativeTolerance = 2e-5;

float sum = 0.0f; for (int i = 0; i < num_elements; ++i) { sum += input[i]; } float relativeError = (sum - result) / sum; if (relativeError > relativeTolerance || relativeError < -relativeTolerance) { printf("TEST FAILED, cpu = %0.3f, gpu = %0.3f ", sum, result); exit(0); } printf("TEST PASSED ");

}

void startTime(Timer* timer) { gettimeofday(&(timer->startTime), NULL); }

void stopTime(Timer* timer) { gettimeofday(&(timer->endTime), NULL); }

float elapsedTime(Timer timer) { return ((float)((timer.endTime.tv_sec - timer.startTime.tv_sec) \ + (timer.endTime.tv_usec - timer.startTime.tv_usec) / 1.0e6)); }

support.h

#ifndef __FILEH__ #define __FILEH__

#include

typedef struct { struct timeval startTime; struct timeval endTime; } Timer;

#ifdef __cplusplus extern "C" { #endif void initVector(float **vec_h, unsigned size); void verify(float* input, unsigned num_elements, float result); void startTime(Timer* timer); void stopTime(Timer* timer); float elapsedTime(Timer timer); #ifdef __cplusplus } #endif

#define FATAL(msg, ...) \ do {\ fprintf(stderr, "[%s:%d] "msg" ", __FILE__, __LINE__, ##__VA_ARGS__);\ exit(-1);\ } while(0)

#if __BYTE_ORDER != __LITTLE_ENDIAN # error "File I/O is not implemented for this system: wrong endianness." #endif

#endif

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!