Question: Task 1: Generalization Create a file called new_sse.c based on unroll_sse.c . The new file, new_sse.c can do the following: Accept two command line arguments.

Task 1: Generalization

Create a file called new_sse.c based on unroll_sse.c. The new file, new_sse.c can do the following:

Accept two command line arguments.

The first argument is a value between 10 and 10000000. It provides the size of the arrays a and b. This means that a and b are now dynamically allocated.

The second argument can either be 0 or 1. If it is 0, the content of a and b will be printed to screen. If it is 1, there will be no output to the screen.

Task 2: Timing

Create a file called timed_sse.c based on new_sse.c. Regardless of the second argument, this file should first print out on a single line the number of seconds it takes to finish the two for loops used for the calculation.

Task 3: Improvement

The departmental server, molly, supports AVX2, which provides 256-bit registers. An example of this is shown in the file intrinsic_sum.c. Create a file called timed_avx2.c from timed_sse.c that switches to using __m256 and the corresponding intrinsics instead of __m128.

To compile and test the AVX2 intrinsics, the -mavx2 flag needs to be added to the gcc command.

unroll_sse.c:

#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>
/*
 * Doubles every element of a into b and prints both arrays.
 *
 * Elements are processed four at a time with 128-bit SSE operations; the
 * elements left over when n is not a multiple of 4 are handled by a plain
 * scalar loop.
 */
int main(void) {
    int i = 0;
    float a[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    float b[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    int n = 10;
    int unroll = (n / 4) * 4; /* largest multiple of 4 that is <= n */

    /* The multiplier is the same for every iteration, so build it once. */
    const __m128 two_v = _mm_set1_ps(2);

    /* Vector part: unaligned loads/stores because a and b are ordinary
     * stack arrays with no alignment guarantee beyond that of float. */
    for (i = 0; i < unroll; i += 4) {
        __m128 ai_v = _mm_loadu_ps(&a[i]);
        __m128 ai2_v = _mm_mul_ps(ai_v, two_v);
        _mm_storeu_ps(&b[i], ai2_v);
    }

    /* Scalar tail: i continues from where the vector loop stopped. */
    for (; i < n; i++) {
        b[i] = a[i] * 2;
    }

    for (i = 0; i < n; i++) {
        printf("%2.0f\t", a[i]);
    }
    printf("\n");
    for (i = 0; i < n; i++) {
        printf("%2.0f\t", b[i]);
    }
    printf("\n");
    return 0;
}

intrinsic_sum.c :

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <immintrin.h>
/*
 * Returns the sum of the first len ints of array, using 256-bit AVX2
 * integer additions for the bulk of the work.
 *
 * Two independent vector accumulators each consume 8 ints per iteration
 * (16 ints per loop pass). Whatever does not fill a complete group of 16
 * is added by the scalar loop at the end, so any len, including 0 and
 * values below 16, is handled without reading past the array.
 *
 * The target attribute lets this one function be compiled with AVX2
 * enabled even when the file is built without -mavx2; the CPU must still
 * support AVX2 at run time.
 */
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2")))
#endif
int intrinsic_sum(const int* array, unsigned int len) {
    unsigned int i = 0;
    int s = 0;
    __m256i sm = _mm256_setzero_si256();
    __m256i sm2 = _mm256_setzero_si256();

    /* i never exceeds len, so len - i cannot wrap; this form also avoids
     * the overflow that i + 16 could hit for len close to UINT_MAX. */
    for (; len - i >= 16; i += 16) {
        const __m256i x = _mm256_loadu_si256((const __m256i *)(array + i));
        sm = _mm256_add_epi32(sm, x);
        const __m256i y = _mm256_loadu_si256((const __m256i *)(array + i + 8));
        sm2 = _mm256_add_epi32(sm2, y);
    }

    sm = _mm256_add_epi32(sm, sm2);
    /* hadd works within each 128-bit half: after two rounds, element 0
     * holds the total of the low half and element 4 that of the high half. */
    sm = _mm256_hadd_epi32(sm, sm);
    sm = _mm256_hadd_epi32(sm, sm);
    s += _mm256_extract_epi32(sm, 0);
    s += _mm256_extract_epi32(sm, 4);

    /* Scalar tail for the final len % 16 elements. */
    for (; i < len; ++i) {
        s += array[i];
    }
    return s;
}
/*
 * Scalar reference implementation: adds up the first len ints of array
 * one element at a time and returns the total. A len of zero or less
 * yields 0.
 */
int normal_sum(int* array, int len) {
    int total = 0;
    const int *cursor = array;

    /* Count down the number of elements still to be added. */
    for (int left = len; left > 0; left--) {
        total += *cursor;
        cursor++;
    }
    return total;
}
/*
 * Times intrinsic_sum against normal_sum on an array of N ones.
 *
 * Usage: <program> N
 *   N  number of array elements; must be a positive decimal integer that
 *      fits in an int.
 *
 * Prints each sum together with the processor time (from clock()) spent
 * computing it. Returns 0 on success and 1 on bad arguments or allocation
 * failure.
 */
int main(int argc, char* argv[]) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s N\n", argc > 0 ? argv[0] : "intrinsic_sum");
        return 1;
    }

    char *end = NULL;
    const long requested = strtol(argv[1], &end, 10);
    /* UINT_MAX / 2 equals INT_MAX; spelled this way to avoid <limits.h>. */
    const long int_max = (long)((unsigned int)-1 / 2);
    if (end == argv[1] || *end != '\0' || requested <= 0 || requested > int_max) {
        fprintf(stderr, "N must be a positive integer, got \"%s\"\n", argv[1]);
        return 1;
    }
    int N = (int)requested;

    int sum1 = 0, sum2 = 0;
    clock_t start, end_time;
    int *nums = malloc(sizeof *nums * (size_t)N);
    if (nums == NULL) {
        fprintf(stderr, "Could not allocate %d ints\n", N);
        return 1;
    }
    for (int i = 0; i < N; i++) {
        nums[i] = 1;
    }

    start = clock();
    sum1 = intrinsic_sum((const int*) nums, (unsigned int)N);
    end_time = clock();
    printf("Sum 1 is %d and takes %f seconds\n", sum1,
           ((double)(end_time - start)) / CLOCKS_PER_SEC);

    start = clock();
    sum2 = normal_sum(nums, N);
    end_time = clock();
    /* Report sum2 here so a mismatch between the two versions is visible. */
    printf("Sum 2 is %d and takes %f seconds\n", sum2,
           ((double)(end_time - start)) / CLOCKS_PER_SEC);

    free(nums);
    return 0;
}

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!