Question: Task 1: Generalization. Create a file called new_sse.c based on unroll_sse.c. The new file, new_sse.c, can do the following: accept two command-line arguments.
Task 1: Generalization
Create a file called new_sse.c based on unroll_sse.c. The new file, new_sse.c, should do the following:
Accept two command line arguments.
The first argument is a value between 10 and 10000000. It provides the size of the arrays a and b. This means that a and b are now dynamically allocated.
The second argument can either be 0 or 1. If it is 0, the content of a and b will be printed to screen. If it is 1, there will be no output to the screen.
Task 2: Timing
Create a file called timed_sse.c based on new_sse.c. Regardless of the second argument, this file should first print out on a single line the number of seconds it takes to finish the two for loops used for the calculation.
Task 3: Improvement
The departmental server, molly, supports AVX2, which provides 256-bit registers. An example of this is shown in the file intrinsic_sum.c. Create a file called timed_avx2.c from timed_sse.c that switches to using __m256 and the corresponding intrinsics instead of __m128.
To compile and test the AVX2 intrinsics, the -mavx2 flag needs to be added to the gcc command.
unroll_sse.c:
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <xmmintrin.h> | |
| int main() { | |
| int i = 0; | |
| float a[10] ={1,2,3,4,5,6,7,8,9,10}; | |
| float b[10] ={0,0,0,0,0,0,0,0,0,0}; | |
| int n = 10; | |
| int unroll = (n/4)*4; | |
| for (i = 0; i < unroll; i+=4) { | |
| __m128 ai_v = _mm_loadu_ps(&a[i]); | |
| __m128 two_v = _mm_set1_ps(2); | |
| __m128 ai2_v = _mm_mul_ps(ai_v,two_v); | |
| _mm_storeu_ps(&b[i],ai2_v); | |
| } | |
| for (; i < n; i++) { | |
| b[i] = a[i]*2; | |
| } | |
| for (i = 0; i < 10; i++) | |
| printf("%2.0f\t", a[i]); | |
| printf("\n"); | |
| for (i = 0; i < 10; i++) | |
| printf("%2.0f\t", b[i]); | |
| printf("\n"); | |
| return 0; | |
| } |
intrinsic_sum.c:
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <time.h> | |
| #include <immintrin.h> | |
| int intrinsic_sum(const int* array, unsigned int len) { | |
| register int i = 0, s = 0; | |
| __m256i sm = _mm256_loadu_si256((void *)(array+i)); | |
| __m256i sm2 = _mm256_loadu_si256((void *)(array+i+8)); | |
| i += 16; | |
| for (; i+16 < len; i += 16) { | |
| const __m256i x = _mm256_loadu_si256((void *)(array+i)); | |
| sm = _mm256_add_epi32(sm, x); | |
| const __m256i y = _mm256_loadu_si256((void *)(array+i+8)); | |
| sm2 = _mm256_add_epi32(sm2, y); | |
| } | |
| sm = _mm256_add_epi32(sm, sm2); | |
| sm = _mm256_hadd_epi32(sm, sm); | |
| sm = _mm256_hadd_epi32(sm, sm); | |
| s += _mm256_extract_epi32(sm, 0); | |
| s += _mm256_extract_epi32(sm, 4); | |
| for(; i < len; ++i) s += array[i]; | |
| return s; | |
| } | |
| int normal_sum(int* array, int len) { | |
| int s = 0; | |
| for (int i = 0; i < len; i++) { | |
| s += array[i]; | |
| } | |
| return s; | |
| } | |
| int main(int argc, char* argv[]) { | |
| int N = atoi(argv[1]); | |
| int sum1 = 0, sum2 = 0; | |
| clock_t start, end; | |
| int* nums = malloc(sizeof(int) * N); | |
| for (int i = 0; i < N; i++) { | |
| nums[i] = 1; | |
| } | |
| start = clock(); | |
| sum1 = intrinsic_sum((const int*) nums, (unsigned int)N); | |
| end = clock(); | |
| printf("Sum 1 is %d and takes %f seconds\n", sum1,((double)(end - start))/ CLOCKS_PER_SEC); | |
| start = clock(); | |
| sum2 = normal_sum(nums, N); | |
| end = clock(); | |
| printf("Sum 2 is %d and takes %f seconds\n", sum2,((double)(end - start))/ CLOCKS_PER_SEC); | |
| return 0; | |
| } |
Step by Step Solution
There are 3 Steps involved in it
Get step-by-step solutions from verified subject matter experts
