From da10cd87ba8a5cabcceddd4cba52891ea4866416 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 25 Feb 2023 00:43:36 +0100 Subject: [PATCH] Use posix_memalign version of STREAM Taken from https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_5-10_posix_memalign.c. This version avoids statically allocated arrays by using heap allocations. This avoids limitations of ELF/PE. --- examples/stream/CMakeLists.txt | 2 +- examples/stream/README.md | 6 + examples/stream/stream.cpp | 609 ++++++++++++++++----------------- 3 files changed, 309 insertions(+), 308 deletions(-) create mode 100644 examples/stream/README.md diff --git a/examples/stream/CMakeLists.txt b/examples/stream/CMakeLists.txt index 4da3e8a6b4..730d3dd95c 100644 --- a/examples/stream/CMakeLists.txt +++ b/examples/stream/CMakeLists.txt @@ -4,7 +4,7 @@ cmake_minimum_required (VERSION 3.18.3) find_package(OpenMP REQUIRED) -set(STREAM_ARRAY_SIZE 80000000) # if we make this too large, we will get linker errors due to ELF/PE limits +set(STREAM_ARRAY_SIZE 400000000 CACHE STRING "STREAM benchmark array size") project(stream C) diff --git a/examples/stream/README.md b/examples/stream/README.md new file mode 100644 index 0000000000..30a52af770 --- /dev/null +++ b/examples/stream/README.md @@ -0,0 +1,6 @@ +This is a compliant version of the [STREAM benchmark](https://www.cs.virginia.edu/stream/). +There are a few official such versions, which you can download [here](https://www.cs.virginia.edu/stream/FTP/Code/). +The [original version](https://www.cs.virginia.edu/stream/FTP/Code/stream.c) places the input and ouput arrays into the static program segment. +This causes the linker to complain because it breaks limits of ELF/PE (Linux/Windows executable file format) for big array sizes. +We therefore took the [version using `posix_memalign`](https://www.cs.virginia.edu/stream/FTP/Code/Versions/stream_5-10_posix_memalign.c) for allocation. +Some fixes were applied to make it compile with MSVC. diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index fdb8905d44..f69d699f3c 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -1,6 +1,8 @@ +// NOLINTBEGIN +// clang-format off /*-----------------------------------------------------------------------*/ /* Program: STREAM */ -/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ +/* Revision: $Id: stream.c,v 5.10.1 2014/06/17 08:16:08 mccalpin Exp mccalpin $ */ /* Original code developed by John D. McCalpin */ /* Programmers: John D. McCalpin */ /* Joe R. Zagar */ @@ -40,13 +42,14 @@ /* program constitutes acceptance of these licensing restrictions. */ /* 5. Absolutely no warranty is expressed or implied. */ /*-----------------------------------------------------------------------*/ -// NOLINTBEGIN -#include -#include -#include -#include // !!! bgruber: added header -#include - +# include +# include +//# include // !!! bgruber: removed header +# include +# include +# include +//# include // !!! bgruber: removed header +# include // !!! bgruber: added header /*----------------------------------------------------------------------- * INSTRUCTIONS: @@ -63,13 +66,13 @@ * Example 1: One Xeon E3 with 8 MB L3 cache * STREAM_ARRAY_SIZE should be >= 4 million, giving * an array size of 30.5 MB and a total memory requirement - * of 91.5 MB. + * of 91.5 MB. * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) * STREAM_ARRAY_SIZE should be >= 20 million, giving * an array size of 153 MB and a total memory requirement - * of 458 MB. + * of 458 MB. * (b) The size should be large enough so that the 'timing calibration' - * output by the program is at least 20 clock-ticks. + * output by the program is at least 20 clock-ticks. * Example: most versions of Windows have a 10 millisecond timer * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. @@ -78,7 +81,7 @@ * Version 5.10 increases the default array size from 2 million * elements to 10 million elements in response to the increasing * size of L3 caches. The new default size is large enough for caches - * up to 20 MB. + * up to 20 MB. * Version 5.10 changes the loop index variables from "register int" * to "ssize_t", which allows array indices >2^32 (4 billion) * on properly configured 64-bit systems. Additional compiler options @@ -92,42 +95,45 @@ * per array. */ #ifndef STREAM_ARRAY_SIZE -# define STREAM_ARRAY_SIZE 10000000 +# define STREAM_ARRAY_SIZE 10000000 #endif /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result * for any iteration after the first, therefore the minimum value * for NTIMES is 2. * There are no rules on maximum allowable values for NTIMES, but - * values larger than the default are unlikely to noticeably + * when running with STREAM_TYPE=float, the results will overflow + * if NTIMES exceeds 32. Results will probably overflow at some + * point with STREAM_TYPE=double, but I have not checked the exact value. + * Values larger than the default are unlikely to noticeably * increase the reported performance. * NTIMES can also be set on the compile line without changing the source * code using, for example, "-DNTIMES=7". */ #ifdef NTIMES -# if NTIMES <= 1 -# define NTIMES 10 -# endif +#if NTIMES<=1 +# define NTIMES 10 +#endif #endif #ifndef NTIMES -# define NTIMES 10 +# define NTIMES 10 #endif /* Users are allowed to modify the "OFFSET" variable, which *may* change the - * relative alignment of the arrays (though compilers may change the - * effective offset by making the arrays non-contiguous on some systems). + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). * Use of non-zero values for OFFSET can be especially helpful if the * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. * OFFSET can also be set on the compile line without changing the source * code using, for example, "-DOFFSET=56". */ #ifndef OFFSET -# define OFFSET 0 +# define OFFSET 0 #endif /* * 3) Compile the code with optimization. Many compilers generate - * unreasonably bad code before the optimizer tightens things up. + * unreasonably bad code before the optimizer tightens things up. * If the results are unreasonably good, on the other hand, the * optimizer might be too smart for me! * @@ -138,7 +144,7 @@ * To use multiple cores, you need to tell the compiler to obey the OpenMP * directives in the code. This varies by compiler, but a common example is * gcc -O -fopenmp stream.c -o stream_omp - * The environment variable OMP_NUM_THREADS allows runtime control of the + * The environment variable OMP_NUM_THREADS allows runtime control of the * number of threads/cores used when the resulting "stream_omp" program * is executed. * @@ -147,9 +153,9 @@ * to the compile line. * Note that this changes the minimum array sizes required --- see (1) above. * - * The preprocessor directive "TUNED" does not do much -- it simply causes the + * The preprocessor directive "TUNED" does not do much -- it simply causes the * code to call separate functions to execute each kernel. Trivial versions - * of these functions are provided, but they are *not* tuned -- they just + * of these functions are provided, but they are *not* tuned -- they just * provide predefined interfaces to be replaced with tuned code. * * @@ -164,36 +170,45 @@ * *-----------------------------------------------------------------------*/ -#define HLINE "-------------------------------------------------------------\n" +# define HLINE "-------------------------------------------------------------\n" -#ifndef MIN -# define MIN(x, y) ((x) < (y) ? (x) : (y)) -#endif -#ifndef MAX -# define MAX(x, y) ((x) > (y) ? (x) : (y)) -#endif +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif #ifndef STREAM_TYPE -# define STREAM_TYPE double +#define STREAM_TYPE double #endif + // !!! bgruber: added definitions for ssize_t and posix_memalign on MSVC #ifdef _MSC_VER # include using ssize_t = std::make_signed_t; + +// from: https://stackoverflow.com/questions/33696092/whats-the-correct-replacement-for-posix-memalign-in-windows +#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) #endif -static STREAM_TYPE a[STREAM_ARRAY_SIZE + OFFSET], b[STREAM_ARRAY_SIZE + OFFSET], c[STREAM_ARRAY_SIZE + OFFSET]; +//static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], +// b[STREAM_ARRAY_SIZE+OFFSET], +// c[STREAM_ARRAY_SIZE+OFFSET]; +double *a,*b,*c; -static double avgtime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; -static const char* label[4] - = {"Copy: ", "Scale: ", "Add: ", "Triad: "}; // !!! bgruber: added `const` to fix warning +static const char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; // !!! bgruber: added `const` to fix warning/error -static double bytes[4] - = {2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE}; +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; extern double mysecond(); extern void checkSTREAMresults(); @@ -203,19 +218,20 @@ extern void tuned_STREAM_Scale(STREAM_TYPE scalar); extern void tuned_STREAM_Add(); extern void tuned_STREAM_Triad(STREAM_TYPE scalar); #endif -// !!! bgruber: commented out because this signature is differnt from the one in -// #ifdef _OPENMP -// extern int omp_get_num_threads(); -// #endif -int checktick(); // !!! bgruber: moved out of main to fix warning -int main() -{ - int quantum; - int BytesPerWord; - int k; - ssize_t j; - STREAM_TYPE scalar; - double t, times[4][NTIMES]; +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int checktick(); // !!! bgruber: moved function declaration out of main() +int +main() + { + int quantum; + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + size_t arraybytes,arrayalignment; /* --- SETUP --- determine precision and check timing --- */ @@ -223,85 +239,95 @@ int main() printf("STREAM version $Revision: 5.10 $\n"); printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); - printf("This system uses %d bytes per array element.\n", BytesPerWord); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + arraybytes = (STREAM_ARRAY_SIZE + OFFSET)*sizeof(STREAM_TYPE); + arrayalignment = 64; + k = posix_memalign((void **)&a, arrayalignment, arraybytes); + if (k != 0) { + printf("Allocation of array a failed, return code is %d\n",k); + exit(1); + } + k = posix_memalign((void **)&b, arrayalignment, arraybytes); + if (k != 0) { + printf("Allocation of array b failed, return code is %d\n",k); + exit(1); + } + k = posix_memalign((void **)&c, arrayalignment, arraybytes); + if (k != 0) { + printf("Allocation of array c failed, return code is %d\n",k); + exit(1); + } printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); - printf( - " This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); - printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n", (unsigned long long) STREAM_ARRAY_SIZE); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif - printf("Array size = %llu (elements), Offset = %d (elements)\n", (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); - printf( - "Memory per array = %.1f MiB (= %.1f GiB).\n", - BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0), - BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0 / 1024.0)); - printf( - "Total memory required = %.1f MiB (= %.1f GiB).\n", - (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.), - (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024. / 1024.)); + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); - printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); -# pragma omp parallel +#pragma omp parallel { -# pragma omp master - { - k = omp_get_num_threads(); - printf("Number of Threads requested = %i\n", k); +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); } } #endif #ifdef _OPENMP - k = 0; -# pragma omp parallel -# pragma omp atomic - k++; - printf("Number of Threads counted = %i\n", k); + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); #endif /* Get initial value for system clock. */ #pragma omp parallel for - for(j = 0; j < STREAM_ARRAY_SIZE; j++) - { - a[j] = 1.0; - b[j] = 2.0; - c[j] = 0.0; - } + for (j=0; j= 1) - printf( - "Your clock granularity/precision appears to be " - "%d microseconds.\n", - quantum); - else - { - printf("Your clock granularity appears to be " - "less than one microsecond.\n"); - quantum = 1; + if ( (quantum = checktick()) >= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; } t = mysecond(); #pragma omp parallel for - for(j = 0; j < STREAM_ARRAY_SIZE; j++) - a[j] = 2.0E0 * a[j]; + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); - printf( - "Each test below will take on the order" - " of %d microseconds.\n", - (int) t); - printf(" (= %d clock ticks)\n", (int) (t / quantum)); + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); @@ -311,77 +337,74 @@ int main() printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); - + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; - for(k = 0; k < NTIMES; k++) - { - times[0][k] = mysecond(); + for (k=0; k #include double mysecond() @@ -438,158 +462,129 @@ double mysecond() } #ifndef abs -# define abs(a) ((a) >= 0 ? (a) : -(a)) +#define abs(a) ((a) >= 0 ? (a) : -(a)) #endif -void checkSTREAMresults() +void checkSTREAMresults () { - STREAM_TYPE aj, bj, cj, scalar; - STREAM_TYPE aSumErr, bSumErr, cSumErr; - STREAM_TYPE aAvgErr, bAvgErr, cAvgErr; - double epsilon; - ssize_t j; - int k, ierr, err; + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; /* reproduce initialization */ - aj = 1.0; - bj = 2.0; - cj = 0.0; + aj = 1.0; + bj = 2.0; + cj = 0.0; /* a[] is modified during timing check */ - aj = 2.0E0 * aj; + aj = 2.0E0 * aj; /* now execute timing loop */ - scalar = 3.0; - for(k = 0; k < NTIMES; k++) - { - cj = aj; - bj = scalar * cj; - cj = aj + bj; - aj = bj + scalar * cj; - } + scalar = 3.0; + for (k=0; k epsilon) - { - err++; - printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", epsilon); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", aj, aAvgErr, abs(aAvgErr) / aj); - ierr = 0; - for(j = 0; j < STREAM_ARRAY_SIZE; j++) - { - if(abs(a[j] / aj - 1.0) > epsilon) - { - ierr++; + aSumErr = 0.0; + bSumErr = 0.0; + cSumErr = 0.0; + for (j=0; j epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; #ifdef VERBOSE - if(ierr < 10) - { - printf( - " array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j, - aj, - a[j], - abs((aj - a[j]) / aAvgErr)); - } + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } #endif - } - } - printf(" For array a[], %d errors were found.\n", ierr); - } - if(abs(bAvgErr / bj) > epsilon) - { - err++; - printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", epsilon); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", bj, bAvgErr, abs(bAvgErr) / bj); - printf(" AvgRelAbsErr > Epsilon (%e)\n", epsilon); - ierr = 0; - for(j = 0; j < STREAM_ARRAY_SIZE; j++) - { - if(abs(b[j] / bj - 1.0) > epsilon) - { - ierr++; + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; #ifdef VERBOSE - if(ierr < 10) - { - printf( - " array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j, - bj, - b[j], - abs((bj - b[j]) / bAvgErr)); - } + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } #endif - } - } - printf(" For array b[], %d errors were found.\n", ierr); - } - if(abs(cAvgErr / cj) > epsilon) - { - err++; - printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", epsilon); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", cj, cAvgErr, abs(cAvgErr) / cj); - printf(" AvgRelAbsErr > Epsilon (%e)\n", epsilon); - ierr = 0; - for(j = 0; j < STREAM_ARRAY_SIZE; j++) - { - if(abs(c[j] / cj - 1.0) > epsilon) - { - ierr++; + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; #ifdef VERBOSE - if(ierr < 10) - { - printf( - " array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j, - cj, - c[j], - abs((cj - c[j]) / cAvgErr)); - } + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } #endif - } - } - printf(" For array c[], %d errors were found.\n", ierr); - } - if(err == 0) - { - printf("Solution Validates: avg error less than %e on all three arrays\n", epsilon); - } + } + } + printf(" For array c[], %d errors were found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } #ifdef VERBOSE - printf("Results Validation Verbose Results: \n"); - printf(" Expected a(1), b(1), c(1): %f %f %f \n", aj, bj, cj); - printf(" Observed a(1), b(1), c(1): %f %f %f \n", a[1], b[1], c[1]); - printf(" Rel Errors on a, b, c: %e %e %e \n", abs(aAvgErr / aj), abs(bAvgErr / bj), abs(cAvgErr / cj)); + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); #endif } -// NOLINTEND - #ifdef TUNED +// NOLINTEND +// clang-format on // !!! bgruber: From here on forward is LLAMA code # undef abs