diff --git a/README.md b/README.md index 1fc4fed..37b31bf 100644 --- a/README.md +++ b/README.md @@ -116,18 +116,3 @@ SDaxpy: 46822.63 23411.32 0.0281 0.0273 0.0325 Solution Validates ``` -## Benchmarking skript - -A perl wrapper script (bench.pl) is also provided to scan ranges of thread counts and determine the absolute highest sustained main memory bandwidth. In order to use it `likwid-pin` has to be in your path. The script has three required and one optional command line arguments: -``` -$./bench.pl [] -``` -Example usage: -``` -$./bench.pl ./bwbench-GCC 2-8 6 -``` -The script will always use physical cores only, where two SMT threads is the default. For different SMT thread counts use the 4th command line argument. Example for a processor without SMT: -``` -$./bench.pl ./bwbench-GCC 14-24 10 1 -``` - diff --git a/util/README.md b/util/README.md new file mode 100644 index 0000000..acfe608 --- /dev/null +++ b/util/README.md @@ -0,0 +1,20 @@ +# Single file teaching version + +bwBench.c contains a single-file version of The Bandwidth Benchmark that is tailored for use in tutorials or courses. + +It should compile with any C99 compiler. + +# Benchmarking script + +Wrapper scripts in Perl (bench.pl) and Python (bench.py) are also provided to scan ranges of thread counts and determine the absolute highest sustained main memory bandwidth. In order to use them, `likwid-pin` has to be in your path. The script has three required and one optional command line arguments: +``` +$./bench.pl [] +``` +Example usage: +``` +$./bench.pl ./bwbench-GCC 2-8 6 +``` +The script will always use physical cores only; two SMT threads is the default. For different SMT thread counts, use the 4th command line argument. 
Example for a processor without SMT: +``` +$./bench.pl ./bwbench-GCC 14-24 10 1 +``` diff --git a/bench.pl b/util/bench.pl similarity index 100% rename from bench.pl rename to util/bench.pl diff --git a/bench.py b/util/bench.py similarity index 100% rename from bench.py rename to util/bench.py diff --git a/util/bwBench.c b/util/bwBench.c new file mode 100644 index 0000000..eb17d6d --- /dev/null +++ b/util/bwBench.c @@ -0,0 +1,426 @@ +/* + * ======================================================================================= + * + * Author: Jan Eitzinger (je), jan.eitzinger@fau.de + * Copyright (c) 2019 RRZE, University Erlangen-Nuremberg + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * ======================================================================================= + */ + +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#define SIZE 40000000ull /* elements per array; main() copies this into N */ +#define NTIMES 10 /* second dimension of times[][] in main() */ +#define ARRAY_ALIGNMENT 64 /* alignment argument passed to posix_memalign() */ +#define HLINE "----------------------------------------------------------------------------\n" + +#ifndef MIN +#define MIN(x,y) ((x)<(y)?(x):(y)) +#endif +#ifndef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) +#endif +#ifndef ABS +#define ABS(a) ((a) >= 0 ? (a) : -(a)) +#endif + +typedef enum benchmark { /* one index per kernel; order matches the benchmarks[] table in main() */ + INIT = 0, + SUM, + COPY, + UPDATE, + TRIAD, + DAXPY, + STRIAD, + SDAXPY, + NUMBENCH /* number of kernels; sizes the per-kernel arrays in main() */ +} benchmark; + +typedef struct { /* one row of the benchmarks[] table in main() */ + char* label; /* text printed at the start of a result row */ + int words; /* NOTE(review): presumably words transferred per iteration; the code consuming it is not visible here - confirm */ + int flops; /* NOTE(review): presumably flops per iteration; the code consuming it is not visible here - confirm */ +} benchmarkType; + +extern double init(double*, double, int); +extern double sum(double*, int); +extern double copy(double*, double*, int); +extern double update(double*, double, int); +extern double triad(double*, double*, double*, double, int); +extern double daxpy(double*, double*, double, int); +extern double striad(double*, double*, double*, double*, int); +extern double sdaxpy(double*, double*, double*, int); +extern void check(double*, double*, double*, double*, int); +extern double getTimeStamp(); + +int main (int argc, char** argv) /* Allocates four aligned arrays of N doubles, prints result rows built from avgtime/mintime/maxtime, calls check() and returns EXIT_SUCCESS. */ +{ + size_t bytesPerWord = sizeof(double); + size_t N = SIZE; + double *a, *b, *c, *d; + double scalar, tmp; + double E, S; + + double avgtime[NUMBENCH], + maxtime[NUMBENCH], + mintime[NUMBENCH]; + + double times[NUMBENCH][NTIMES]; + + benchmarkType benchmarks[NUMBENCH] = { /* initializer order: {label, words, flops} */ + {"Init: ", 1, 0}, + {"Sum: ", 1, 1}, + {"Copy: ", 2, 0}, + {"Update: ", 2, 1}, + {"Triad: ", 3, 2}, + {"Daxpy: ", 3, 2}, + {"STriad: ", 4, 2}, + {"SDaxpy: ", 4, 2} + }; + + posix_memalign((void**) &a, ARRAY_ALIGNMENT, N * bytesPerWord ); /* the return value of these four calls is not checked */ + posix_memalign((void**) &b, ARRAY_ALIGNMENT, N * bytesPerWord ); + posix_memalign((void**) &c, ARRAY_ALIGNMENT, N * bytesPerWord ); + posix_memalign((void**) &d, ARRAY_ALIGNMENT, N * bytesPerWord ); 
+ + for (int i=0; i 0){ /* NOTE(review): text between the loop header and this condition appears to be missing from this copy of the patch */ + printf("%s%11.2f %11.2f %11.4f %11.4f %11.4f\n", benchmarks[j].label, + 1.0E-06 * bytes/mintime[j], + 1.0E-06 * flops/mintime[j], + avgtime[j], + mintime[j], + maxtime[j]); + } else { /* same row, but prints "-" in place of the flops-based column */ + printf("%s%11.2f - %11.4f %11.4f %11.4f\n", benchmarks[j].label, + 1.0E-06 * bytes/mintime[j], + avgtime[j], + mintime[j], + maxtime[j]); + } + } + printf(HLINE); + check(a, b, c, d, N); + + return EXIT_SUCCESS; +} + +void check( + double * a, + double * b, + double * c, + double * d, + int N + ) /* Prints a failure report for the first array that fails validation, or "Solution Validates" if none does. */ +{ + double aj, bj, cj, dj, scalar; + double asum, bsum, csum, dsum; + double epsilon; + + /* reproduce initialization */ + aj = 2.0; + bj = 2.0; + cj = 0.5; + dj = 1.0; + + /* now execute timing loop */ + scalar = 3.0; + + for (int k=0; k epsilon) { /* NOTE(review): text between the loop header and this comparison appears to be missing from this copy of the patch */ + printf ("Failed Validation on array a[]\n"); + printf (" Expected : %f \n",aj); + printf (" Observed : %f \n",asum); + } + else if (ABS(bj-bsum)/bsum > epsilon) { /* deviation from the expected value, divided by the observed value; else-if chain reports only the first failure */ + printf ("Failed Validation on array b[]\n"); + printf (" Expected : %f \n",bj); + printf (" Observed : %f \n",bsum); + } + else if (ABS(cj-csum)/csum > epsilon) { + printf ("Failed Validation on array c[]\n"); + printf (" Expected : %f \n",cj); + printf (" Observed : %f \n",csum); + } + else if (ABS(dj-dsum)/dsum > epsilon) { + printf ("Failed Validation on array d[]\n"); + printf (" Expected : %f \n",dj); + printf (" Observed : %f \n",dsum); + } + else { + printf ("Solution Validates\n"); + } +} + +double getTimeStamp() /* Returns the CLOCK_MONOTONIC reading in seconds: tv_sec plus tv_nsec scaled by 1e-9. */ +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); /* return value is not checked */ + return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9; +} + +double init( + double * restrict a, + double scalar, + int N + ) +{ + double S, E; + + S = getTimeStamp(); +#pragma omp parallel for + for (int i=0; i