Merge branch 'master' of https://github.com/RRZE-HPC/TheBandwidthBenchmark

2021-01-18 07:54:21 +01:00
parent 86e57fd554 3729067798
commit 3ed9b9590f
26 changed files with 1717 additions and 476 deletions
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ It contains C modules for:
 * Accurate timing
 Moreover the benchmark showcases a simple generic Makefile that can be used in other projects.
 You may want to have a look at https://github.com/RRZE-HPC/TheBandwidthBenchmark/wiki for a collection of results that were created using TheBandwidthBenchmark.
 ## Overview
@@ -88,9 +89,9 @@ To run the benchmark call:
 The benchmark will output the results similar to the stream benchmark. Results are validated.
 For threaded execution it is recommended to control thread affinity.
-We recommend to use likwid-pin for benchmarking:
+We recommend to use likwid-pin for setting the number of threads used and to control thread affinity:
 ```
-likwid-pin -c 0-3 ./bwbench-GCC
+likwid-pin -C 0-3 ./bwbench-GCC
 ```
 Example output for threaded execution:
@@ -118,3 +119,42 @@ SDaxpy:        46822.63    23411.32      0.0281       0.0273       0.0325
 Solution Validates
 ```
 ## Scaling runs
 Apart from the highest sustained memory bandwidth, the scaling behavior within memory domains is also an important system property.
 There is a helper script included in util (```extractResults.pl```) that creates a text result file from multiple runs that can be used as input to plotting applications such as gnuplot and xmgrace.
 This involves two steps: Executing the benchmark runs and creating the data file.
 To run the benchmark for different thread counts within a memory domain execute (this assumes bash or zsh):
 ```
 $ for nt in 1 2 4 6 8 10; do likwid-pin -q -C E:M0:$nt:1:2 ./bwbench-ICC > dat/emmy-$nt.txt; done
 ```
 It is recommended to just use one thread per core in case the processor supports hyperthreading.
 Use whatever stepping you like, here a stepping of two was used.
 The ```-q``` option suppresses output from ```likwid-pin```.
 Above line uses the expression based syntax, on systems with hyperthreading enabled (check with, e.g., ```likwid-topology```) you have to skip the other hardware threads on each core.
 For above system with 2 hardware threads per core this results in ```-C E:M0:$nt:1:2```, on a system with 4 hardware threads per core you would need ```-C E:M0:$nt:1:4```.
 The string before the dash (here emmy) can be arbitrary, but the extraction script expects the thread count after the dash.
 Also the file ending has to be ```.txt```.
 Please check with a text editor on some result files if everything worked as expected.
 To extract the results and output in a plottable format execute:
 ```
 ./extractResults.pl ./dat
 ```
 The script will pick up all result files in the directory specified and create a column format output file.
 In this case:
 ```
 #nt	Init	Sum	Copy	Update	Triad	Daxpy	STriad	SDaxpy
 1	4109	11900	5637	8025	7407	9874	8981	11288
 2	8057	22696	11011	15174	14821	18786	17599	21475
 4	15602	39327	21020	28197	27287	33633	31939	37146
 6	22592	45877	29618	37155	36664	40259	39911	41546
 8	28641	46878	35763	40111	40106	41293	41022	41950
 10	33151	46741	38187	40269	39960	40922	40567	41606
 ```
 Please be aware that the single core memory bandwidth as well as the scaling behavior depend on the frequency settings.
--- a/include_CLANG.mk
+++ b/include_CLANG.mk
@@ -3,7 +3,8 @@ GCC  = gcc
 LINKER = $(CC)
 ifeq ($(ENABLE_OPENMP),true)
-OPENMP   = -fopenmp
+OPENMP   = -Xpreprocessor -fopenmp
 LIBS     = -lomp
 endif
 VERSION  = --version
@@ -12,4 +13,3 @@ CFLAGS   = -Ofast -std=c99 $(OPENMP)
 LFLAGS   = $(OPENMP)
 DEFINES  = -D_GNU_SOURCE
 INCLUDES =
 LIBS     =
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #ifdef __linux__
 #ifdef _OPENMP
 #include <stdlib.h>
@@ -38,8 +37,7 @@
 #define MAX_NUM_THREADS 128
 #define gettid() syscall(SYS_gettid)
-static int
+static int getProcessorID(cpu_set_t* cpu_set)
 getProcessorID(cpu_set_t* cpu_set)
 {
    int processorId;
@@ -53,8 +51,7 @@ getProcessorID(cpu_set_t* cpu_set)
    return processorId;
 }
-int
+int affinity_getProcessorId()
 affinity_getProcessorId()
 {
    cpu_set_t  cpu_set;
    CPU_ZERO(&cpu_set);
@@ -63,8 +60,7 @@ affinity_getProcessorId()
    return getProcessorID(&cpu_set);
 }
-void
+void affinity_pinThread(int processorId)
 affinity_pinThread(int processorId)
 {
    cpu_set_t cpuset;
    pthread_t thread;
@@ -75,8 +71,7 @@ affinity_pinThread(int processorId)
    pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
 }
-void
+void affinity_pinProcess(int processorId)
 affinity_pinProcess(int processorId)
 {
    cpu_set_t cpuset;
--- a/src/allocate.c
+++ b/src/allocate.c
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <stdlib.h>
 #include <stdio.h>
 #include <errno.h>
--- a/src/copy.c
+++ b/src/copy.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double copy(
--- a/src/daxpy.c
+++ b/src/daxpy.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double daxpy(
--- a/src/includes/affinity.h
+++ b/src/includes/affinity.h
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #ifndef AFFINITY_H
 #define AFFINITY_H
--- a/src/includes/allocate.h
+++ b/src/includes/allocate.h
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #ifndef __ALLOCATE_H_
 #define __ALLOCATE_H_
--- a/src/includes/likwid-marker.h
+++ b/src/includes/likwid-marker.h
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #ifndef LIKWID_MARKERS_H
 #define LIKWID_MARKERS_H
--- a/src/includes/timing.h
+++ b/src/includes/timing.h
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #ifndef __TIMING_H_
 #define __TIMING_H_
--- a/src/init.c
+++ b/src/init.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double init(
--- a/src/main.c
+++ b/src/main.c
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <stdlib.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -59,7 +58,6 @@
   _Pragma ("omp parallel") \
   {LIKWID_MARKER_STOP(#tag);}
 typedef enum benchmark {
    INIT = 0,
    SUM,
@@ -115,7 +113,7 @@ int main (int argc, char** argv)
    };
    LIKWID_MARKER_INIT;
-_Pragma("omp parallel")
+    _Pragma("omp parallel")
    {
        LIKWID_MARKER_REGISTER("INIT");
        LIKWID_MARKER_REGISTER("SUM");
@@ -146,7 +144,7 @@ _Pragma("omp parallel")
 #ifdef _OPENMP
    printf(HLINE);
-_Pragma("omp parallel")
+    _Pragma("omp parallel")
    {
        int k = omp_get_num_threads();
        int i = omp_get_thread_num();
@@ -177,13 +175,10 @@ _Pragma("omp parallel")
    scalar = 3.0;
    for ( int k=0; k < NTIMES; k++) {
        LIKWID_PROFILE(INIT,init(b, scalar, N));
        tmp = a[10];
        LIKWID_PROFILE(SUM,sum(a, N));
        a[10] = tmp;
        LIKWID_PROFILE(COPY,copy(c, a, N));
        LIKWID_PROFILE(UPDATE,update(a, scalar, N));
        LIKWID_PROFILE(TRIAD,triad(a, b, c, scalar, N));
--- a/src/sdaxpy.c
+++ b/src/sdaxpy.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double sdaxpy(
--- a/src/striad.c
+++ b/src/striad.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double striad(
--- a/src/sum.c
+++ b/src/sum.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double sum(
--- a/src/timing.c
+++ b/src/timing.c
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <stdlib.h>
 #include <time.h>
--- a/src/triad.c
+++ b/src/triad.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double triad(
--- a/src/update.c
+++ b/src/update.c
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #include <timing.h>
 double update(
--- a/util/README.md
+++ b/util/README.md
@@ -4,7 +4,9 @@ bwBench.c contains a single file version of The Bandwidth Benchmark that is tail
 It should compile with any C99 compiler.
-# Benchmarking skript
+# Benchmarking scripts
 ## bench.pl to determine the absolute highest main memory bandwidth
 Wrapper scripts in Perl (bench.pl) and Python (bench.py) are also provided to scan ranges of thread counts and determine the absolute highest sustained main memory bandwidth. In order to use them `likwid-pin` has to be in your path. Each script takes three required and one optional command line argument:
 ```
@@ -18,3 +20,26 @@ The script will always use physical cores only, where two SMT threads is the def
 ```
 $./bench.pl ./bwbench-GCC 14-24  10  1
 ```
 ## extractResults.pl to generate plottable output files from multiple scaling runs
 Please see how to use it in the toplevel [README](https://github.com/RRZE-HPC/TheBandwidthBenchmark#scaling-runs).
 ## benchmarkSystem.pl to benchmark a system and generate plots and markdown for the result wiki
 **Please use with care!**
 The script is designed to be used from the root of TheBandwidthBenchmark.
 This script cleans and builds the currently configured toolchain. It expects that all Likwid tools are in the path!
 Desired frequency settings must be already in place.
 Usage:
 ```
 perl ./benchmarkSystem.pl <DATA-DIR> <EXECUTABLE> <PREFIX>
 ```
 where ```<DATA-DIR>``` is the directory where you want to store all results and generated output.
 ```<EXECUTABLE>``` is the bwBench executable name, this must be in accordance to the configured tool chain in ```config.mk```. E.g. ```./bwBench-CLANG```.
 ```<PREFIX>``` is the file prefix for all generated output, e.g. Intel-Haswell .
--- a/util/bench.py
+++ b/util/bench.py
@@ -1,73 +0,0 @@
 #!/usr/bin/env python3
 # =======================================================================================
 #
 #      Author:   Thomas Gruber (tg), thomas.gruber@googlemail.com
 #      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
 #
 #      Permission is hereby granted, free of charge, to any person obtaining a copy
 #      of this software and associated documentation files (the "Software"), to deal
 #      in the Software without restriction, including without limitation the rights
 #      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 #      copies of the Software, and to permit persons to whom the Software is
 #      furnished to do so, subject to the following conditions:
 #
 #      The above copyright notice and this permission notice shall be included in all
 #      copies or substantial portions of the Software.
 #
 #      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 #      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 #      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 #      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 #      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 #      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 #      SOFTWARE.
 #
 # =======================================================================================
 import sys, subprocess, re

 # A result line starts with "<kernel>:" followed by whitespace and a number,
 # e.g. "Triad:   12345.67 ...". Group 1 is the kernel name, group 2 the rate.
 # Raw string: "\w", "\s", "\d" are regex escapes, not Python string escapes.
 default_regex = r"^(\w+):\s+([\d\.]+)"
 default_smt = 2


 def parse_thread_range(spec):
     """Parse "<minthreads>-<maxthreads>" into a tuple of two ints.

     Raises ValueError if spec does not consist of exactly two integers
     separated by a single dash.
     """
     parts = spec.split("-")
     if len(parts) != 2:
         raise ValueError("expected <minthreads>-<maxthreads>: {}".format(spec))
     return int(parts[0]), int(parts[1])


 def parse_rates(output, regex=default_regex):
     """Yield (kernel, rate) for every line of output that matches regex."""
     for line in output.split("\n"):
         m = re.search(regex, line)
         if m:
             yield m.group(1), float(m.group(2))


 def main(argv):
     """Scan a range of thread counts and report the highest rate seen.

     argv has the layout of sys.argv:
     <script> <command> <minthreads>-<maxthreads> <repeats> (<smt>).
     Returns the process exit code (0 on success, 1 on a usage error).
     """
     if len(argv) < 4 or len(argv) > 5:
         print("{} <command> <minthreads>-<maxthreads> <repeats> (<smt>)".format(argv[0]))
         print("Default <smt> value is {}".format(default_smt))
         return 1

     cmd = str(argv[1])

     # Only parse errors are caught here. The range check is done outside the
     # try block so that a rejected range prints exactly one message.
     try:
         minthreads, maxthreads = parse_thread_range(argv[2])
     except ValueError:
         print("<minthreads>-<maxthreads> option not readable: {}".format(argv[2]))
         return 1
     if minthreads == 0 or minthreads > maxthreads:
         print("Cannot use threads range values: {} {}".format(minthreads, maxthreads))
         return 1

     try:
         repeats = int(argv[3])
         smt = int(argv[4]) if len(argv) == 5 else default_smt
     except ValueError:
         print("<repeats> and <smt> have to be integers: {}".format(" ".join(argv[3:])))
         return 1

     maximum = bestthreads = 0
     bestkernel = "None"

     for numthreads in range(minthreads, maxthreads + 1):
         # The command is passed to the shell as given by the user on purpose.
         runcmd = "likwid-pin -c E:S0:{}:1:{} {}".format(numthreads, smt, cmd)
         for _ in range(repeats):
             # run() drains the pipe while waiting for the child; waiting
             # first and reading afterwards can block forever once the child
             # has filled the pipe buffer.
             p = subprocess.run(runcmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=True)
             if p.returncode != 0:
                 print("Execution failed: {}".format(runcmd))
                 continue
             for kernel, rate in parse_rates(p.stdout.decode('utf-8')):
                 if maximum < rate:
                     maximum = rate
                     bestthreads = numthreads
                     bestkernel = kernel

     print("{} was best using {} threads: {}".format(bestkernel, bestthreads, maximum))
     return 0


 if __name__ == "__main__":
     sys.exit(main(sys.argv))
--- a/util/benchmarkSystem.pl
+++ b/util/benchmarkSystem.pl
--- a/util/bwBench-likwid.c
+++ b/util/bwBench-likwid.c
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,7 @@
 *
 * =======================================================================================
 */
-
+#define _GNU_SOURCE
 #include <stdlib.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -53,6 +53,13 @@
 #define ABS(a) ((a) >= 0 ? (a) : -(a))
 #endif
 #define LIKWID_PROFILE(tag,call) \
    _Pragma ("omp parallel") \
   {LIKWID_MARKER_START(#tag);} \
   times[tag][k]  = call; \
   _Pragma ("omp parallel") \
   {LIKWID_MARKER_STOP(#tag);}
 typedef enum benchmark {
    INIT = 0,
    COPY,
@@ -139,7 +146,7 @@ int main (int argc, char** argv)
    }
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = 2.0;
        b[i] = 2.0;
@@ -150,13 +157,13 @@ int main (int argc, char** argv)
    scalar = 3.0;
    for ( int k=0; k < NTIMES; k++) {
-        times[INIT][k]   = init(b, scalar, N);
+        LIKWID_PROFILE(INIT,init(b, scalar, N));
-        times[COPY][k]   = copy(c, a, N);
+        LIKWID_PROFILE(COPY,copy(c, a, N));
-        times[UPDATE][k] = update(a, scalar, N);
+        LIKWID_PROFILE(UPDATE,update(a, scalar, N));
-        times[TRIAD][k]  = triad(a, b, c, scalar, N);
+        LIKWID_PROFILE(TRIAD,triad(a, b, c, scalar, N));
-        times[DAXPY][k]  = daxpy(a, b, scalar, N);
+        LIKWID_PROFILE(DAXPY,daxpy(a, b, scalar, N));
-        times[STRIAD][k] = striad(a, b, c, d, N);
+        LIKWID_PROFILE(STRIAD,striad(a, b, c, d, N));
-        times[SDAXPY][k] = sdaxpy(a, b, c, N);
+        LIKWID_PROFILE(SDAXPY,sdaxpy(a, b, c, N));
    }
    for (int j=0; j<NUMBENCH; j++) {
@@ -290,15 +297,10 @@ double init(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("INIT");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = scalar;
    }
        LIKWID_MARKER_STOP("INIT");
    }
    E = getTimeStamp();
    return E-S;
@@ -313,15 +315,10 @@ double copy(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("COPY");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = b[i];
    }
        LIKWID_MARKER_STOP("COPY");
    }
    E = getTimeStamp();
    return E-S;
@@ -336,15 +333,10 @@ double update(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("UPDATE");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = a[i] * scalar;
    }
        LIKWID_MARKER_STOP("UPDATE");
    }
    E = getTimeStamp();
    return E-S;
@@ -361,15 +353,10 @@ double triad(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("TRIAD");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = b[i] + scalar * c[i];
    }
        LIKWID_MARKER_STOP("TRIAD");
    }
    E = getTimeStamp();
    return E-S;
@@ -385,15 +372,10 @@ double daxpy(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("DAXPY");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = a[i] + scalar * b[i];
    }
        LIKWID_MARKER_STOP("DAXPY");
    }
    E = getTimeStamp();
    return E-S;
@@ -410,15 +392,10 @@ double striad(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("STRIAD");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = b[i] + d[i] * c[i];
    }
        LIKWID_MARKER_STOP("STRIAD");
    }
    E = getTimeStamp();
    return E-S;
@@ -434,15 +411,10 @@ double sdaxpy(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel
+#pragma omp parallel for schedule(static)
    {
        LIKWID_MARKER_START("SDAXPY");
 #pragma omp for
    for (int i=0; i<N; i++) {
        a[i] = a[i] + b[i] * c[i];
    }
        LIKWID_MARKER_STOP("SDAXPY");
    }
    E = getTimeStamp();
    return E-S;
--- a/util/bwBench.c
+++ b/util/bwBench.c
@@ -2,7 +2,7 @@
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *      Copyright (c) 2019 RRZE, University Erlangen-Nuremberg
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,6 @@
 *
 * =======================================================================================
 */
 #define _GNU_SOURCE
 #include <stdlib.h>
 #include <stdio.h>
@@ -126,7 +125,7 @@ int main (int argc, char** argv)
    }
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = 2.0;
        b[i] = 2.0;
@@ -276,7 +275,7 @@ double init(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = scalar;
    }
@@ -294,7 +293,7 @@ double copy(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = b[i];
    }
@@ -312,7 +311,7 @@ double update(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = a[i] * scalar;
    }
@@ -332,7 +331,7 @@ double triad(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = b[i] + scalar * c[i];
    }
@@ -351,7 +350,7 @@ double daxpy(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = a[i] + scalar * b[i];
    }
@@ -371,7 +370,7 @@ double striad(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = b[i] + d[i] * c[i];
    }
@@ -390,7 +389,7 @@ double sdaxpy(
    double S, E;
    S = getTimeStamp();
-#pragma omp parallel for
+#pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) {
        a[i] = a[i] + b[i] * c[i];
    }
--- a/util/extractResults.pl
+++ b/util/extractResults.pl
@@ -0,0 +1,91 @@
 #!/usr/bin/env perl
 # =======================================================================================
 #
 #      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 #      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 #
 #      Permission is hereby granted, free of charge, to any person obtaining a copy
 #      of this software and associated documentation files (the "Software"), to deal
 #      in the Software without restriction, including without limitation the rights
 #      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 #      copies of the Software, and to permit persons to whom the Software is
 #      furnished to do so, subject to the following conditions:
 #
 #      The above copyright notice and this permission notice shall be included in all
 #      copies or substantial portions of the Software.
 #
 #      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 #      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 #      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 #      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 #      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 #      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 #      SOFTWARE.
 #
 # =======================================================================================
 use strict;
 use warnings;
 use utf8;
 # Command line: <DIR> [<UNIT>]
 #   DIR   directory containing the result files of the scaling runs
 #   UNIT  optional: 'GB' scales all rates by 0.001, 'MB' (or no argument)
 #         leaves them unchanged, any other value is used as numeric factor
 my ($DIR, $UNIT) = @ARGV;
 if (not defined $DIR) {
   die "Need directory: $0 <DIR>\n";
 }
 if (not defined $UNIT or $UNIT eq 'MB') {
     $UNIT = 1.0;
 } elsif ( $UNIT eq 'GB' ) {
     $UNIT = 0.001;
 }
 # Results indexed by thread count, then by benchmark label.
 my %RES;
 my @testcases = ('Init', 'Sum', 'Copy', 'Update', 'Triad',  'Daxpy', 'STriad', 'SDaxpy');
 while( defined( my $file = glob($DIR . '/*' ) ) ) {
     # The thread count is taken from a "-<number>.txt" file name suffix,
     # files without such a suffix are filed under one thread.
     my $nt = 1;
     if ($file =~ /.*-([0-9]+)\.txt/) {
         $nt = $1;
     }
     # Report the real reason right away if a file cannot be read.
     open(my $fh, "<", $file) or die "can't open file $file: $!";
     $RES{$nt} = {};
     while ( my $line = <$fh> ) {
         # Result lines consist of exactly six space separated columns:
         # the label followed by the rate and further values.
         my @fields = split(/[ ]+/, $line);
         next if @fields != 6;
         if ( $fields[1] =~ /[0-9]+/ ) {
             $fields[0] =~ s/://;
             $RES{$nt}->{$fields[0]} = $fields[1] * $UNIT;
         }
     }
     close $fh or die "can't close file $!";
 }
 # Header line of the column format output.
 printf "#nt";
 foreach my $test ( @testcases ) {
     printf "\t%s", $test;
 }
 printf "\n";
 # One row per thread count in ascending numerical order.
 foreach my $key (sort {$a <=> $b} keys %RES) {
     printf "%d", $key;
     foreach my $test ( @testcases ) {
         # A benchmark missing from a result file is printed as zero.
         my $value = defined $RES{$key}->{$test} ? $RES{$key}->{$test} : 0;
         if ( $UNIT > 0.1 ) {
             printf "\t%.0f", $value;
         } else {
             printf "\t%.2f", $value;
         }
     }
     printf "\n";
 }
--- a/util/golang/README.md
+++ b/util/golang/README.md
@@ -1,9 +0,0 @@
 # Initialize Go module
 `go mod init github.com/RRZE-HPC/TheBandwidthBenchmark/tree/master/util/golang`
 # Run
 Choose nt option to set number of threads used.
 `go run bwBench.go -nt 4`
--- a/util/golang/bwBench.go
+++ b/util/golang/bwBench.go
@@ -1,266 +0,0 @@
 /*
 * =======================================================================================
 *
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 *      of this software and associated documentation files (the "Software"), to deal
 *      in the Software without restriction, including without limitation the rights
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *      copies of the Software, and to permit persons to whom the Software is
 *      furnished to do so, subject to the following conditions:
 *
 *      The above copyright notice and this permission notice shall be included in all
 *      copies or substantial portions of the Software.
 *
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 *      SOFTWARE.
 *
 * =======================================================================================
 */
 package main
 import (
 	"flag"
 	"fmt"
 	"math"
 	"sync"
 	"time"
 )
 // bench is the signature of a benchmark kernel. The kernels in this file
 // receive a thread id and the wait group on which they signal completion.
 type bench func(int, *sync.WaitGroup)
 // benchmark describes one entry of the result table.
 type benchmark struct {
 	// label is the name printed in front of the results.
 	label string
 	// words is multiplied by 8 and by the array length to get the byte count
 	// used for the reported data rate.
 	words float64
 	// flops is multiplied by the array length to get the operation count used
 	// for the reported flop rate. Zero suppresses the flop rate column.
 	flops float64
 	// fct is the kernel that is started once per thread.
 	fct   bench
 }
 // Min returns the smaller of the two integers x and y.
 func Min(x, y int) int {
 	smaller := y
 	if x < y {
 		smaller = x
 	}
 	return smaller
 }
 // getChunk returns the half-open index range [is, ie) of the N loop
 // iterations that thread tid out of numThreads threads has to process.
 //
 // The chunks of threads 0..numThreads-1 are contiguous and together cover
 // [0, N) completely. If N is not a multiple of numThreads the first
 // N % numThreads threads process one additional iteration each, so no
 // trailing elements are left out. For invalid arguments (N <= 0,
 // numThreads <= 0 or tid outside of 0..numThreads-1) the empty range (0, 0)
 // is returned.
 func getChunk(N int, tid int, numThreads int) (is, ie int) {
 	if N <= 0 || numThreads <= 0 || tid < 0 || tid >= numThreads {
 		return 0, 0
 	}
 	cs := N / numThreads
 	rem := N % numThreads
 	// Every thread with an id below rem owns cs+1 iterations. The start of
 	// a chunk is therefore shifted by the number of enlarged chunks in
 	// front of it.
 	shift := rem
 	if tid < rem {
 		shift = tid
 	}
 	is = tid*cs + shift
 	ie = is + cs
 	if tid < rem {
 		ie++
 	}
 	return is, ie
 }
 // main runs every benchmark kernel NTIMES times using the number of threads
 // given by the -nt flag, prints a result table with data rate, flop rate and
 // average/minimum/maximum runtime per kernel and finally validates the
 // contents of the arrays.
 func main() {
 	const NTIMES int = 4
 	var N int = 40000000
 	var scalar float64 = 3.0

 	numThreads := flag.Int("nt", 4, "Number of threads")
 	flag.Parse()
 	// Reject thread counts that cannot be used to split the arrays before
 	// any memory is allocated.
 	if *numThreads < 1 {
 		fmt.Printf("Number of threads must be at least 1, got %d\n", *numThreads)
 		return
 	}

 	a := make([]float64, N)
 	b := make([]float64, N)
 	c := make([]float64, N)
 	d := make([]float64, N)
 	for i := 0; i < N; i++ {
 		a[i] = 2.0
 		b[i] = 2.0
 		c[i] = 0.5
 		d[i] = 1.0
 	}

 	// Every kernel processes the chunk of the arrays that belongs to its
 	// thread id and signals completion on the wait group.
 	benchmarks := [...]benchmark{
 		{label: "Init", words: 1, flops: 0,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					b[i] = scalar
 				}
 			}},
 		{label: "Copy", words: 2, flops: 0,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					c[i] = a[i]
 				}
 			}},
 		{label: "Update", words: 2, flops: 1,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					a[i] = a[i] * scalar
 				}
 			}},
 		{label: "Triad", words: 3, flops: 2,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					a[i] = b[i] + scalar*c[i]
 				}
 			}},
 		{label: "Daxpy", words: 3, flops: 2,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					a[i] = a[i] + scalar*b[i]
 				}
 			}},
 		{label: "STriad", words: 4, flops: 2,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					a[i] = b[i] + d[i]*c[i]
 				}
 			}},
 		{label: "SDaxpy", words: 4, flops: 2,
 			fct: func(threadId int, wg *sync.WaitGroup) {
 				defer wg.Done()
 				is, ie := getChunk(N, threadId, *numThreads)
 				for i := is; i < ie; i++ {
 					a[i] = a[i] + b[i]*c[i]
 				}
 			}}}

 	var min, max, avg [len(benchmarks)]float64
 	var times [len(benchmarks)][NTIMES]float64

 	for i := 0; i < len(benchmarks); i++ {
 		avg[i], max[i] = 0.0, 0.0
 		min[i] = math.MaxFloat64
 	}

 	// All kernels are executed in every repetition because the validation
 	// replays exactly this sequence.
 	for k := 0; k < NTIMES; k++ {
 		for j := 0; j < len(benchmarks); j++ {
 			times[j][k] = execBench(*numThreads, benchmarks[j].fct)
 		}
 	}

 	// The first repetition is excluded from the statistics, which is why
 	// the average below is formed over NTIMES-1 samples.
 	for j := 0; j < len(benchmarks); j++ {
 		for k := 1; k < NTIMES; k++ {
 			avg[j] = avg[j] + times[j][k]
 			min[j] = math.Min(min[j], times[j][k])
 			max[j] = math.Max(max[j], times[j][k])
 		}
 	}

 	fmt.Println("----------------------------------------------------------------------------")
 	fmt.Printf("Function      Rate(MB/s)  Rate(MFlop/s)  Avg time     Min time     Max time\n")
 	for j := 0; j < len(benchmarks); j++ {
 		avg[j] = avg[j] / float64(NTIMES-1)
 		// Every word is a float64, i.e. 8 bytes.
 		bytes := benchmarks[j].words * 8.0 * float64(N)
 		flops := benchmarks[j].flops * float64(N)

 		// Rates are based on the fastest measured repetition.
 		if flops > 0 {
 			fmt.Printf("%s%11.2f %11.2f %11.4f  %11.4f  %11.4f\n", benchmarks[j].label,
 				1.0E-06*bytes/min[j], 1.0E-06*flops/min[j],
 				avg[j], min[j], max[j])
 		} else {
 			fmt.Printf("%s%11.2f    -        %11.4f  %11.4f  %11.4f\n", benchmarks[j].label,
 				1.0E-06*bytes/min[j], avg[j], min[j], max[j])
 		}
 	}
 	fmt.Println("----------------------------------------------------------------------------")
 	check(a, b, c, d, N, NTIMES)
 }
 // check validates the arrays a, b, c and d after NTIMES repetitions of the
 // benchmark sequence. The sequence is replayed on one scalar per array, the
 // result is scaled by N and compared with the sum over the first N elements
 // of the array. The first array whose relative deviation exceeds 1e-8 is
 // reported together with the expected and the observed sum, otherwise
 // "Solution Validates" is printed.
 func check(
 	a []float64,
 	b []float64,
 	c []float64,
 	d []float64,
 	N int, NTIMES int) {
 	// Start from the values the arrays were initialized with.
 	wantA, wantB, wantC, wantD := 2.0, 2.0, 0.5, 1.0
 	scalar := 3.0

 	// Replay one pass over all kernels per repetition.
 	for rep := 0; rep < NTIMES; rep++ {
 		wantB = scalar
 		wantC = wantA
 		wantA = wantA * scalar
 		wantA = wantB + scalar*wantC
 		wantA = wantA + scalar*wantB
 		wantA = wantB + wantC*wantD
 		wantA = wantA + wantB*wantC
 	}

 	// All elements of an array hold the same value, hence the expected sum
 	// is the element value times the number of elements.
 	wantA = wantA * float64(N)
 	wantB = wantB * float64(N)
 	wantC = wantC * float64(N)
 	wantD = wantD * float64(N)

 	var gotA, gotB, gotC, gotD float64
 	for i := 0; i < N; i++ {
 		gotA += a[i]
 		gotB += b[i]
 		gotC += c[i]
 		gotD += d[i]
 	}

 	epsilon := 1.e-8
 	comparisons := [...]struct {
 		headline string
 		expected float64
 		observed float64
 	}{
 		{"Failed Validation on array a[]\n", wantA, gotA},
 		{"Failed Validation on array b[]\n", wantB, gotB},
 		{"Failed Validation on array c[]\n", wantC, gotC},
 		{"Failed Validation on array d[]\n", wantD, gotD},
 	}

 	// Only the first failing array is reported.
 	for _, cmp := range comparisons {
 		if math.Abs(cmp.expected-cmp.observed)/cmp.observed > epsilon {
 			fmt.Print(cmp.headline)
 			fmt.Printf("        Expected  : %f \n", cmp.expected)
 			fmt.Printf("        Observed  : %f \n", cmp.observed)
 			return
 		}
 	}
 	fmt.Printf("Solution Validates\n")
 }
 // execBench starts fnc once per thread id in 0..numThreads-1 as a goroutine,
 // waits until all of them have signalled completion on the shared wait group
 // and returns the elapsed wall clock time in seconds.
 func execBench(numThreads int, fnc bench) float64 {
 	var pending sync.WaitGroup
 	pending.Add(numThreads)

 	start := time.Now()
 	for tid := 0; tid < numThreads; tid++ {
 		go fnc(tid, &pending)
 	}
 	pending.Wait()

 	return time.Since(start).Seconds()
 }