From c774eb25a3bd225048fdf9a34ffd51bb50c909ba Mon Sep 17 00:00:00 2001 From: reble Date: Tue, 1 May 2018 10:29:46 -0500 Subject: [PATCH 001/325] Add TBB Flow Graph version of p2p Pipeline kernel --- Cxx11/p2p-tasks-tbb.cc | 290 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 Cxx11/p2p-tasks-tbb.cc diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc new file mode 100644 index 000000000..02a2d3cc9 --- /dev/null +++ b/Cxx11/p2p-tasks-tbb.cc @@ -0,0 +1,290 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// TBB implementation by Pablo Reble, April 2018. 
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+
+#include "tbb/flow_graph.h"
+#include "tbb/parallel_for.h"
+
+inline void sweep_tile(int startm, int endm,
+                       int startn, int endn,
+                       int n, double grid[])
+{
+  for (auto i=startm; i<endm; i++) {
+    for (auto j=startn; j<endn; j++) {
+      grid[i*n+j] = grid[(i-1)*n+j] + grid[i*n+(j-1)] - grid[(i-1)*n+(j-1)];
+    }
+  }
+}
+
+class block_node_body {
+  const int startm, endm;
+  const int startn, endn;
+  const int n;
+  double * const grid;
+
+public:
+
+  block_node_body( int _startm, int _endm,
+                   int _startn, int _endn,
+                   int _n, double _grid[] ) :
+    grid(_grid),
+    startm(_startm), endm(_endm),
+    startn(_startn), endn(_endn),
+    n(_n) { }
+
+  void operator()( const tbb::flow::continue_msg & ) {
+    sweep_tile(startm, endm, startn, endn, n, grid);
+  }
+};
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/TBB Flow Graph pipeline execution on 2D grid" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int m, n;
+  int mc, nc;
+  try {
+      if (argc < 4) {
+        throw "Usage: <# iterations> <first array dimension> <second array dimension> [<first chunk dimension> <second chunk dimension>]";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      m = std::atoi(argv[2]);
+      n = std::atoi(argv[3]);
+      if (m < 1 || n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( static_cast<size_t>(m)*static_cast<size_t>(n) > INT_MAX) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // grid chunk dimensions
+      mc = (argc > 4) ? std::atoi(argv[4]) : m;
+      nc = (argc > 5) ? std::atoi(argv[5]) : n;
+      if (mc < 1 || mc > m || nc < 1 || nc > n) {
+        std::cout << "WARNING: grid chunk dimensions invalid: " << mc << nc << " (ignoring)" << std::endl;
+        mc = m;
+        nc = n;
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  const char* envvar = std::getenv("TBB_NUM_THREADS");
+  int num_threads = (envvar!=NULL) ? std::atoi(envvar) : tbb::task_scheduler_init::default_num_threads();
+  tbb::task_scheduler_init init(num_threads);
+
+  std::cout << "Number of threads = " << num_threads << std::endl;
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes = " << m << ", " << n << std::endl;
+  std::cout << "Grid chunk sizes = " << mc << ", " << nc << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Create Grid and allocate space
+  //////////////////////////////////////////////////////////////////////
+  // calculate number of tiles in n and m direction to create grid.
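+  // (integer division rounds down, so a grid edge that does not divide evenly
+  //  into whole chunks gets one extra, partial tile: e.g. n=1000, nc=300 gives 4 tiles)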
+ int num_blocks_n = (n / nc); + if(n%nc != 0) num_blocks_n++; + int num_blocks_m = (m / mc); + if(m%mc != 0) num_blocks_m++; + + auto pipeline_time = 0.0; // silence compiler warning + + double * grid = new double[m*n]; + + typedef tbb::flow::continue_node< tbb::flow::continue_msg > block_node_t; + + graph g; + block_node_t *nodes[ num_blocks_n * num_blocks_m ]; + // To enable tracing support for Flow Graph Analyzer + // set following MACRO and link against TBB preview library (-ltbb_preview) +#if TBB_PREVIEW_FLOW_GRAPH_TRACE + char buffer[1024]; + g.set_name("Pipeline"); +#endif + + bool first_iter=true; + block_node_t b(g, [&](const tbb::flow::continue_msg &){ + grid[0*n+0] = -grid[(m-1)*n+(n-1)]; + if(first_iter) pipeline_time = prk::wtime(); + first_iter = false; + }); + for (int i=0; iset_name( buffer ); +#endif + nodes[i*num_blocks_n + j] = tmp; + if (i>0) + make_edge(*nodes[(i-1)*num_blocks_n + j ], *tmp ); + if (j>0) + make_edge(*nodes[ i *num_blocks_n + j-1], *tmp ); + // Transitive dependencies from OpenMP task version: + //make_edge( *tmp, b ); + //if (i>0 && j>0) + // make_edge(*nodes[(i-1)*num_blocks_n + j-1], *tmp ); + } + } + auto start = true; + source_node s(g, [&](continue_msg &v) -> bool { + if(start) { + v = continue_msg(); + start = false; + return true; + } + return false; + }, false); + + limiter_node l(g, iterations+1, 1); + + make_edge( s, l ); + make_edge( l, *nodes[0] ); + make_edge( *nodes[(num_blocks_n * num_blocks_m) - 1], b); + make_edge( b, l ); + +#if TBB_PREVIEW_FLOW_GRAPH_TRACE + s.set_name("Source"); + b.set_name("Iteration Barrier"); + l.set_name("Limiter"); +#endif + + ////////////////////////////////////////////////////////////////////// + // Perform the computation + ////////////////////////////////////////////////////////////////////// + + { + + tbb::blocked_range2d range(0, m, mc, 0, n, nc); + tbb::parallel_for( range, [&](decltype(range)& r) { + for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { + for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) { + grid[i*n+j] = 0.0; + } + } + }, tbb_partitioner); + for (auto j=0; j(j); + } + for (auto i=0; i(i); + } + + s.activate(); + g.wait_for_all(); + + pipeline_time = prk::wtime() - pipeline_time; + + } + + ////////////////////////////////////////////////////////////////////// + // Cleanup Flow Graph + ////////////////////////////////////////////////////////////////////// + + for (int i=0; i epsilon) { + std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (m-1.)*(n-1.) 
)/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} From d273ab6a62690ac083615b08d8624846f4c92ba0 Mon Sep 17 00:00:00 2001 From: reble Date: Wed, 2 May 2018 16:23:14 -0500 Subject: [PATCH 002/325] add target to Makefile --- Cxx11/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3a26ead0b..484a232b4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -69,7 +69,7 @@ all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ - p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc + p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ @@ -102,7 +102,7 @@ opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ - p2p-hyperplane-vector-tbb + p2p-hyperplane-vector-tbb p2p-tasks-tbb stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl From b0b2aea9398d48a9d209e3614980938a6dedf4cc Mon Sep 17 00:00:00 2001 From: reble Date: Wed, 2 May 2018 16:32:53 -0500 Subject: [PATCH 003/325] Add TBB flags for Flow Graph Analyzer tracing support --- common/make.defs.intel | 1 + 1 file changed, 1 insertion(+) diff --git a/common/make.defs.intel b/common/make.defs.intel index 0dea4bb44..bd71a8ac2 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -63,6 +63,7 @@ CILKFLAG=-intel-extensions # default # TBB # TBBFLAG=-DUSE_TBB -tbb +#TBBFLAG=-DUSE_TBB -tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE # # Parallel STL, Boost, etc. 
# From 7824e260a473ae8d98f942e9d9e12c7397d6086b Mon Sep 17 00:00:00 2001 From: reble Date: Thu, 3 May 2018 18:26:25 -0500 Subject: [PATCH 004/325] replace block_node_body with lambda expression --- Cxx11/p2p-tasks-tbb.cc | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc index 02a2d3cc9..3142a1e42 100644 --- a/Cxx11/p2p-tasks-tbb.cc +++ b/Cxx11/p2p-tasks-tbb.cc @@ -76,27 +76,6 @@ inline void sweep_tile(int startm, int endm, } } -class block_node_body { - const int startm, endm; - const int startn, endn; - const int n; - double * const grid; - -public: - - block_node_body( int _startm, int _endm, - int _startn, int _endn, - int _n, double _grid[] ) : - grid(_grid), - startm(_startm), endm(_endm), - startn(_startn), endn(_endn), - n(_n) { } - - void operator()( const tbb::flow::continue_msg & ) { - sweep_tile(startm, endm, startn, endn, n, grid); - } -}; - int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; @@ -187,7 +166,9 @@ int main(int argc, char* argv[]) }); for (int i=0; iset_name( buffer ); From f5595d379b4f9e2b768c8096d67903722a59b3e1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Feb 2020 13:14:17 -0800 Subject: [PATCH 005/325] remove HPX-5, HPX-3 is the only HPX from now on --- travis/build-run-prk.sh | 3 --- travis/install-deps.sh | 9 ++----- travis/install-hpx.sh | 40 +++++++++++++++++++++++++++++ travis/install-hpx3.sh | 57 ----------------------------------------- travis/install-hpx5.sh | 39 ---------------------------- 5 files changed, 42 insertions(+), 106 deletions(-) create mode 100755 travis/install-hpx.sh delete mode 100755 travis/install-hpx3.sh delete mode 100755 travis/install-hpx5.sh diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index c9e98f2f0..14e505265 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -1134,9 +1134,6 @@ case "$PRK_TARGET" in allhpx3) echo "Nothing to do yet" ;; - allhpx5) - echo "Nothing to do yet" - ;; alllegion) echo "Legion" echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 433ebc44a..3c4fc29a1 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -169,16 +169,11 @@ case "$PRK_TARGET" in echo "Chapel" sh ./travis/install-chapel.sh $TRAVIS_ROOT ;; - allhpx3) - echo "HPX-3" + allhpx) + echo "HPX" sh ./travis/install-cmake.sh $TRAVIS_ROOT sh ./travis/install-hpx3.sh $TRAVIS_ROOT ;; - allhpx5) - echo "HPX-5" - sh ./travis/install-autotools.sh $TRAVIS_ROOT - sh ./travis/install-hpx5.sh $TRAVIS_ROOT - ;; alllegion) echo "Legion" # GASNet is not needed, it seems diff --git a/travis/install-hpx.sh b/travis/install-hpx.sh new file mode 100755 index 000000000..5fa0cda89 --- /dev/null +++ b/travis/install-hpx.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +set -e +set -x + +if [ -f ~/use-intel-compilers ] ; then + export CC=icc + export CXX=icpc + export FC=ifort +fi + +TRAVIS_ROOT="$1" + +case "$TRAVIS_OS_NAME" in + linux) + ;; + osx) + set +e + brew update + for p in boost jemalloc gperftools ; do + brew install $p || brew upgrade $p + done + set -e + ;; +esac + +if [ ! -d "$TRAVIS_ROOT/hpx" ]; then + cd $TRAVIS_ROOT + git clone --depth 1 https://github.com/STEllAR-GROUP/hpx.git hpx-source + cd hpx-source + mkdir build + cd build + cmake .. 
-DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF + make -j2 + # make check # target does not exist + make install +else + echo "HPX installed..." + find $TRAVIS_ROOT/hpx +fi diff --git a/travis/install-hpx3.sh b/travis/install-hpx3.sh deleted file mode 100755 index 50bf6878d..000000000 --- a/travis/install-hpx3.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/sh - -set -e -set -x - -if [ -f ~/use-intel-compilers ] ; then - export CC=icc - export CXX=icpc - export FC=ifort -fi - -TRAVIS_ROOT="$1" - -case "$TRAVIS_OS_NAME" in - linux) - ;; - osx) - set +e - brew update - if [ "$USE_HPX_TARBALL" ] ; then - export HPX_BOOST="homebrew/versions/boost155" - else - export HPX_BOOST="boost" - fi - for p in $HPX_BOOST jemalloc gperftools ; do - brew install $p || brew upgrade $p - done - set -e - ;; -esac - -if [ ! -d "$TRAVIS_ROOT/hpx3" ]; then - cd $TRAVIS_ROOT - #if [ "$USE_HPX_TARBALL" ] ; then - # wget -q --no-check-certificate http://stellar.cct.lsu.edu/files/hpx_0.9.11.tar.bz2 - # if [ `which md5` ] ; then - # echo "MD5 signature is:" - # md5 hpx_0.9.11.tar.bz2 - # echo "MD5 signature should be:" - # echo "86a71189fb6344d27bf53d6aa2b33122" - # fi - # tar -xjf hpx_0.9.11.tar.bz2 - # cd hpx_0.9.11 - #else - git clone --depth 1 https://github.com/STEllAR-GROUP/hpx.git hpx3-source - cd hpx3-source - #fi - mkdir build - cd build - cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx3 -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF - make -j2 - # make check # target does not exist - make install -else - echo "HPX-3 installed..." - find $TRAVIS_ROOT/hpx3 -fi diff --git a/travis/install-hpx5.sh b/travis/install-hpx5.sh deleted file mode 100755 index 6fa6f29d9..000000000 --- a/travis/install-hpx5.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh - -set -e -set -x - -if [ -f ~/use-intel-compilers ] ; then - export CC=icc - export CXX=icpc - export FC=ifort -fi - -TRAVIS_ROOT="$1" - -if [ ! -d "$TRAVIS_ROOT/hpx5" ] ; then - cd $TRAVIS_ROOT - if [ "0" = "1" ] ; then - wget -q --no-check-certificate http://hpx.crest.iu.edu/release/HPX_Release_v2.0.0.tar.gz - if [ `which shasum` ] ; then - echo "SHA-256 signature is:" - shasum -a 256 HPX_Release_v2.0.0.tar.gz - echo "SHA-256 signature should be:" - echo "647c5f0ef3618f734066c91d741021d7bd38cf21" - fi - tar -xzf HPX_Release_v2.0.0.tar.gz - cd HPX_Release_v2.0.0/hpx - else - export GIT_SSL_NO_VERIFY=1 - git clone --depth 1 http://gitlab.crest.iu.edu/extreme/hpx.git hpx5-source - cd hpx5-source - fi - ./bootstrap - ./configure --prefix=$TRAVIS_ROOT/hpx5 - make -j2 - make check - make install -else - echo "HPX-5 installed..." 
- find $TRAVIS_ROOT/hpx5 -name hpx-config -fi From 042b1855aa7618669e242682c36fb7574134d57f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Feb 2020 22:46:03 -0800 Subject: [PATCH 006/325] HPX is WIP --- Cxx11/Makefile | 7 ++ Cxx11/nstream-hpx.cc | 176 +++++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_hpx.h | 41 ++++++++++ doc/HPX.md | 13 ++++ 4 files changed, 237 insertions(+) create mode 100644 Cxx11/nstream-hpx.cc create mode 100644 Cxx11/prk_hpx.h create mode 100644 doc/HPX.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e3b9e76fe..2adb8c486 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,6 +44,7 @@ RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA +HPXFLAGS = -I$(HPXDIR)/include -DUSE_HPX -L$(HPXDIR)/lib $(BOOSTFLAG) $(HWLOCFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 @@ -122,6 +123,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range kokkos: stencil-kokkos transpose-kokkos nstream-kokkos +hpx: nstream-hpx + raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja @@ -193,6 +196,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +%-hpx: %-hpx.cc prk_util.h prk_hpx.h + $(CXX) $(CXXFLAGS) $< $(HPXFLAGS) -o $@ + ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h ${KOKKOSDIR}/bin/nvcc_wrapper $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@ @@ -262,6 +268,7 @@ clean: -rm -f *-rangefor -rm -f *-raja -rm -f *-kokkos + -rm -f *-hpx -rm -f *-thrust -rm -f *-cuda -rm -f *-cublas diff --git a/Cxx11/nstream-hpx.cc b/Cxx11/nstream-hpx.cc new file mode 100644 index 000000000..d41f5fe6d --- /dev/null +++ b/Cxx11/nstream-hpx.cc @@ -0,0 +1,176 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_hpx.h" +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + std::vector A(length); + std::vector B(length); + std::vector C(length); + + auto range = prk::range(static_cast(0), length); + + double scalar(3); + + { + std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + A[i] = 0; + B[i] = 2; + C[i] = 2; + }); + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + A[i] += B[i] + scalar * C[i]; + }); + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/prk_hpx.h b/Cxx11/prk_hpx.h new file mode 100644 index 000000000..36c523eb1 --- /dev/null +++ b/Cxx11/prk_hpx.h @@ -0,0 +1,41 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_HPX_H +#define PRK_HPX_H + +#include +#include + +#include +#include + +#endif /* PRK_HPX_H */ diff --git a/doc/HPX.md b/doc/HPX.md new file mode 100644 index 000000000..1fce480f7 --- /dev/null +++ b/doc/HPX.md @@ -0,0 +1,13 @@ +# + +```sh +cmake .. -DCMAKE_INSTALL_PREFIX=$PRK_DIR/Cxx11/hpx \ + -DCMAKE_CXX_COMPILER=/usr/local/Cellar/llvm/9.0.1/bin/clang++ \ + -DCMAKE_C_COMPILER=/usr/local/Cellar/llvm/9.0.1/bin/clang \ + -DHPX_WITH_TESTS:BOOL=Off \ + -DHPX_WITH_TESTS_BENCHMARKS:BOOL=Off \ + -DHPX_WITH_TESTS_EXAMPLES:BOOL=Off \ + -DHPX_WITH_TESTS_REGRESSIONS:BOOL=Off \ + -DHPX_WITH_TESTS_UNIT:BOOL=Off +make install +``` From 8c63805f1d6267beaa91398618f4c494224977c7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:46:26 -0800 Subject: [PATCH 007/325] add HPX flags to make.defs. Signed-off-by: Jeff Hammond --- Cxx11/Makefile | 2 +- common/make.defs.gcc | 44 ++++++++++++++++++++++++++++++++++++++++-- common/make.defs.intel | 36 +++++++++++++++++++++++++--------- common/make.defs.llvm | 13 ++++++++++--- 4 files changed, 80 insertions(+), 15 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 2adb8c486..478b29cb4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -197,7 +197,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ %-hpx: %-hpx.cc prk_util.h prk_hpx.h - $(CXX) $(CXXFLAGS) $< $(HPXFLAGS) -o $@ + $(HPXCXX) --exe=$@ $(CXXFLAGS) $(HPXFLAGS) $< ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 8df7db087..4fd4a74ff 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -44,6 +44,9 @@ ORNLACCFLAG=-fopenacc # # MacOS OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... +#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL @@ -52,13 +55,33 @@ METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 +# CodePlay ComputeCpp +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... 
+#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... #SYCLDIR=./triSYCL #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # -METALFLAG=-framework MetalPerformanceShaders # # OCCA # @@ -71,7 +94,7 @@ METALFLAG=-framework MetalPerformanceShaders # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBDIR=/usr/local/Cellar/tbb/2020_U0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -79,6 +102,9 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. # #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include +BOOSTFLAG+=-I${BOOSTROOT} +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} @@ -89,6 +115,10 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HWLOCFLAG=-I/usr/local/include # # CBLAS for C++ DGEMM # @@ -105,6 +135,16 @@ CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 +# # ISPC # ISPC=ispc diff --git a/common/make.defs.intel b/common/make.defs.intel index cab461c08..145b1e750 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -42,19 +42,37 @@ OFFLOADFLAG=-qopenmp-offload=host # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 +# CodePlay ComputeCpp +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include -# ProGTX -# https://github.com/ProGTX/sycl-gtx -#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -#SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # # OCCA # @@ -62,7 +80,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # Cilk # -CILKFLAG=-intel-extensions # default +#CILKFLAG=-intel-extensions # default # # TBB # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 6a668bf14..b65febe80 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -85,7 +85,7 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL -SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # # OCCA @@ -95,7 +95,7 @@ SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBDIR=/usr/local/Cellar/tbb/2020_U0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -103,15 +103,22 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. # #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include +BOOSTFLAG+=-I${BOOSTROOT} +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -Wno-\#pragma-messages -DUSE_INTEL_PSTL -I./pstl/include KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... 
+HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HWLOCFLAG=-I/usr/local/include # # CBLAS for C++ DGEMM # From 0556b39e400134cd586a5b79dd3fc43730876ca2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:53:02 -0800 Subject: [PATCH 008/325] update HPX flags --- Cxx11/Makefile | 2 +- common/make.defs.gcc | 3 ++- common/make.defs.intel | 5 +++++ common/make.defs.llvm | 3 ++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 478b29cb4..72526a38a 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,7 +44,7 @@ RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA -HPXFLAGS = -I$(HPXDIR)/include -DUSE_HPX -L$(HPXDIR)/lib $(BOOSTFLAG) $(HWLOCFLAG) $(RANGEFLAGS) +HPXFLAGS = -DUSE_HPX $(HPXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 4fd4a74ff..7e0736211 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -116,9 +116,10 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # HPX is more complicated... +HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx -HWLOCFLAG=-I/usr/local/include +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.intel b/common/make.defs.intel index 145b1e750..92a0d4e64 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -99,6 +99,11 @@ RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# HPX is more complicated... +HWLOCFLAG=-I/usr/local/include +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index b65febe80..4021f6c6d 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -116,9 +116,10 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # HPX is more complicated... 
+HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx -HWLOCFLAG=-I/usr/local/include +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # From 5c799cc1faf5261a6a8a05399246eff5fb2cc0b6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 10:53:32 -0800 Subject: [PATCH 009/325] this template is really dated at this point --- common/{make.defs.in => make.defs.old} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename common/{make.defs.in => make.defs.old} (100%) diff --git a/common/make.defs.in b/common/make.defs.old similarity index 100% rename from common/make.defs.in rename to common/make.defs.old From d8ef1bdec5f9b27d08655f9ae47ff2b75573d743 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:02:07 -0800 Subject: [PATCH 010/325] use HPX for_each --- Cxx11/nstream-hpx.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/nstream-hpx.cc b/Cxx11/nstream-hpx.cc index d41f5fe6d..47d1b775c 100644 --- a/Cxx11/nstream-hpx.cc +++ b/Cxx11/nstream-hpx.cc @@ -120,7 +120,7 @@ int main(int argc, char * argv[]) double scalar(3); { - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + hpx::parallel::for_each(hpx::parallel::execution::seq, std::begin(range), std::end(range), [&] (size_t i) { A[i] = 0; B[i] = 2; C[i] = 2; @@ -130,7 +130,7 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + hpx::parallel::for_each(hpx::parallel::execution::seq, std::begin(range), std::end(range), [&] (size_t i) { A[i] += B[i] + scalar * C[i]; }); } From 21621722e7e93426ec6e9b35ae211ab1e2ce6fbe Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:20:10 -0800 Subject: [PATCH 011/325] UPC++ support --- travis/install-deps.sh | 1 + travis/install-upcxx.sh | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100755 travis/install-upcxx.sh diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 433ebc44a..72effa7b6 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -76,6 +76,7 @@ case "$PRK_TARGET" in sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT + sh ./travis/install-upcxx.sh $TRAVIS_ROOT ;; allfortran) echo "Fortran" diff --git a/travis/install-upcxx.sh b/travis/install-upcxx.sh new file mode 100755 index 000000000..3725d361f --- /dev/null +++ b/travis/install-upcxx.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +set -e +set -x + +if [ -f ~/use-intel-compilers ] ; then + export CC=icc + export CXX=icpc + export FC=ifort +fi + +TRAVIS_ROOT="$1" + +UPCXX_RELEASE=upcxx-2019.9.0 +UPCXX_PREFIX=$TRAVIS_ROOT/$UPCXX_RELEASE + +if [ ! -d "$UPCXX_PREFIX" ]; then + cd $TRAVIS_ROOT + wget --no-check-certificate -q https://bitbucket.org/berkeleylab/upcxx/downloads/${UPCXX_RELEASE}.tar.gz + tar -xzf $UPCXX_RELEASE.tar.gz + cd $UPCXX_RELEASE + ./install $TRAVIS_ROOT/upcxx +else + echo "UPC++ installed..." 
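+ # listing the upcxx compiler wrapper in the log confirms which existing install is being reused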
+ find $TRAVIS_ROOT/upcxx -name upcxx -type f +fi + From 33cc8d5928b58a5f3656c0fb384ba7f7bbfb4448 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:21:34 -0800 Subject: [PATCH 012/325] install-hpx in deps --- travis/install-deps.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 3c4fc29a1..19aa5bdcc 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -75,6 +75,7 @@ case "$PRK_TARGET" in #sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT + sh ./travis/install-hpx.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT ;; allfortran) @@ -169,11 +170,6 @@ case "$PRK_TARGET" in echo "Chapel" sh ./travis/install-chapel.sh $TRAVIS_ROOT ;; - allhpx) - echo "HPX" - sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-hpx3.sh $TRAVIS_ROOT - ;; alllegion) echo "Legion" # GASNet is not needed, it seems From 69efdf6efc464fc3d1cee82877b1eb9ab275618c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 15:45:51 -0800 Subject: [PATCH 013/325] add UPC++ skeleton --- Cxx11/Makefile | 6 ++ Cxx11/nstream-upcxx.cc | 184 +++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_ranges.h | 18 ++-- Cxx11/prk_upcxx.h | 37 +++++++++ common/make.defs.gcc | 6 ++ common/make.defs.llvm | 6 ++ 6 files changed, 247 insertions(+), 10 deletions(-) create mode 100644 Cxx11/nstream-upcxx.cc create mode 100644 Cxx11/prk_upcxx.h diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 72526a38a..2bb1bc4ba 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -45,6 +45,7 @@ STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA HPXFLAGS = -DUSE_HPX $(HPXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) +UPCXXFLAGS = $(CPPFLAGS) -DUSE_UPCXX $(UPCXXFLAG) $(BOOSTFLAG) $(RANGEFLAGS) THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 @@ -125,6 +126,8 @@ kokkos: stencil-kokkos transpose-kokkos nstream-kokkos hpx: nstream-hpx +upcxx: nstream-upcxx + raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja @@ -196,6 +199,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +%-upcxx: %-upcxx.cc prk_util.h prk_upcxx.h + $(UPCXX) $(UPCXXFLAGS) $< -o $@ + %-hpx: %-hpx.cc prk_util.h prk_hpx.h $(HPXCXX) --exe=$@ $(CXXFLAGS) $(HPXFLAGS) $< diff --git a/Cxx11/nstream-upcxx.cc b/Cxx11/nstream-upcxx.cc new file mode 100644 index 000000000..083ab96b4 --- /dev/null +++ b/Cxx11/nstream-upcxx.cc @@ -0,0 +1,184 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_upcxx.h" +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + upcxx::init(); + + const int me = upcxx::rank_me(); + const int np = upcxx::rank_n(); + + if (me==0) { + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + } + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + std::vector A(length); + std::vector B(length); + std::vector C(length); + + auto range = prk::range(static_cast(0), length); + + double scalar(3); + + { + std::for_each(std::begin(range), std::end(range), [&] (size_t i) { + A[i] = 0; + B[i] = 2; + C[i] = 2; + }); + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + std::for_each(std::begin(range), std::end(range), [&] (size_t i) { + A[i] += B[i] + scalar * C[i]; + }); + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + upcxx::finalize(); + return 0; +} + + diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h index 9eb081844..62281e043 100644 --- a/Cxx11/prk_ranges.h +++ b/Cxx11/prk_ranges.h @@ -32,16 +32,14 @@ #ifndef PRK_RANGES_H #define PRK_RANGES_H -#if defined(USE_RANGES) -# if defined(USE_BOOST_IRANGE) -# include "boost/range/irange.hpp" -# elif defined(USE_RANGES_TS) -# include "range/v3/view/iota.hpp" -# include "range/v3/view/slice.hpp" -# include "range/v3/view/stride.hpp" -# else -# error You have not provided a version of ranges to use. -# endif +#if defined(USE_BOOST_IRANGE) +# include "boost/range/irange.hpp" +#elif defined(USE_RANGES_TS) +# include "range/v3/view/iota.hpp" +# include "range/v3/view/slice.hpp" +# include "range/v3/view/stride.hpp" +#else +# error You have not provided a version of ranges to use. #endif namespace prk { diff --git a/Cxx11/prk_upcxx.h b/Cxx11/prk_upcxx.h new file mode 100644 index 000000000..27db8592e --- /dev/null +++ b/Cxx11/prk_upcxx.h @@ -0,0 +1,37 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_UPCXX_H +#define PRK_UPCXX_H + +#include + +#endif /* PRK_UPCXX_H */ diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 7e0736211..ec0535f57 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -120,6 +120,12 @@ HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 4021f6c6d..c150d9ed2 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -120,6 +120,12 @@ HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx HPXCXX=${HPXDIR}/bin/hpxcxx HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math # # CBLAS for C++ DGEMM # From 2699970ba76e369e157fd72c0a466628d9bc98a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 26 Feb 2020 16:15:30 -0800 Subject: [PATCH 014/325] fix banner --- Cxx11/nstream-upcxx.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-upcxx.cc b/Cxx11/nstream-upcxx.cc index 083ab96b4..7925aef08 100644 --- a/Cxx11/nstream-upcxx.cc +++ b/Cxx11/nstream-upcxx.cc @@ -74,7 +74,7 @@ int main(int argc, char * argv[]) if (me==0) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++ HPX STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "UPC++ STREAM triad: A = B + scalar * C" << std::endl; } ////////////////////////////////////////////////////////////////////// From a834102a6f69a38aa697a2c1653127a77a67d2db Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 06:48:34 -0700 Subject: [PATCH 015/325] add README to capture what I am learning here --- HALIDE/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 HALIDE/README.md diff --git a/HALIDE/README.md b/HALIDE/README.md new file mode 100644 index 000000000..af170de52 --- /dev/null +++ b/HALIDE/README.md @@ -0,0 +1,11 @@ +# Halide + +# Notes + +``` +$ git clone https://github.com/halide/Halide.git +``` + +``` +$ make CXX=clang++ PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` From f37a0ac91fe5d1a2fe16bc53a872ce99dfcb32dc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 
10:21:40 -0700 Subject: [PATCH 016/325] add notes since Halide has pre-modern build system --- HALIDE/README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/HALIDE/README.md b/HALIDE/README.md index af170de52..efa2e2d96 100644 --- a/HALIDE/README.md +++ b/HALIDE/README.md @@ -6,6 +6,51 @@ $ git clone https://github.com/halide/Halide.git ``` +# MacOS + +This works: +``` +make CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` + +# Ubuntu 18.10 + +This works: +``` +make PREFIX=/opt/halide +``` + +This does not work: + ``` $ make CXX=clang++ PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config ``` + +This does not work: + +``` +$ make CC=/usr/local/Cellar/llvm/8.0.0/bin/clang CXX=/usr/local/Cellar/llvm/8.0.0/bin/clang++ CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +``` + +# Issues + +*TL;DR* Do not try to use non-default compilers. + +https://github.com/halide/Halide/issues/3884 + +Mac: +``` +$ make CC=gcc-9 CXX=g++-9 CLANG=/usr/local/Cellar/llvm/8.0.0/bin/clang PREFIX=/opt/halide LLVM_CONFIG=/usr/local/Cellar/llvm/8.0.0/bin/llvm-config +g++-9 -Wall -Werror -Wno-unused-function -Wcast-qual -Wignored-qualifiers -Wno-comment -Wsign-compare -Wno-unknown-warning-option -Wno-psabi -Wsuggest-override -Woverloaded-virtual -fPIC -O3 -fno-omit-frame-pointer -DCOMPILING_HALIDE -std=c++11 -I/usr/local/Cellar/llvm/8.0.0/include -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/tmp/llvm-20190320-85215-19esl1h/llvm-8.0.0.src/tools/lld/include -DLLVM_VERSION=80 -DWITH_PTX=1 -DWITH_ARM=1 -DWITH_HEXAGON=1 -DWITH_AARCH64=1 -DWITH_X86=1 -DWITH_OPENCL=1 -DWITH_METAL=1 -DWITH_OPENGL=1 -DWITH_D3D12=1 -DWITH_MIPS=1 -DWITH_POWERPC=1 -DWITH_WEBASSEMBLY=1 -DWITH_INTROSPECTION -DWITH_AMDGPU=1 -funwind-tables -c ~/Work/Languages/Halide/src/Util.cpp -o bin/build/Util.o -MMD -MP -MF bin/build/Util.d -MT bin/build/Util.o +~/Work/Languages/Halide/src/Util.cpp: In function 'std::string Halide::Internal::running_program_name()': +~/Work/Languages/Halide/src/Util.cpp:80:19: error: 'PATH_MAX' was not declared in this scope + 80 | char path[PATH_MAX] = { 0 }; + | ^~~~~~~~ +~/Work/Languages/Halide/src/Util.cpp:81:32: error: 'path' was not declared in this scope + 81 | uint32_t size = sizeof(path); + | ^~~~ +At global scope: +cc1plus: error: unrecognized command line option '-Wno-unknown-warning-option' [-Werror] +cc1plus: all warnings being treated as errors +make: *** [bin/build/Util.o] Error 1 +``` From e69f46c08111e7b28ec078f81f8e7f0b965face2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 13:14:16 -0700 Subject: [PATCH 017/325] add Halide to examples --- common/make.defs.gcc | 14 ++++++++++++-- common/make.defs.llvm | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 8ad79efb2..bf4b46ecf 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -151,13 +151,23 @@ CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -#NVCC=/opt/llvm/cocl/bin/cocl +NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++11 CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 -CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED 
+#CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 # # ISPC # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 2aecf26d8..db54b5cc0 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -187,9 +187,9 @@ CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -#NVCC=nvcc -arch=sm_50 +NVCC=nvcc -arch=sm_50 CUDAFLAGS=-g -O3 -std=c++11 CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 From 2bd32e246c5fdb61c5cb1b9158796981cf5718d6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 May 2019 13:14:27 -0700 Subject: [PATCH 018/325] add Halide nstream I have no idea what I am doing and this code is wrong and/or bad. --- HALIDE/README.md => Cxx11/HALIDE.md | 0 Cxx11/Makefile | 6 + Cxx11/nstream-halide.cc | 190 ++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+) rename HALIDE/README.md => Cxx11/HALIDE.md (100%) create mode 100644 Cxx11/nstream-halide.cc diff --git a/HALIDE/README.md b/Cxx11/HALIDE.md similarity index 100% rename from HALIDE/README.md rename to Cxx11/HALIDE.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index b435091ed..fe6e8e891 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,6 +50,7 @@ THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) +HALIDEFLAGS = $(HALIDEFLAG) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -133,6 +134,7 @@ oneapi: onemkl dpcpp sycl sycl-usm sycl-explicit occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-ornlacc +halide: nstream-halide boost-compute: nstream-boost-compute # busted @@ -253,6 +255,9 @@ endif $(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.) $(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@ +%-halide: %-halide.cc prk_util.h + $(HALIDECXX) $(CXXFLAGS) $< $(HALIDEFLAGS) -o $@ + %-ornlacc: %-ornlacc.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(ORNLACCFLAGS) -o $@ @@ -300,6 +305,7 @@ clean: -rm -f *-cblas -rm -f *-onemkl -rm -f *-occa + -rm -f *-halide -rm -f *-boost-compute -rm -f *-ornlacc -rm -f transpose-async transpose-thread diff --git a/Cxx11/nstream-halide.cc b/Cxx11/nstream-halide.cc new file mode 100644 index 000000000..51f0eee16 --- /dev/null +++ b/Cxx11/nstream-halide.cc @@ -0,0 +1,190 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "Halide.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Halide STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + const Halide::Target target = Halide::get_jit_target_from_environment(); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double scalar = 3.0; + + Halide::Buffer A(length); + Halide::Buffer B(length); + Halide::Buffer C(length); + + for (size_t i=0; i out = nstream.realize(length); +#endif + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + From a54488c28d8e8b2d2314d03340d20da81255bae1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 20:59:45 -0700 Subject: [PATCH 019/325] add Stencil for Halide --- Cxx11/Makefile | 3 +- Cxx11/stencil-halide.cc | 231 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 Cxx11/stencil-halide.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index fe6e8e891..f3e6e5d3b 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -134,7 +134,8 @@ oneapi: onemkl dpcpp sycl sycl-usm sycl-explicit occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-ornlacc -halide: nstream-halide + +halide: nstream-halide stencil-halide boost-compute: nstream-boost-compute # busted diff --git a/Cxx11/stencil-halide.cc b/Cxx11/stencil-halide.cc new file mode 100644 index 000000000..f0aab6461 --- /dev/null +++ b/Cxx11/stencil-halide.cc @@ -0,0 +1,231 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "stencil_seq.hpp" + +void nothing(const int n, const int t, prk::vector & in, prk::vector & out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." << std::endl; + // n will never be zero - this is to silence compiler warnings. 
+ if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; + std::abort(); +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + prk::vector in(n*n); + prk::vector out(n*n); + + { + for (auto it=0; it(i+j); + out[i*n+j] = 0.0; + } + } + } + } + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + // Apply the stencil operator + stencil(n, tile_size, in, out); + // Add constant to solution to force refresh of neighbor data, if any + std::transform(in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); + } + stencil_time = prk::wtime() - stencil_time; + } + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
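+  // The check below averages the absolute value of "out" over the interior
+  // (active) points and compares the result against the reference norm.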
+ ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + // compute L1 norm in parallel + double norm = 0.0; + for (auto i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} From b36ad8d0246a32c40dd1b8e410fdf4d550e8d7a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 21:49:07 -0700 Subject: [PATCH 020/325] ignore halide and occa binaries --- .gitignore | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 55140184d..8329936cb 100644 --- a/.gitignore +++ b/.gitignore @@ -184,13 +184,11 @@ Cxx11/p2p-vector-raja Cxx11/p2p-tbb Cxx11/p2p-innerloop-openmp Cxx11/p2p-doacross-openmp -Cxx11/p2p-doacross-openmp Cxx11/p2p-innerloop-opencl Cxx11/p2p-innerloop-vector +Cxx11/p2p-innerloop-tbb Cxx11/p2p-hyperplane-vector Cxx11/p2p-hyperplane-openmp -Cxx11/p2p-hyperplane-openmp -Cxx11/p2p-innerloop-tbb Cxx11/p2p-hyperplane-stl Cxx11/p2p-hyperplane-pstl Cxx11/p2p-hyperplane-tbb @@ -224,6 +222,8 @@ Cxx11/nstream-celerity Cxx11/nstream-hpx Cxx11/nstream-upcxx Cxx11/nstream-executors +Cxx11/nstream-occa +Cxx11/nstream-halide Cxx11/pic Cxx11/pic-dpcpp Cxx11/pic-sycl @@ -258,6 +258,8 @@ Cxx11/stencil-sycl-usm Cxx11/stencil-sycl-explicit Cxx11/stencil-sycl-explicit-usm Cxx11/stencil-dpcpp +Cxx11/stencil-occa +Cxx11/stencil-halide Cxx11/transpose Cxx11/transpose-openmp Cxx11/transpose-mpi @@ -288,6 +290,8 @@ Cxx11/transpose-device-thrust Cxx11/transpose-host-thrust Cxx11/transpose-cublas Cxx11/transpose-cuda +Cxx11/transpose-occa +Cxx11/transpose-halide Cxx11/grid1.cl Cxx11/grid2.cl Cxx11/grid3.cl From 6cb5371e7bc828f109fcdff2f48993b001c1c826 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 18 May 2019 21:49:22 -0700 Subject: [PATCH 021/325] less wrong --- Cxx11/stencil-halide.cc | 76 ++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/Cxx11/stencil-halide.cc b/Cxx11/stencil-halide.cc index f0aab6461..bdd1f1487 100644 --- a/Cxx11/stencil-halide.cc +++ b/Cxx11/stencil-halide.cc @@ -61,22 +61,12 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" -#include "stencil_seq.hpp" - -void nothing(const int n, const int t, prk::vector & in, prk::vector & out) -{ - std::cout << "You are trying to use a stencil that does not exist.\n"; - std::cout << "Please generate the new stencil using the code generator\n"; - std::cout << "and add it to the case-switch in the driver." << std::endl; - // n will never be zero - this is to silence compiler warnings. 
- if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; - std::abort(); -} +#include "Halide.h" int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + std::cout << "C++11/Halide Stencil execution on 2D grid" << std::endl; ////////////////////////////////////////////////////////////////////// // Process and test input parameters @@ -139,54 +129,48 @@ int main(int argc, char* argv[]) std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; std::cout << "Radius of stencil = " << radius << std::endl; - auto stencil = nothing; - if (star) { - switch (radius) { - case 1: stencil = star1; break; - case 2: stencil = star2; break; - case 3: stencil = star3; break; - case 4: stencil = star4; break; - case 5: stencil = star5; break; - } - } else { - switch (radius) { - case 1: stencil = grid1; break; - case 2: stencil = grid2; break; - case 3: stencil = grid3; break; - case 4: stencil = grid4; break; - case 5: stencil = grid5; break; - } - } + const Halide::Target target = Halide::get_jit_target_from_environment(); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto stencil_time = 0.0; + double stencil_time(0); - prk::vector in(n*n); - prk::vector out(n*n); + Halide::Buffer in(n,n); + Halide::Buffer out(n,n); + + Halide::Var x("x"); + Halide::Var y("y"); + + Halide::Expr c1(0.25); + Halide::Expr c2(0.125); + Halide::Func stencil; + stencil(x,y) = c1 * ( in(x+1,y) + in(x-1,y) + in(x,y+1) + in(x,y+1) ) + + c2 * ( in(x+2,y) + in(x-2,y) + in(x,y+2) + in(x,y+2) ); { - for (auto it=0; it(i+j); - out[i*n+j] = 0.0; - } - } + for (auto i=0; i Date: Tue, 3 Mar 2020 09:44:00 -0800 Subject: [PATCH 022/325] move documentation to the right place Signed-off-by: Jeff Hammond --- {Cxx11 => doc}/HALIDE.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {Cxx11 => doc}/HALIDE.md (100%) diff --git a/Cxx11/HALIDE.md b/doc/HALIDE.md similarity index 100% rename from Cxx11/HALIDE.md rename to doc/HALIDE.md From 52bd76531144854fb3f9e537b1c1e46457853796 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 2 Nov 2020 08:48:46 -0800 Subject: [PATCH 023/325] update Halide stuff for 10.0 release --- common/make.defs.gcc | 6 +++--- common/make.defs.llvm | 6 +++--- common/make.defs.oneapi | 10 ++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index bf4b46ecf..f0fccc68c 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -162,12 +162,12 @@ CUDAFLAGS+=-arch=sm_50 # Halide # HALIDECXX=c++ -HALIDEDIR=/opt/halide +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 +HALIDEFLAG+=-std=c++17 # # ISPC # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 1764d24f0..08fb7b1a1 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -202,12 +202,12 @@ CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # Halide # HALIDECXX=c++ -HALIDEDIR=/opt/halide +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide 
+HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 +HALIDEFLAG+=-std=c++17 # # ISPC # diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index be6b2dc4b..38e163047 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -106,6 +106,16 @@ CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=icpx +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 +# # ISPC # ISPC=ispc From 28d98300e67d1c3bc5e8492506808b6dfdc0985e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 23 Jan 2021 19:39:20 -0800 Subject: [PATCH 024/325] no idea --- Cxx11/Makefile | 2 +- Cxx11/nstream-opencl.cc | 93 +++++++++++++++++------------------------ Cxx11/prk_opencl.h | 1 + 3 files changed, 41 insertions(+), 55 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index f0624da81..ebc20ebc8 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -33,7 +33,7 @@ ASMFLAGS = -fverbose-asm OMPFLAGS = $(OPENMPFLAG) -DUSE_OPENMP TARGETFLAGS = $(OFFLOADFLAG) -OPENCLFLAGS = $(OPENCLFLAG) -DCL_HPP_MINIMUM_OPENCL_VERSION=120 -DCL_HPP_TARGET_OPENCL_VERSION=120 +OPENCLFLAGS = $(OPENCLFLAG) -DCL_HPP_MINIMUM_OPENCL_VERSION=120 -DCL_HPP_TARGET_OPENCL_VERSION=120 -DCL_HPP_ENABLE_EXCEPTIONS # We do not yet handle all possible exceptions... #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS ORNLACCFLAGS = $(ORNLACCFLAG) diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 377b789fe..142841159 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -67,18 +67,23 @@ template void run(cl::Context context, int iterations, size_t length) { - auto precision = (sizeof(T)==8) ? 64 : 32; + cl_int err = CL_SUCCESS; cl::Program program(context, prk::opencl::loadProgram("nstream.cl"), true); - auto function = (precision==64) ? "nstream64" : "nstream32"; + auto precision = (sizeof(T)==8) ? 64 : 32; + auto function = (precision==64) ? "nstream64" : "nstream32"; - cl_int err; - auto kernel = cl::KernelFunctor(program, function, &err); - if(err != CL_SUCCESS){ - std::vector devices = context.getInfo(); - std::cout << program.getBuildInfo(devices[0]) << std::endl; + try { + program.build(); + } + catch (...) 
{ + auto info = program.getBuildInfo(&err); + for (auto &pair : info) { + std::cout << pair.second << std::endl; + } } + auto kernel = cl::KernelFunctor(program, function, &err); cl::CommandQueue queue(context); @@ -103,7 +108,6 @@ void run(cl::Context context, int iterations, size_t length) if (iter==1) nstream_time = prk::wtime(); - // nstream the matrix kernel(cl::EnqueueArgs(queue, cl::NDRange(length)), length, scalar, d_a, d_b, d_c); queue.finish(); @@ -150,6 +154,8 @@ void run(cl::Context context, int iterations, size_t length) int main(int argc, char* argv[]) { + prk::opencl::listPlatforms(); + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; std::cout << "C++11/OpenCL STREAM triad: A = B + scalar * C" << std::endl; @@ -191,55 +197,34 @@ int main(int argc, char* argv[]) /// Setup OpenCL environment ////////////////////////////////////////////////////////////////////// - prk::opencl::listPlatforms(); - - cl_int err = CL_SUCCESS; - - cl::Context cpu(CL_DEVICE_TYPE_CPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(cpu) ) - { - const int precision = prk::opencl::precision(cpu); - - std::cout << "CPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(cpu, iterations, length); - } - run(cpu, iterations, length); - } else { - std::cerr << "No CPU" << std::endl; - } - - cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(gpu) ) - { - const int precision = prk::opencl::precision(gpu); - - std::cout << "GPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(gpu, iterations, length); - } - run(gpu, iterations, length); - } else { - std::cerr << "No GPU" << std::endl; + std::vector platforms; + cl::Platform::get(&platforms); + if ( platforms.size() == 0 ) { + std::cout <<" No platforms found. 
Check OpenCL installation!\n"; + return 1; } - - cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(acc) ) - { - - const int precision = prk::opencl::precision(acc); - - std::cout << "ACC Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(acc, iterations, length); + for (auto plat : platforms) { + std::cout << "====================================================\n" + << "CL_PLATFORM_NAME=" << plat.getInfo() << ", " + << "CL_PLATFORM_VENDOR=" << plat.getInfo() << std::endl; + + std::vector devices; + plat.getDevices(CL_DEVICE_TYPE_ALL, &devices); + for (auto dev : devices) { + std::cout << "CL_DEVICE_NAME=" << dev.getInfo() << ", " + << "CL_DEVICE_VENDOR=" << dev.getInfo() << std::endl; + + cl_int err = CL_SUCCESS; + cl::Context ctx(dev, NULL, NULL, NULL, &err); + const int precision = prk::opencl::precision(ctx); + //std::cout << "Device Precision = " << precision << "-bit" << std::endl; + if (precision==64) { + run(dev, iterations, length); + } + run(dev, iterations, length); } - run(acc, iterations, length); - } else { - std::cerr << "No ACC" << std::endl; } + std::cout << "====================================================" << std::endl; return 0; } diff --git a/Cxx11/prk_opencl.h b/Cxx11/prk_opencl.h index 880a9f32f..a10501389 100644 --- a/Cxx11/prk_opencl.h +++ b/Cxx11/prk_opencl.h @@ -15,6 +15,7 @@ #include #include #include +#include #include From 27f1aabc0caeaa0fab1dd9a8e9f6175c2326e749 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 2 Mar 2021 16:22:11 -0600 Subject: [PATCH 025/325] Enable P2P TBB Flow Graph example with most recent API --- Cxx11/p2p-tasks-tbb.cc | 17 ++--------------- Cxx11/prk_tbb.h | 5 ++++- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc index 32d99c28d..160535139 100644 --- a/Cxx11/p2p-tasks-tbb.cc +++ b/Cxx11/p2p-tasks-tbb.cc @@ -74,7 +74,6 @@ int main(int argc, char* argv[]) ////////////////////////////////////////////////////////////////////// using namespace tbb::flow; - //graph g; int iterations; int m, n; @@ -172,25 +171,13 @@ int main(int argc, char* argv[]) // make_edge(*nodes[(i-1)*num_blocks_n + j-1], *tmp ); } } - auto start = true; - source_node s(g, [&](continue_msg &v) -> bool { - if(start) { - v = continue_msg(); - start = false; - return true; - } - return false; - }, false); - - limiter_node l(g, iterations+1, 1); + limiter_node l(g, iterations+1); - make_edge( s, l ); make_edge( l, *nodes[0] ); make_edge( *nodes[(num_blocks_n * num_blocks_m) - 1], b); make_edge( b, l ); #if TBB_PREVIEW_FLOW_GRAPH_TRACE - s.set_name("Source"); b.set_name("Iteration Barrier"); l.set_name("Limiter"); #endif @@ -216,7 +203,7 @@ int main(int argc, char* argv[]) grid[i*n+0] = static_cast(i); } - s.activate(); + l.try_put(continue_msg{}); g.wait_for_all(); pipeline_time = prk::wtime() - pipeline_time; diff --git a/Cxx11/prk_tbb.h b/Cxx11/prk_tbb.h index c23a21837..f88c11353 100644 --- a/Cxx11/prk_tbb.h +++ b/Cxx11/prk_tbb.h @@ -33,13 +33,16 @@ #define PRK_TBB_H //#include +#include #include #include #include #include #include #include -#include +#if TBB_INTERFACE_VERSION <= 12000 +# include +#endif #if ( PRK_TBB_PARTITIONER == 1) tbb::static_partitioner tbb_partitioner; From c592b23733516ea030b8c1a3d5b65cf770854ff7 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 2 Mar 2021 20:00:33 -0600 Subject: [PATCH 026/325] Enable P2P for oneTBB --- Cxx11/p2p-tbb.cc | 7 ++++++- Cxx11/prk_tbb.h | 2 +- 2 
files changed, 7 insertions(+), 2 deletions(-) diff --git a/Cxx11/p2p-tbb.cc b/Cxx11/p2p-tbb.cc index 6e0f7cc71..c763acb82 100644 --- a/Cxx11/p2p-tbb.cc +++ b/Cxx11/p2p-tbb.cc @@ -73,7 +73,7 @@ void SequentialSweep(int m, int n, prk::vector & grid) const int N = 64; const int MAX_LEN = 1024; -tbb::atomic Count[MAX_LEN/N+1][MAX_LEN/N+1]; +std::atomic Count[MAX_LEN/N+1][MAX_LEN/N+1]; double F[MAX_LEN][MAX_LEN]; void ParallelSweep( const char* x, int xlen, const char* y, int ylen ) { @@ -88,8 +88,13 @@ void ParallelSweep( const char* x, int xlen, const char* y, int ylen ) { // Roll the wavefront from the origin. typedef std::pair block; block origin(0,0); +#if TBB_INTERFACE_VERSION > 12000 + tbb::parallel_for_each( &origin, &origin+1, + [=]( const block& b, tbb::feeder&feeder ) { +#else tbb::parallel_do( &origin, &origin+1, [=]( const block& b, tbb::parallel_do_feeder&feeder ) { +#endif // Extract bounds on block int bi = b.first; int bj = b.second; diff --git a/Cxx11/prk_tbb.h b/Cxx11/prk_tbb.h index f88c11353..7638cb376 100644 --- a/Cxx11/prk_tbb.h +++ b/Cxx11/prk_tbb.h @@ -33,13 +33,13 @@ #define PRK_TBB_H //#include -#include #include #include #include #include #include #include +#include #if TBB_INTERFACE_VERSION <= 12000 # include #endif From 11f13d6f9909235f2fb5312f5869a410f06cbbeb Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 3 Mar 2021 13:10:35 -0600 Subject: [PATCH 027/325] Adding nstream example using oneDPL with device backend --- Cxx11/Makefile | 7 +- Cxx11/nstream-onedpl.cc | 188 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 Cxx11/nstream-onedpl.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 63cd49488..ad4c7bf33 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -130,7 +130,9 @@ cblas: transpose-cblas dgemm-cblas onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl -oneapi: onemkl dpcpp sycl +onedpl: nstream-onedpl + +oneapi: onemkl dpcpp sycl onedpl occa: transpose-occa nstream-occa @@ -185,6 +187,9 @@ random_draw.o: random_draw.c random_draw.h %-sycl-explicit: %-sycl-explicit.cc prk_util.h prk_sycl.h $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ +%-onedpl: %-onedpl.cc prk_util.h prk_sycl.h + $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ + %-onemkl: %-onemkl.cc prk_util.h prk_sycl.h $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< $(ONEMKLFLAG) -o $@ diff --git a/Cxx11/nstream-onedpl.cc b/Cxx11/nstream-onedpl.cc new file mode 100644 index 000000000..8c07b7937 --- /dev/null +++ b/Cxx11/nstream-onedpl.cc @@ -0,0 +1,188 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors and +/// the length of the vectors. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
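+///
+///          This oneDPL variant allocates A, B, and C on the device with
+///          sycl::malloc_device, copies the host data in and out explicitly,
+///          and expresses the triad as std::transform over a dpl zip iterator
+///          of (A,B,C) with a SYCL device execution policy. Build it with a
+///          make.defs that provides SYCLCXX/SYCLFLAGS and run it as, for
+///          example: ./nstream-onedpl 10 100000000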
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include + +#include "prk_sycl.h" +#include "prk_util.h" + +using namespace oneapi; + +int main(int argc, char *argv[]) { + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++17/oneDPL STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + } catch (const char *e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + + sycl::queue q(sycl::default_selector{}); + prk::SYCL::print_device_platform(q); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time{0}; + + const size_t bytes = length * sizeof(double); + + std::vector h_A(length); + std::vector h_B(length); + std::vector h_C(length); + + std::fill(h_A.begin(), h_A.end(), 0.0); + std::fill(h_B.begin(), h_B.end(), 2.0); + std::fill(h_C.begin(), h_C.end(), 2.0); + + double *d_A = sycl::malloc_device(length, q); + double *d_B = sycl::malloc_device(length, q); + double *d_C = sycl::malloc_device(length, q); + + q.memcpy(d_A, h_A.data(), bytes).wait(); + q.memcpy(d_B, h_B.data(), bytes).wait(); + q.memcpy(d_C, h_C.data(), bytes).wait(); + + double scalar(3); + { + for (int iter = 0; iter <= iterations; iter++) { + + if (iter == 1) + nstream_time = prk::wtime(); + + auto begin = dpl::make_zip_iterator(d_A, d_B, d_C); + std::transform(dpl::execution::make_device_policy(q), begin, + begin + length, d_A, [=](const auto &t) { + using std::get; + return get<0>(t) + get<1>(t) + scalar * get<2>(t); + }); + } + nstream_time = prk::wtime() - nstream_time; + } + + q.memcpy(h_A.data(), d_A, bytes).wait(); + + sycl::free(d_C, q); + sycl::free(d_B, q); + sycl::free(d_A, q); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (int i = 0; i <= iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i = 0; i < length; i++) { + asum += prk::abs(h_A[i]); + } + + double epsilon(1.e-8); + if (prk::abs(ar - asum) / asum > epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time / iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6 * nbytes / avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} From 6b791fb2ab1781ca1bd3fc89d09cac3eccb4072b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 11 Mar 
2021 17:09:36 +0000 Subject: [PATCH 028/325] C to Fortran comments at the top --- FORTRAN/pic.F90 | 79 +++++++++++++++++++++------------------------ FORTRAN/pic_soa.F90 | 79 +++++++++++++++++++++------------------------ 2 files changed, 74 insertions(+), 84 deletions(-) diff --git a/FORTRAN/pic.F90 b/FORTRAN/pic.F90 index aacc95f8b..97c459ca0 100644 --- a/FORTRAN/pic.F90 +++ b/FORTRAN/pic.F90 @@ -1,45 +1,40 @@ -/* -Copyright (c) 2015, Intel Corporation -Copyright (c) 2021, Thomas Hayward-Schneider - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. -* Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. -*/ - -/******************************************************************* - -HISTORY: - Written by Evangelos Georganas, August 2015. - - RvdW: Refactored to make the code PRK conforming, December 2015 - - TWHS: Converted from C to Fortran - -**********************************************************************************/ - - +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, Thomas Hayward-Schneider +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! 
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! HISTORY: - Written by Evangelos Georganas, August 2015. +! - RvdW: Refactored to make the code PRK conforming, December 2015 +! - TWHS: Converted from C to Fortran +! +! +! #define REL_X 0.5 #define REL_Y 0.5 diff --git a/FORTRAN/pic_soa.F90 b/FORTRAN/pic_soa.F90 index d14686eb1..3eaf111ff 100644 --- a/FORTRAN/pic_soa.F90 +++ b/FORTRAN/pic_soa.F90 @@ -1,45 +1,40 @@ -/* -Copyright (c) 2015, Intel Corporation -Copyright (c) 2021, Thomas Hayward-Schneider - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. -* Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. -*/ - -/******************************************************************* - -HISTORY: - Written by Evangelos Georganas, August 2015. - - RvdW: Refactored to make the code PRK conforming, December 2015 - - TWHS: Converted from C to Fortran - -**********************************************************************************/ - - +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, Thomas Hayward-Schneider +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! 
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! HISTORY: - Written by Evangelos Georganas, August 2015. +! - RvdW: Refactored to make the code PRK conforming, December 2015 +! - TWHS: Converted from C to Fortran +! +! +! #define REL_X 0.5 #define REL_Y 0.5 From 66f3a1a14949a0eba8f71c7743cd01d43d3313ca Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 11 Mar 2021 18:33:27 +0000 Subject: [PATCH 029/325] do not build p2p-simd-openmp because not good --- C1z/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C1z/Makefile b/C1z/Makefile index 93efb27f6..5e7b0ac81 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -64,7 +64,7 @@ vector: p2p-sse p2p-avx p2p-avx-tasks-openmp thread: transpose-thread -openmp: nstream-openmp p2p-simd-openmp \ +openmp: nstream-openmp \ p2p-hyperplane-openmp p2p-hyperplane-2d-openmp \ stencil-openmp stencil-2d-openmp \ transpose-openmp transpose-2d-openmp From ec015154c34c4cfd73d01062383b66c4333b0826 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 11 Mar 2021 18:33:38 +0000 Subject: [PATCH 030/325] update PGI build example --- common/make.defs.pgi | 79 +++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 20 deletions(-) diff --git a/common/make.defs.pgi b/common/make.defs.pgi index 96dda1d33..e713b52c9 100644 --- a/common/make.defs.pgi +++ b/common/make.defs.pgi @@ -73,8 +73,9 @@ CBLASFLAG= # NVCC never supports the latest GCC. # Use appropriate arch or code is compiled to ancient features. #NVCC=nvcc --compiler-bindir= --gpu-architecture=sm_61 -NVCC=nvcc --gpu-architecture=sm_61 +NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++11 +CUDAFLAGS+=--gpu-architecture=sm_70 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 # heavy hammer: #CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED @@ -83,22 +84,60 @@ CUDAFLAGS=-g -O3 -std=c++11 #CUDAFLAGS+=-D_FMA4INTRIN_H_INCLUDED #CUDAFLAGS+=-D_XOPMMINTRIN_H_INCLUDED # many tiny hammers: -CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512FINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512VLINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512BWINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512DQINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512VLBWINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512VBMIVLINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512VBMIINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512VLDQINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512CDINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512PFINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED -CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED -# -# MPI -# -# Needs PATH and LD_LIBRARY_PATH set appropriately... 
-MPICC=/opt/pgi/linux86-64/2019/mpi/openmpi/bin/mpicc +#CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512FINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512VLINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512BWINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512DQINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512VLBWINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512VBMIVLINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512VBMIINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512VLDQINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512CDINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512PFINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED +# +# MPI-3 +# +# We assume you have Intel MPI and have setup your environment with e.g. +# . /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh +# in your .bashrc. +# +# mpiicc wraps icc. mpicc and mpigcc wrap gcc. +MPIDIR= +MPICC=${MPIDIR}/bin/mpicc +MPICXX=${MPIDIR}/bin/mpicxx +MPIINC=-I${MPIDIR}/include +MPILIB=-L${MPIDIR}/lib -lmpi +#MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi +#MPIINC=-I/usr/include/mpich-3.2-x86_64 +#MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +# +# Global Arrays +# +GADIR=../deps/ga +GAFLAG=-I${GADIR}/include +GAFLAG+=-L${GADIR}/lib -lga +GAFLAG+=-L${GADIR}/../armci-mpi/lib -larmci # ARMCI-MPI +#GAFLAG+=-L${GADIR}/lib -larmci -lcomex # ARMCI/ComEx +GAFLAG+=${MPIINC} ${MPILIB} +GAFLAG+=-lmpifort -lmpi +GAFLAG+=-i8 # GA is compiled with -i8 on 64-bit systems +# +# PETSc +# +PETSCDIR=../deps/petsc +PETSCFLAG=-I${PETSCDIR}/include +PETSCFLAG+=-L${PETSCDIR}/lib -lpetsc +PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib +# +# Fortran 2008 coarrays +# +# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details +# single-node +#COARRAYFLAG=-fcoarray=single -lcaf_single +# multi-node +# COARRAYFLAG=-fcoarray=lib -lcaf_mpi + From 86e046d67479fd92b3f77de3db919243528f6d00 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 28 Mar 2021 16:41:46 -0700 Subject: [PATCH 031/325] fix error in tiled target transpose --- FORTRAN/transpose-openmp-target.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 30f98dbc2..fa464130e 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -163,8 +163,8 @@ program main if (tile_size.lt.order) then !$omp target teams distribute collapse(2) - do j=1,order - do i=1,order + do jt=1,order,tile_size + do it=1,order,tile_size !$omp parallel do simd collapse(2) schedule(static,1) do j=1,tile_size do i=1,tile_size From b34e1882dfc5c966f7c420b8f726bc876094aaed Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 28 Mar 2021 16:43:51 -0700 Subject: [PATCH 032/325] fix error in do concurrent usage in coarrays --- FORTRAN/transpose-coarray.F90 | 2 -- 1 file changed, 2 deletions(-) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index 4f8f2dadc..0093a3493 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -193,7 +193,6 @@ program main B(i,j) = 0.0 enddo enddo - enddo enddo else do concurrent (j=1:col_per_pe) @@ -245,7 +244,6 @@ program main B(col_start+i,j) = B(col_start+i,j) + T(j,i) enddo enddo - enddo enddo else ! untiled ! 
* fully explicit version From fef55eb48f6bc08460fb836d92fc8d4262952ecc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 10:05:22 -0700 Subject: [PATCH 033/325] fix C++ PIC SYCL with hipSYCL hipSYCL/HIP can't figure out how to link in the C object file, so just build everything together. Signed-off-by: Jeff Hammond --- Cxx11/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ad4c7bf33..a4ecf8169 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -163,14 +163,14 @@ p2p-hyperplane-vector: p2p-hyperplane-openmp.cc prk_util.h %-celerity: %-celerity.cc prk_util.h prk_sycl.h $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $(BOOSTFLAGS) $(CELERITYINC) $(MPIINC) $< $(CELERITYLIB) $(MPILIB) -o $@ -pic: pic.cc prk_util.h random_draw.o - $(CXX) $(CXXFLAGS) $< random_draw.o -o $@ +pic: pic.cc prk_util.h random_draw.c + $(CXX) $(CXXFLAGS) $< random_draw.c -o $@ -pic-sycl: pic-sycl.cc prk_util.h prk_sycl.h random_draw.o - $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< random_draw.o -o $@ +pic-sycl: pic-sycl.cc prk_util.h prk_sycl.h random_draw.c + $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< random_draw.c -o $@ -random_draw.o: random_draw.c random_draw.h - $(CC) $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) -c $< -o $@ +#random_draw.o: random_draw.c random_draw.h +# $(CC) $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) -c $< -o $@ %-dpcpp: %-dpcpp.cc prk_util.h prk_sycl.h $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ From 20915f7acfb32e9ea39a6dbaeb8ac8f0f511d93a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 10:11:45 -0700 Subject: [PATCH 034/325] hipSYCL supports get_{device,platform} --- Cxx11/prk_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h index ba5b64276..f8dc0f024 100644 --- a/Cxx11/prk_sycl.h +++ b/Cxx11/prk_sycl.h @@ -51,7 +51,7 @@ namespace prk { namespace SYCL { void print_device_platform(const sycl::queue & q) { -#if ! ( defined(TRISYCL) || defined(__HIPSYCL__) ) +#if ! 
defined(TRISYCL) auto d = q.get_device(); auto p = d.get_platform(); std::cout << "SYCL Device: " << d.get_info() << std::endl; From 40c662abd2d95945b21e9cd4fcecce44273b7ed0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 14:31:52 -0700 Subject: [PATCH 035/325] use discard_write on init to make hipSYCL happy --- Cxx11/nstream-sycl-explicit.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index 3c1e37160..a3083a244 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -100,15 +100,15 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) sycl::buffer d_C { sycl::range<1>{length} }; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); h.fill(A,(T)0); }); q.submit([&](sycl::handler& h) { - sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); h.fill(B,(T)2); }); q.submit([&](sycl::handler& h) { - sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); h.fill(C,(T)2); }); q.wait(); From 9c4c53f285419c6bb136ff8628327503df1fbaca Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 11:48:35 -0700 Subject: [PATCH 036/325] renamge rangefor to ranges --- Cxx11/Makefile | 10 +++++----- Cxx11/generate-cxx-stencil.py | 4 ++-- Cxx11/{nstream-rangefor.cc => nstream-ranges.cc} | 0 Cxx11/{stencil-rangefor.cc => stencil-ranges.cc} | 0 Cxx11/{stencil_rangefor.hpp => stencil_ranges.hpp} | 0 Cxx11/{transpose-rangefor.cc => transpose-ranges.cc} | 0 6 files changed, 7 insertions(+), 7 deletions(-) rename Cxx11/{nstream-rangefor.cc => nstream-ranges.cc} (100%) rename Cxx11/{stencil-rangefor.cc => stencil-ranges.cc} (100%) rename Cxx11/{stencil_rangefor.hpp => stencil_ranges.hpp} (100%) rename Cxx11/{transpose-rangefor.cc => transpose-ranges.cc} (100%) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a4ecf8169..973abc3bb 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -57,7 +57,7 @@ endif OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca .PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \ - rangefor kokkos raja cuda cublas sycl dpcpp \ + ranges kokkos raja cuda cublas sycl dpcpp \ boost-compute thrust executor oneapi onemkl EXTRA= @@ -72,7 +72,7 @@ ifneq ($(findstring pgc++,$(CXX)),pgc++) EXTRA += pstl endif -all: sequential vector valarray openmp taskloop stl rangefor opencl sycl $(EXTRA) +all: sequential vector valarray openmp taskloop stl ranges opencl sycl $(EXTRA) sequential: p2p stencil transpose nstream dgemm sparse @@ -104,7 +104,7 @@ stl: stencil-stl transpose-stl nstream-stl pstl: stencil-pstl transpose-pstl nstream-pstl -rangefor: stencil-rangefor transpose-rangefor nstream-rangefor +ranges: stencil-ranges transpose-ranges nstream-ranges executors: nstream-executors transpose-executors @@ -215,7 +215,7 @@ pic-sycl: pic-sycl.cc prk_util.h prk_sycl.h random_draw.c %-pstl: %-pstl.cc prk_util.h prk_pstl.h $(CXX) $(CXXFLAGS) $< $(PSTLFLAGS) -o $@ -%-rangefor: %-rangefor.cc prk_util.h prk_ranges.h +%-ranges: %-ranges.cc prk_util.h prk_ranges.h $(CXX) $(CXXFLAGS) $< $(RANGEFLAGS) -o $@ %-executors: %-executors.cc prk_util.h prk_executors.h @@ -307,7 +307,7 @@ clean: -rm -f *-tbb -rm -f *-stl -rm -f 
*-pstl - -rm -f *-rangefor + -rm -f *-ranges -rm -f *-raja -rm -f *-kokkos -rm -f *-thrust diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index af92c6c61..b6cb34ba7 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -73,7 +73,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): bodygen(src,pattern,stencil_size,radius,W,model) src.write(' }\n') src.write(' }\n') - elif (model=='rangefor'): + elif (model=='ranges'): src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' for (auto i : inside) {\n') @@ -200,7 +200,7 @@ def instance(src,model,pattern,r): codegen(src,pattern,stencil_size,r,W,model) def main(): - for model in ['seq','vector','rangefor','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','rajaview','kokkos','cuda']: + for model in ['seq','vector','ranges','stl','pgnu','pstl','openmp','taskloop','target','tbb','raja','rajaview','kokkos','cuda']: src = open('stencil_'+model+'.hpp','w') if (model=='target'): src.write('#define RESTRICT __restrict__\n\n') diff --git a/Cxx11/nstream-rangefor.cc b/Cxx11/nstream-ranges.cc similarity index 100% rename from Cxx11/nstream-rangefor.cc rename to Cxx11/nstream-ranges.cc diff --git a/Cxx11/stencil-rangefor.cc b/Cxx11/stencil-ranges.cc similarity index 100% rename from Cxx11/stencil-rangefor.cc rename to Cxx11/stencil-ranges.cc diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_ranges.hpp similarity index 100% rename from Cxx11/stencil_rangefor.hpp rename to Cxx11/stencil_ranges.hpp diff --git a/Cxx11/transpose-rangefor.cc b/Cxx11/transpose-ranges.cc similarity index 100% rename from Cxx11/transpose-rangefor.cc rename to Cxx11/transpose-ranges.cc From 3a4aeec1defbd656ab5059780a56563bd5182fa2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 12:41:11 -0700 Subject: [PATCH 037/325] s/range-for/ranges/g --- Cxx11/nstream-ranges.cc | 2 +- Cxx11/stencil-ranges.cc | 4 +-- Cxx11/transpose-ranges.cc | 70 +++++++++++++++++++++++++++++++-------- 3 files changed, 60 insertions(+), 16 deletions(-) diff --git a/Cxx11/nstream-ranges.cc b/Cxx11/nstream-ranges.cc index 3bb522f8f..7a4e3da84 100644 --- a/Cxx11/nstream-ranges.cc +++ b/Cxx11/nstream-ranges.cc @@ -68,7 +68,7 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/range-for STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "C++11/ranges STREAM triad: A = B + scalar * C" << std::endl; ////////////////////////////////////////////////////////////////////// /// Read and test input parameters diff --git a/Cxx11/stencil-ranges.cc b/Cxx11/stencil-ranges.cc index 82661d996..1566a8d19 100644 --- a/Cxx11/stencil-ranges.cc +++ b/Cxx11/stencil-ranges.cc @@ -1,6 +1,6 @@ - /// /// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -76,7 +76,7 @@ void nothing(const int n, const int t, prk::vector & in, prk::vector3) ? std::atoi(argv[3]) : 32; + tile_size = (argc>3) ? 
std::atoi(argv[3]) : order; // a negative tile size means no tiling of the local transpose if (tile_size <= 0) tile_size = order; + // + if ((tile_size < order) && (order % tile_size)) { + throw "ERROR: tile size must evenly divide order"; + } } catch (const char * e) { std::cout << e << std::endl; @@ -109,24 +117,60 @@ int main(int argc, char * argv[]) // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); - auto itrange = prk::range(0,order,tile_size); - auto jtrange = prk::range(0,order,tile_size); + // untiled + auto v = ranges::views::cartesian_product( + ranges::views::iota(0, order), + ranges::views::iota(0, order) + ); + + // tiled: s is the strided (outer) view and t is the tile (inner) view + auto s = ranges::views::cartesian_product( + ranges::stride_view(ranges::views::iota(0, order), tile_size), + ranges::stride_view(ranges::views::iota(0, order), tile_size) + ); + auto t = ranges::views::cartesian_product( + ranges::views::iota(0, tile_size), + ranges::views::iota(0, tile_size) + ); for (int iter = 0; iter<=iterations; iter++) { if (iter==1) trans_time = prk::wtime(); - for (auto it : itrange) { - auto irange = prk::range(it,std::min(order,it+tile_size)); - for (auto jt : jtrange) { - auto jrange = prk::range(jt,std::min(order,jt+tile_size)); - for (auto i : irange) { - for (auto j : jrange) { - B[i*order+j] += A[j*order+i]; - A[j*order+i] += 1.0; + if (tile_size < order) { +#if USE_FOR_EACH_RANGES + std::for_each(std::begin(s), std::end(s), [=,&A,&B] (auto itjt) { + std::for_each(std::begin(t), std::end(t), [=,&A,&B] (auto ij) { + auto [it, jt] = itjt; + auto [i, j] = ij; + B[(it+i)*order+(jt+j)] += A[(jt+j)*order+(it+i)]; + A[(jt+j)*order+(it+i)] += 1.0; + }); + }); +#else + for (auto itjt : s) { + auto [it, jt] = itjt; + for (auto ij : t) { + auto [i, j] = ij; + B[(it+i)*order+(jt+j)] += A[(jt+j)*order+(it+i)]; + A[(jt+j)*order+(it+i)] += 1.0; } - } } +#endif + } else { +#if USE_FOR_EACH_RANGES + std::for_each(std::begin(v), std::end(v), [=,&A,&B] (auto ij) { + auto [i, j] = ij; + B[i*order+j] += A[j*order+i]; + A[j*order+i] += 1.0; + }); +#else + for (auto ij : v) { + auto [i, j] = ij; + B[i*order+j] += A[j*order+i]; + A[j*order+i] += 1.0; + } +#endif } } trans_time = prk::wtime() - trans_time; From bf1c88a73e90ba127f3e2a7f99bd219e7f40a072 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 13:20:05 -0700 Subject: [PATCH 038/325] ignore dSYM explicitly --- .gitignore | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/.gitignore b/.gitignore index 762976d59..209be91a0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,8 @@ octave-workspace # Octave crashes */*/*/*.swp *.swo # Vim *.dSYM # Mac +*/*.dSYM # Mac +*/*/*.dSYM # Mac *.optrpt # Intel compiler *__genmod.* # Intel Fortran compiler *.patch # patch files @@ -368,3 +370,41 @@ SERIAL/Sparse/sparse SERIAL/Stencil/stencil SERIAL/Synch_p2p/p2p SERIAL/Transpose/transpose +dgemm-vector.dSYM +dgemm.dSYM +nstream-opencl.dSYM +nstream-openmp-target.dSYM +nstream-openmp.dSYM +nstream-ranges.dSYM +nstream-stl.dSYM +nstream-taskloop.dSYM +nstream-valarray.dSYM +nstream-vector.dSYM +nstream.dSYM +p2p-hyperplane-openmp.dSYM +p2p-hyperplane-vector.dSYM +p2p-innerloop-opencl.dSYM +p2p-tasks-openmp.dSYM +p2p-vector.dSYM +p2p.dSYM +sparse-vector.dSYM +sparse.dSYM +stencil-opencl.dSYM +stencil-openmp-target.dSYM +stencil-openmp.dSYM +stencil-ranges.dSYM +stencil-stl.dSYM +stencil-taskloop.dSYM +stencil-vector.dSYM +stencil.dSYM 
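As an aside to the transpose-ranges rewrite above: the new loops walk a 2D index space with range-v3's cartesian_product and structured bindings instead of nested prk::range loops. A minimal sketch of the untiled idiom follows, assuming range-v3 is on the include path (the RANGEFLAG switch later in this series points the build at ./range-v3/include); the function name and std::vector are illustrative stand-ins, not the benchmark's actual setup.

#include <vector>
#include <range/v3/view/cartesian_product.hpp>
#include <range/v3/view/iota.hpp>

// One transpose-and-increment pass over an order x order matrix pair,
// mirroring the untiled branch of transpose-ranges.cc above.
void transpose_once(int order, std::vector<double>& A, std::vector<double>& B)
{
    auto v = ranges::views::cartesian_product(ranges::views::iota(0, order),
                                              ranges::views::iota(0, order));
    for (auto ij : v) {
        auto [i, j] = ij;              // one (i,j) pair per element of the product view
        B[i*order+j] += A[j*order+i];  // accumulate the transpose of A into B
        A[j*order+i] += 1.0;           // perturb A so successive passes differ
    }
}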
+transpose-async.dSYM +transpose-opencl.dSYM +transpose-openmp-target.dSYM +transpose-openmp.dSYM +transpose-ranges.dSYM +transpose-stl.dSYM +transpose-taskloop.dSYM +transpose-thread.dSYM +transpose-valarray.dSYM +transpose-vector.dSYM +transpose.dSYM From 29df9f20330c6bf5549cc316ab3a82b7d4e998b2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 13:20:56 -0700 Subject: [PATCH 039/325] remove PRAGMA_SIMD --- Cxx11/stencil_seq.hpp | 10 ---------- Cxx11/stencil_vector.hpp | 10 ---------- 2 files changed, 20 deletions(-) diff --git a/Cxx11/stencil_seq.hpp b/Cxx11/stencil_seq.hpp index d03ec4b1a..90500de4d 100644 --- a/Cxx11/stencil_seq.hpp +++ b/Cxx11/stencil_seq.hpp @@ -2,7 +2,6 @@ void star1(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, prk::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector Date: Mon, 29 Mar 2021 13:21:07 -0700 Subject: [PATCH 040/325] redo stencil-ranges --- Cxx11/generate-cxx-stencil.py | 47 ++- Cxx11/stencil-ranges.cc | 26 +- Cxx11/stencil_ranges.hpp | 605 +++++++++++++++++++++++++++++----- 3 files changed, 576 insertions(+), 102 deletions(-) diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index b6cb34ba7..00095484e 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -52,6 +52,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') src.write(' }\n') + src.write('}\n\n') elif (model=='taskloop'): src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) {\n') src.write(' OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )\n') @@ -65,6 +66,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') src.write(' }\n') + src.write('}\n\n') elif (model=='target'): src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n') src.write(' OMP_TARGET( teams distribute parallel for simd collapse(2) )\n') @@ -73,26 +75,41 @@ def codegen(src,pattern,stencil_size,radius,W,model): bodygen(src,pattern,stencil_size,radius,W,model) src.write(' }\n') src.write(' }\n') + src.write('}\n\n') elif (model=='ranges'): + src.write('void '+pattern+str(radius)+'(const int n, prk::vector & in, prk::vector & out) {\n') + src.write(' auto dim = ranges::views::iota('+str(radius)+',n-'+str(radius)+');\n') + src.write(' auto inside = ranges::views::cartesian_product(dim,dim);\n') + src.write(' for (auto ij : inside) {\n') + src.write(' auto [i, j] = ij;\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' }\n') + src.write('}\n\n') src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out) {\n') - src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') - src.write(' for (auto i : inside) {\n') - src.write(' PRAGMA_SIMD\n') - src.write(' for (auto j : inside) {\n') + src.write(' auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t));\n') + src.write(' auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t));\n') + src.write(' 
const auto r = '+str(radius)+';\n') + src.write(' for (auto itjt : s2) {\n') + src.write(' auto [it, jt] = itjt;\n') + src.write(' for (auto iijj : t2) {\n') + src.write(' auto [ii, jj] = iijj;\n') + src.write(' auto i = ii + it;\n') + src.write(' auto j = jj + jt;\n') + src.write(' if (r <= i && i < n-r && r <= j && j < n-r) {\n') bodygen(src,pattern,stencil_size,radius,W,model) - src.write(' }\n') - src.write(' }\n') - src.write(' }\n') - src.write(' }\n') + src.write(' }\n') + src.write(' }\n') + src.write(' }\n') + src.write('}\n\n') elif (model=='stl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n') - #src.write(' PRAGMA_SIMD\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n') bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') src.write(' });\n') + src.write('}\n\n') elif (model=='pgnu'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') @@ -101,6 +118,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') src.write(' });\n') + src.write('}\n\n') elif (model=='pstl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') @@ -109,6 +127,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') src.write(' });\n') + src.write('}\n\n') elif (model=='raja'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' RAJA::RangeSegment inside('+str(radius)+',n-'+str(radius)+');\n') @@ -117,6 +136,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') src.write(' });\n') + src.write('}\n\n') elif (model=='rajaview'): src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n') src.write(' RAJA::RangeSegment inner1('+str(radius)+',n-'+str(radius)+');\n') @@ -124,6 +144,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' RAJA::kernel(inner2, [=](int i, int j) {\n') bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') + src.write('}\n\n') elif (model=='tbb'): src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out) {\n') src.write(' tbb::blocked_range2d range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n') @@ -135,12 +156,14 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') src.write(' }, tbb_partitioner );\n') + src.write('}\n\n') elif (model=='kokkos'): src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n') src.write(' auto inside = Kokkos::MDRangePolicy>({'+str(radius)+','+str(radius)+'},{n-'+str(radius)+',n-'+str(radius)+'},{t,t});\n') src.write(' Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {\n') bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') + src.write('}\n\n') elif (model=='cuda'): src.write('__global__ void 
'+pattern+str(radius)+'(const int n, const prk_float * in, prk_float * out) {\n') src.write(' const int i = blockIdx.x * blockDim.x + threadIdx.x;\n') @@ -148,31 +171,31 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' if ( ('+str(radius)+' <= i) && (i < n-'+str(radius)+') && ('+str(radius)+' <= j) && (j < n-'+str(radius)+') ) {\n') bodygen(src,pattern,stencil_size,radius,W,model) src.write(' }\n') + src.write('}\n\n') elif (model=='vector'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' for (int it='+str(radius)+'; it & in, prk::vector & out) {\n') src.write(' for (int it='+str(radius)+'; it & in, prk::vector & out) { @@ -89,7 +93,6 @@ int main(int argc, char* argv[]) throw "Usage: <# iterations> [ ]"; } - // number of times to run the algorithm iterations = std::atoi(argv[1]); if (iterations < 1) { throw "ERROR: iterations must be >= 1"; @@ -109,6 +112,9 @@ int main(int argc, char* argv[]) tile_size = std::atoi(argv[3]); if (tile_size <= 0) tile_size = n; if (tile_size > n) tile_size = n; + //if ((tile_size < n) && (n % tile_size)) { + // throw "ERROR: tile size must evenly divide grid dimension"; + //} } // stencil pattern @@ -167,25 +173,25 @@ int main(int argc, char* argv[]) prk::vector in(n*n); prk::vector out(n*n); - // initialize the input and output arrays - auto range = prk::range(0,n); - for (auto i : range) { - for (auto j : range) { + auto v = ranges::views::cartesian_product(ranges::views::iota(0, n),ranges::views::iota(0, n)); + + for (auto ij : v) { + auto [i, j] = ij; in[i*n+j] = static_cast(i+j); out[i*n+j] = 0.0; - } } for (int iter = 0; iter<=iterations; iter++) { if (iter==1) stencil_time = prk::wtime(); + // Apply the stencil operator stencil(n, tile_size, in, out); + // Add constant to solution to force refresh of neighbor data, if any - for (auto i : range) { - for (auto j : range) { + for (auto ij : v) { + auto [i, j] = ij; in[i*n+j] += 1.0; - } } } diff --git a/Cxx11/stencil_ranges.hpp b/Cxx11/stencil_ranges.hpp index f1d2c3e73..875a7675e 100644 --- a/Cxx11/stencil_ranges.hpp +++ b/Cxx11/stencil_ranges.hpp @@ -1,23 +1,62 @@ +void star1(const int n, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(1,n-1); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i)*n+(j-1)] * -0.5 + +in[(i-1)*n+(j)] * -0.5 + +in[(i+1)*n+(j)] * 0.5 + +in[(i)*n+(j+1)] * 0.5; + } +} + void star1(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(1,n-1); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 1; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i)*n+(j-1)] * -0.5 +in[(i-1)*n+(j)] * -0.5 +in[(i+1)*n+(j)] * 0.5 +in[(i)*n+(j+1)] * 0.5; - } - } - } - } + } + } + } +} + +void star2(const int n, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(2,n-2); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i)*n+(j-2)] * -0.125 + 
+in[(i)*n+(j-1)] * -0.25 + +in[(i-2)*n+(j)] * -0.125 + +in[(i-1)*n+(j)] * -0.25 + +in[(i+1)*n+(j)] * 0.25 + +in[(i+2)*n+(j)] * 0.125 + +in[(i)*n+(j+1)] * 0.25 + +in[(i)*n+(j+2)] * 0.125; + } } void star2(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(2,n-2); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 2; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i)*n+(j-2)] * -0.125 +in[(i)*n+(j-1)] * -0.25 +in[(i-2)*n+(j)] * -0.125 @@ -26,17 +65,42 @@ void star2(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(3,n-3); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i)*n+(j-3)] * -0.05555555555555555 + +in[(i)*n+(j-2)] * -0.08333333333333333 + +in[(i)*n+(j-1)] * -0.16666666666666666 + +in[(i-3)*n+(j)] * -0.05555555555555555 + +in[(i-2)*n+(j)] * -0.08333333333333333 + +in[(i-1)*n+(j)] * -0.16666666666666666 + +in[(i+1)*n+(j)] * 0.16666666666666666 + +in[(i+2)*n+(j)] * 0.08333333333333333 + +in[(i+3)*n+(j)] * 0.05555555555555555 + +in[(i)*n+(j+1)] * 0.16666666666666666 + +in[(i)*n+(j+2)] * 0.08333333333333333 + +in[(i)*n+(j+3)] * 0.05555555555555555; + } } void star3(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(3,n-3); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 3; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i)*n+(j-3)] * -0.05555555555555555 +in[(i)*n+(j-2)] * -0.08333333333333333 +in[(i)*n+(j-1)] * -0.16666666666666666 @@ -49,17 +113,46 @@ void star3(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(4,n-4); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125 + +in[(i)*n+(j-3)] * -0.041666666666666664 + +in[(i)*n+(j-2)] * -0.0625 + +in[(i)*n+(j-1)] * -0.125 + +in[(i-4)*n+(j)] * -0.03125 + +in[(i-3)*n+(j)] * -0.041666666666666664 + +in[(i-2)*n+(j)] * -0.0625 + +in[(i-1)*n+(j)] * -0.125 + +in[(i+1)*n+(j)] * 0.125 + +in[(i+2)*n+(j)] * 0.0625 + +in[(i+3)*n+(j)] * 0.041666666666666664 + +in[(i+4)*n+(j)] * 0.03125 + +in[(i)*n+(j+1)] * 0.125 + +in[(i)*n+(j+2)] * 0.0625 + +in[(i)*n+(j+3)] * 0.041666666666666664 + +in[(i)*n+(j+4)] * 0.03125; + } } void star4(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(4,n-4); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), 
t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 4; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125 +in[(i)*n+(j-3)] * -0.041666666666666664 +in[(i)*n+(j-2)] * -0.0625 @@ -76,17 +169,50 @@ void star4(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(5,n-5); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i)*n+(j-5)] * -0.02 + +in[(i)*n+(j-4)] * -0.025 + +in[(i)*n+(j-3)] * -0.03333333333333333 + +in[(i)*n+(j-2)] * -0.05 + +in[(i)*n+(j-1)] * -0.1 + +in[(i-5)*n+(j)] * -0.02 + +in[(i-4)*n+(j)] * -0.025 + +in[(i-3)*n+(j)] * -0.03333333333333333 + +in[(i-2)*n+(j)] * -0.05 + +in[(i-1)*n+(j)] * -0.1 + +in[(i+1)*n+(j)] * 0.1 + +in[(i+2)*n+(j)] * 0.05 + +in[(i+3)*n+(j)] * 0.03333333333333333 + +in[(i+4)*n+(j)] * 0.025 + +in[(i+5)*n+(j)] * 0.02 + +in[(i)*n+(j+1)] * 0.1 + +in[(i)*n+(j+2)] * 0.05 + +in[(i)*n+(j+3)] * 0.03333333333333333 + +in[(i)*n+(j+4)] * 0.025 + +in[(i)*n+(j+5)] * 0.02; + } } void star5(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(5,n-5); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 5; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i)*n+(j-5)] * -0.02 +in[(i)*n+(j-4)] * -0.025 +in[(i)*n+(j-3)] * -0.03333333333333333 @@ -107,17 +233,37 @@ void star5(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(1,n-1); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25 + +in[(i)*n+(j-1)] * -0.25 + +in[(i-1)*n+(j)] * -0.25 + +in[(i+1)*n+(j)] * 0.25 + +in[(i)*n+(j+1)] * 0.25 + +in[(i+1)*n+(j+1)] * 0.25 + ; + } } void grid1(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(1,n-1); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 1; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25 +in[(i)*n+(j-1)] * -0.25 +in[(i-1)*n+(j)] * -0.25 @@ -125,17 +271,51 @@ void grid1(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(2,n-2); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625 
+ +in[(i-1)*n+(j-2)] * -0.020833333333333332 + +in[(i)*n+(j-2)] * -0.020833333333333332 + +in[(i+1)*n+(j-2)] * -0.020833333333333332 + +in[(i-2)*n+(j-1)] * -0.020833333333333332 + +in[(i-1)*n+(j-1)] * -0.125 + +in[(i)*n+(j-1)] * -0.125 + +in[(i+2)*n+(j-1)] * 0.020833333333333332 + +in[(i-2)*n+(j)] * -0.020833333333333332 + +in[(i-1)*n+(j)] * -0.125 + +in[(i+1)*n+(j)] * 0.125 + +in[(i+2)*n+(j)] * 0.020833333333333332 + +in[(i-2)*n+(j+1)] * -0.020833333333333332 + +in[(i)*n+(j+1)] * 0.125 + +in[(i+1)*n+(j+1)] * 0.125 + +in[(i+2)*n+(j+1)] * 0.020833333333333332 + +in[(i-1)*n+(j+2)] * 0.020833333333333332 + +in[(i)*n+(j+2)] * 0.020833333333333332 + +in[(i+1)*n+(j+2)] * 0.020833333333333332 + +in[(i+2)*n+(j+2)] * 0.0625 + ; + } } void grid2(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(2,n-2); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 2; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625 +in[(i-1)*n+(j-2)] * -0.020833333333333332 +in[(i)*n+(j-2)] * -0.020833333333333332 @@ -157,17 +337,73 @@ void grid2(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(3,n-3); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.027777777777777776 + +in[(i-2)*n+(j-3)] * -0.005555555555555556 + +in[(i-1)*n+(j-3)] * -0.005555555555555556 + +in[(i)*n+(j-3)] * -0.005555555555555556 + +in[(i+1)*n+(j-3)] * -0.005555555555555556 + +in[(i+2)*n+(j-3)] * -0.005555555555555556 + +in[(i-3)*n+(j-2)] * -0.005555555555555556 + +in[(i-2)*n+(j-2)] * -0.041666666666666664 + +in[(i-1)*n+(j-2)] * -0.013888888888888888 + +in[(i)*n+(j-2)] * -0.013888888888888888 + +in[(i+1)*n+(j-2)] * -0.013888888888888888 + +in[(i+3)*n+(j-2)] * 0.005555555555555556 + +in[(i-3)*n+(j-1)] * -0.005555555555555556 + +in[(i-2)*n+(j-1)] * -0.013888888888888888 + +in[(i-1)*n+(j-1)] * -0.08333333333333333 + +in[(i)*n+(j-1)] * -0.08333333333333333 + +in[(i+2)*n+(j-1)] * 0.013888888888888888 + +in[(i+3)*n+(j-1)] * 0.005555555555555556 + +in[(i-3)*n+(j)] * -0.005555555555555556 + +in[(i-2)*n+(j)] * -0.013888888888888888 + +in[(i-1)*n+(j)] * -0.08333333333333333 + +in[(i+1)*n+(j)] * 0.08333333333333333 + +in[(i+2)*n+(j)] * 0.013888888888888888 + +in[(i+3)*n+(j)] * 0.005555555555555556 + +in[(i-3)*n+(j+1)] * -0.005555555555555556 + +in[(i-2)*n+(j+1)] * -0.013888888888888888 + +in[(i)*n+(j+1)] * 0.08333333333333333 + +in[(i+1)*n+(j+1)] * 0.08333333333333333 + +in[(i+2)*n+(j+1)] * 0.013888888888888888 + +in[(i+3)*n+(j+1)] * 0.005555555555555556 + +in[(i-3)*n+(j+2)] * -0.005555555555555556 + +in[(i-1)*n+(j+2)] * 0.013888888888888888 + +in[(i)*n+(j+2)] * 0.013888888888888888 + +in[(i+1)*n+(j+2)] * 0.013888888888888888 + +in[(i+2)*n+(j+2)] * 0.041666666666666664 + +in[(i+3)*n+(j+2)] * 0.005555555555555556 + +in[(i-2)*n+(j+3)] * 0.005555555555555556 + +in[(i-1)*n+(j+3)] * 0.005555555555555556 + +in[(i)*n+(j+3)] * 0.005555555555555556 + +in[(i+1)*n+(j+3)] * 0.005555555555555556 + +in[(i+2)*n+(j+3)] * 
0.005555555555555556 + +in[(i+3)*n+(j+3)] * 0.027777777777777776 + ; + } } void grid3(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(3,n-3); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 3; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.027777777777777776 +in[(i-2)*n+(j-3)] * -0.005555555555555556 +in[(i-1)*n+(j-3)] * -0.005555555555555556 @@ -211,17 +447,103 @@ void grid3(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(4,n-4); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625 + +in[(i-3)*n+(j-4)] * -0.002232142857142857 + +in[(i-2)*n+(j-4)] * -0.002232142857142857 + +in[(i-1)*n+(j-4)] * -0.002232142857142857 + +in[(i)*n+(j-4)] * -0.002232142857142857 + +in[(i+1)*n+(j-4)] * -0.002232142857142857 + +in[(i+2)*n+(j-4)] * -0.002232142857142857 + +in[(i+3)*n+(j-4)] * -0.002232142857142857 + +in[(i-4)*n+(j-3)] * -0.002232142857142857 + +in[(i-3)*n+(j-3)] * -0.020833333333333332 + +in[(i-2)*n+(j-3)] * -0.004166666666666667 + +in[(i-1)*n+(j-3)] * -0.004166666666666667 + +in[(i)*n+(j-3)] * -0.004166666666666667 + +in[(i+1)*n+(j-3)] * -0.004166666666666667 + +in[(i+2)*n+(j-3)] * -0.004166666666666667 + +in[(i+4)*n+(j-3)] * 0.002232142857142857 + +in[(i-4)*n+(j-2)] * -0.002232142857142857 + +in[(i-3)*n+(j-2)] * -0.004166666666666667 + +in[(i-2)*n+(j-2)] * -0.03125 + +in[(i-1)*n+(j-2)] * -0.010416666666666666 + +in[(i)*n+(j-2)] * -0.010416666666666666 + +in[(i+1)*n+(j-2)] * -0.010416666666666666 + +in[(i+3)*n+(j-2)] * 0.004166666666666667 + +in[(i+4)*n+(j-2)] * 0.002232142857142857 + +in[(i-4)*n+(j-1)] * -0.002232142857142857 + +in[(i-3)*n+(j-1)] * -0.004166666666666667 + +in[(i-2)*n+(j-1)] * -0.010416666666666666 + +in[(i-1)*n+(j-1)] * -0.0625 + +in[(i)*n+(j-1)] * -0.0625 + +in[(i+2)*n+(j-1)] * 0.010416666666666666 + +in[(i+3)*n+(j-1)] * 0.004166666666666667 + +in[(i+4)*n+(j-1)] * 0.002232142857142857 + +in[(i-4)*n+(j)] * -0.002232142857142857 + +in[(i-3)*n+(j)] * -0.004166666666666667 + +in[(i-2)*n+(j)] * -0.010416666666666666 + +in[(i-1)*n+(j)] * -0.0625 + +in[(i+1)*n+(j)] * 0.0625 + +in[(i+2)*n+(j)] * 0.010416666666666666 + +in[(i+3)*n+(j)] * 0.004166666666666667 + +in[(i+4)*n+(j)] * 0.002232142857142857 + +in[(i-4)*n+(j+1)] * -0.002232142857142857 + +in[(i-3)*n+(j+1)] * -0.004166666666666667 + +in[(i-2)*n+(j+1)] * -0.010416666666666666 + +in[(i)*n+(j+1)] * 0.0625 + +in[(i+1)*n+(j+1)] * 0.0625 + +in[(i+2)*n+(j+1)] * 0.010416666666666666 + +in[(i+3)*n+(j+1)] * 0.004166666666666667 + +in[(i+4)*n+(j+1)] * 0.002232142857142857 + +in[(i-4)*n+(j+2)] * -0.002232142857142857 + +in[(i-3)*n+(j+2)] * -0.004166666666666667 + +in[(i-1)*n+(j+2)] * 0.010416666666666666 + +in[(i)*n+(j+2)] * 0.010416666666666666 + +in[(i+1)*n+(j+2)] * 0.010416666666666666 + +in[(i+2)*n+(j+2)] * 0.03125 + +in[(i+3)*n+(j+2)] * 0.004166666666666667 + +in[(i+4)*n+(j+2)] * 0.002232142857142857 + +in[(i-4)*n+(j+3)] * -0.002232142857142857 + +in[(i-2)*n+(j+3)] * 
0.004166666666666667 + +in[(i-1)*n+(j+3)] * 0.004166666666666667 + +in[(i)*n+(j+3)] * 0.004166666666666667 + +in[(i+1)*n+(j+3)] * 0.004166666666666667 + +in[(i+2)*n+(j+3)] * 0.004166666666666667 + +in[(i+3)*n+(j+3)] * 0.020833333333333332 + +in[(i+4)*n+(j+3)] * 0.002232142857142857 + +in[(i-3)*n+(j+4)] * 0.002232142857142857 + +in[(i-2)*n+(j+4)] * 0.002232142857142857 + +in[(i-1)*n+(j+4)] * 0.002232142857142857 + +in[(i)*n+(j+4)] * 0.002232142857142857 + +in[(i+1)*n+(j+4)] * 0.002232142857142857 + +in[(i+2)*n+(j+4)] * 0.002232142857142857 + +in[(i+3)*n+(j+4)] * 0.002232142857142857 + +in[(i+4)*n+(j+4)] * 0.015625 + ; + } } void grid4(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(4,n-4); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 4; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625 +in[(i-3)*n+(j-4)] * -0.002232142857142857 +in[(i-2)*n+(j-4)] * -0.002232142857142857 @@ -295,17 +617,141 @@ void grid4(const int n, const int t, prk::vector & in, prk::vector & in, prk::vector & out) { + auto dim = ranges::views::iota(5,n-5); + auto inside = ranges::views::cartesian_product(dim,dim); + for (auto ij : inside) { + auto [i, j] = ij; + out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01 + +in[(i-4)*n+(j-5)] * -0.0011111111111111111 + +in[(i-3)*n+(j-5)] * -0.0011111111111111111 + +in[(i-2)*n+(j-5)] * -0.0011111111111111111 + +in[(i-1)*n+(j-5)] * -0.0011111111111111111 + +in[(i)*n+(j-5)] * -0.0011111111111111111 + +in[(i+1)*n+(j-5)] * -0.0011111111111111111 + +in[(i+2)*n+(j-5)] * -0.0011111111111111111 + +in[(i+3)*n+(j-5)] * -0.0011111111111111111 + +in[(i+4)*n+(j-5)] * -0.0011111111111111111 + +in[(i-5)*n+(j-4)] * -0.0011111111111111111 + +in[(i-4)*n+(j-4)] * -0.0125 + +in[(i-3)*n+(j-4)] * -0.0017857142857142857 + +in[(i-2)*n+(j-4)] * -0.0017857142857142857 + +in[(i-1)*n+(j-4)] * -0.0017857142857142857 + +in[(i)*n+(j-4)] * -0.0017857142857142857 + +in[(i+1)*n+(j-4)] * -0.0017857142857142857 + +in[(i+2)*n+(j-4)] * -0.0017857142857142857 + +in[(i+3)*n+(j-4)] * -0.0017857142857142857 + +in[(i+5)*n+(j-4)] * 0.0011111111111111111 + +in[(i-5)*n+(j-3)] * -0.0011111111111111111 + +in[(i-4)*n+(j-3)] * -0.0017857142857142857 + +in[(i-3)*n+(j-3)] * -0.016666666666666666 + +in[(i-2)*n+(j-3)] * -0.0033333333333333335 + +in[(i-1)*n+(j-3)] * -0.0033333333333333335 + +in[(i)*n+(j-3)] * -0.0033333333333333335 + +in[(i+1)*n+(j-3)] * -0.0033333333333333335 + +in[(i+2)*n+(j-3)] * -0.0033333333333333335 + +in[(i+4)*n+(j-3)] * 0.0017857142857142857 + +in[(i+5)*n+(j-3)] * 0.0011111111111111111 + +in[(i-5)*n+(j-2)] * -0.0011111111111111111 + +in[(i-4)*n+(j-2)] * -0.0017857142857142857 + +in[(i-3)*n+(j-2)] * -0.0033333333333333335 + +in[(i-2)*n+(j-2)] * -0.025 + +in[(i-1)*n+(j-2)] * -0.008333333333333333 + +in[(i)*n+(j-2)] * -0.008333333333333333 + +in[(i+1)*n+(j-2)] * -0.008333333333333333 + +in[(i+3)*n+(j-2)] * 0.0033333333333333335 + +in[(i+4)*n+(j-2)] * 0.0017857142857142857 + +in[(i+5)*n+(j-2)] * 0.0011111111111111111 + +in[(i-5)*n+(j-1)] * -0.0011111111111111111 + +in[(i-4)*n+(j-1)] * -0.0017857142857142857 + 
+in[(i-3)*n+(j-1)] * -0.0033333333333333335 + +in[(i-2)*n+(j-1)] * -0.008333333333333333 + +in[(i-1)*n+(j-1)] * -0.05 + +in[(i)*n+(j-1)] * -0.05 + +in[(i+2)*n+(j-1)] * 0.008333333333333333 + +in[(i+3)*n+(j-1)] * 0.0033333333333333335 + +in[(i+4)*n+(j-1)] * 0.0017857142857142857 + +in[(i+5)*n+(j-1)] * 0.0011111111111111111 + +in[(i-5)*n+(j)] * -0.0011111111111111111 + +in[(i-4)*n+(j)] * -0.0017857142857142857 + +in[(i-3)*n+(j)] * -0.0033333333333333335 + +in[(i-2)*n+(j)] * -0.008333333333333333 + +in[(i-1)*n+(j)] * -0.05 + +in[(i+1)*n+(j)] * 0.05 + +in[(i+2)*n+(j)] * 0.008333333333333333 + +in[(i+3)*n+(j)] * 0.0033333333333333335 + +in[(i+4)*n+(j)] * 0.0017857142857142857 + +in[(i+5)*n+(j)] * 0.0011111111111111111 + +in[(i-5)*n+(j+1)] * -0.0011111111111111111 + +in[(i-4)*n+(j+1)] * -0.0017857142857142857 + +in[(i-3)*n+(j+1)] * -0.0033333333333333335 + +in[(i-2)*n+(j+1)] * -0.008333333333333333 + +in[(i)*n+(j+1)] * 0.05 + +in[(i+1)*n+(j+1)] * 0.05 + +in[(i+2)*n+(j+1)] * 0.008333333333333333 + +in[(i+3)*n+(j+1)] * 0.0033333333333333335 + +in[(i+4)*n+(j+1)] * 0.0017857142857142857 + +in[(i+5)*n+(j+1)] * 0.0011111111111111111 + +in[(i-5)*n+(j+2)] * -0.0011111111111111111 + +in[(i-4)*n+(j+2)] * -0.0017857142857142857 + +in[(i-3)*n+(j+2)] * -0.0033333333333333335 + +in[(i-1)*n+(j+2)] * 0.008333333333333333 + +in[(i)*n+(j+2)] * 0.008333333333333333 + +in[(i+1)*n+(j+2)] * 0.008333333333333333 + +in[(i+2)*n+(j+2)] * 0.025 + +in[(i+3)*n+(j+2)] * 0.0033333333333333335 + +in[(i+4)*n+(j+2)] * 0.0017857142857142857 + +in[(i+5)*n+(j+2)] * 0.0011111111111111111 + +in[(i-5)*n+(j+3)] * -0.0011111111111111111 + +in[(i-4)*n+(j+3)] * -0.0017857142857142857 + +in[(i-2)*n+(j+3)] * 0.0033333333333333335 + +in[(i-1)*n+(j+3)] * 0.0033333333333333335 + +in[(i)*n+(j+3)] * 0.0033333333333333335 + +in[(i+1)*n+(j+3)] * 0.0033333333333333335 + +in[(i+2)*n+(j+3)] * 0.0033333333333333335 + +in[(i+3)*n+(j+3)] * 0.016666666666666666 + +in[(i+4)*n+(j+3)] * 0.0017857142857142857 + +in[(i+5)*n+(j+3)] * 0.0011111111111111111 + +in[(i-5)*n+(j+4)] * -0.0011111111111111111 + +in[(i-3)*n+(j+4)] * 0.0017857142857142857 + +in[(i-2)*n+(j+4)] * 0.0017857142857142857 + +in[(i-1)*n+(j+4)] * 0.0017857142857142857 + +in[(i)*n+(j+4)] * 0.0017857142857142857 + +in[(i+1)*n+(j+4)] * 0.0017857142857142857 + +in[(i+2)*n+(j+4)] * 0.0017857142857142857 + +in[(i+3)*n+(j+4)] * 0.0017857142857142857 + +in[(i+4)*n+(j+4)] * 0.0125 + +in[(i+5)*n+(j+4)] * 0.0011111111111111111 + +in[(i-4)*n+(j+5)] * 0.0011111111111111111 + +in[(i-3)*n+(j+5)] * 0.0011111111111111111 + +in[(i-2)*n+(j+5)] * 0.0011111111111111111 + +in[(i-1)*n+(j+5)] * 0.0011111111111111111 + +in[(i)*n+(j+5)] * 0.0011111111111111111 + +in[(i+1)*n+(j+5)] * 0.0011111111111111111 + +in[(i+2)*n+(j+5)] * 0.0011111111111111111 + +in[(i+3)*n+(j+5)] * 0.0011111111111111111 + +in[(i+4)*n+(j+5)] * 0.0011111111111111111 + +in[(i+5)*n+(j+5)] * 0.01 + ; + } } void grid5(const int n, const int t, prk::vector & in, prk::vector & out) { - auto inside = prk::range(5,n-5); - for (auto i : inside) { - PRAGMA_SIMD - for (auto j : inside) { + auto s2 = ranges::views::cartesian_product(ranges::stride_view(ranges::views::iota(0, n), t),ranges::stride_view(ranges::views::iota(0, n), t)); + auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),ranges::views::iota(0, t)); + const auto r = 5; + for (auto itjt : s2) { + auto [it, jt] = itjt; + for (auto iijj : t2) { + auto [ii, jj] = iijj; + auto i = ii + it; + auto j = jj + jt; + if (r <= i && i < n-r && r <= j && j < n-r) { out[i*n+j] += 
+in[(i-5)*n+(j-5)] * -0.01 +in[(i-4)*n+(j-5)] * -0.0011111111111111111 +in[(i-3)*n+(j-5)] * -0.0011111111111111111 @@ -417,9 +863,8 @@ void grid5(const int n, const int t, prk::vector & in, prk::vector Date: Mon, 29 Mar 2021 13:21:22 -0700 Subject: [PATCH 041/325] ignore new ranges binaries --- .gitignore | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 209be91a0..77276add0 100644 --- a/.gitignore +++ b/.gitignore @@ -144,7 +144,7 @@ Cxx11/nstream-openmp Cxx11/nstream-openmp-target Cxx11/nstream-pstl Cxx11/nstream-raja -Cxx11/nstream-rangefor +Cxx11/nstream-ranges Cxx11/nstream-stl Cxx11/nstream-sycl Cxx11/nstream-sycl-explicit @@ -210,7 +210,7 @@ Cxx11/stencil-openmp Cxx11/stencil-openmp-target Cxx11/stencil-pstl Cxx11/stencil-raja -Cxx11/stencil-rangefor +Cxx11/stencil-ranges Cxx11/stencil-stl Cxx11/stencil-sycl Cxx11/stencil-sycl-explicit @@ -239,7 +239,7 @@ Cxx11/transpose-openmp Cxx11/transpose-openmp-target Cxx11/transpose-pstl Cxx11/transpose-raja -Cxx11/transpose-rangefor +Cxx11/transpose-ranges Cxx11/transpose-stl Cxx11/transpose-sycl Cxx11/transpose-sycl-explicit From e52625bb1f0b184d0130c22875e0173080b9dfd6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 13:28:24 -0700 Subject: [PATCH 042/325] build updates --- common/make.defs.llvm | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 358deef4b..583d9a668 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -156,23 +156,23 @@ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2020_U1 +TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. 
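The generated stencil bodies above all share one traversal skeleton: an outer cartesian product of strided iota views enumerates tile origins, an inner product of plain iota views enumerates offsets inside a tile, and a radius guard keeps each point in the stencil interior. Stripped of the coefficient lists, that skeleton looks roughly like the sketch below; tiled_interior and apply_point are placeholder names, not symbols emitted by generate-cxx-stencil.py.

#include <range/v3/view/cartesian_product.hpp>
#include <range/v3/view/iota.hpp>
#include <range/v3/view/stride.hpp>

template <typename F>
void tiled_interior(int n, int t, int radius, F apply_point)
{
    // outer view: tile origins, striding by t in each dimension
    auto s2 = ranges::views::cartesian_product(
                  ranges::stride_view(ranges::views::iota(0, n), t),
                  ranges::stride_view(ranges::views::iota(0, n), t));
    // inner view: offsets within one t-by-t tile
    auto t2 = ranges::views::cartesian_product(ranges::views::iota(0, t),
                                               ranges::views::iota(0, t));
    for (auto itjt : s2) {
        auto [it, jt] = itjt;
        for (auto iijj : t2) {
            auto [ii, jj] = iijj;
            auto i = it + ii;
            auto j = jt + jj;
            // the guard keeps (i,j) inside the stencil interior, so edge
            // tiles need no special casing
            if (radius <= i && i < n - radius && radius <= j && j < n - radius) {
                apply_point(i, j);   // e.g. one star/grid stencil update at (i,j)
            }
        }
    }
}

The guard makes edge tiles safe without clamping the inner ranges, at the cost of evaluating the bounds test for every point.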
# -#BOOSTFLAG=-I/usr/local/Cellar/boost/1.72.0/include -#BOOSTFLAG=-I/usr/include/boost169 -BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.72.0/include # old Homebrew +#BOOSTFLAG=-I/usr/include/boost169 # Linux +BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_2/include # new Homebrew BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 # triSYCL requires Boost SYCLFLAG+=${BOOSTFLAG} -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} PSTLFLAG+=-I./llvm-pstl/include -DLLVM_PSTL KOKKOSDIR=/opt/kokkos/clang @@ -233,8 +233,9 @@ ISPCFLAG=-O3 --target=host --opt=fast-math MPIDIR=/usr MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include -MPILIB=-L${MPIDIR}/lib -lmpi +MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi #MPIINC=-I/usr/include/mpich-3.2-x86_64 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi @@ -248,7 +249,8 @@ GAFLAG+=-L${GADIR}/../armci-mpi/lib -larmci # ARMCI-MPI #GAFLAG+=-L${GADIR}/lib -larmci -lcomex # ARMCI/ComEx GAFLAG+=${MPIINC} ${MPILIB} GAFLAG+=-lmpifort -lmpi -GAFLAG+=-i8 # GA is compiled with -i8 on 64-bit systems +GAFLAG+=${BLASFLAG} +GAFLAG+=-fdefault-integer-8 # GA is compiled with 64b integers on 64-bit systems # # PETSc # @@ -257,6 +259,14 @@ PETSCFLAG=-I${PETSCDIR}/include PETSCFLAG+=-L${PETSCDIR}/lib -lpetsc PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib # +# Fortran 2008 coarrays +# +# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details +# single-node +#COARRAYFLAG=-fcoarray=single -lcaf_single +# multi-node +COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +# # MEMKIND (used in C1z) # MEMKINDDIR=/home/parallels/PRK/deps From 2b4b0a37167f75a401ee09b0e34ee38f9bd28ae6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Mar 2021 13:26:39 -0700 Subject: [PATCH 043/325] this seems to work better --- FORTRAN/stencil-openacc.F90 | 179 +++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 84 deletions(-) diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index 84d5969c7..02a1eecca 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2013, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -100,6 +101,86 @@ subroutine initialize_w(is_star,r,W) endif end subroutine initialize_w +subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) + use iso_fortran_env + implicit none + logical, intent(in) :: is_star, tiling + integer(kind=INT32), intent(in) :: tile_size, r, n + real(kind=REAL64), intent(in) :: W(-r:r,-r:r) + real(kind=REAL64), intent(in) :: A(n,n) + real(kind=REAL64), intent(inout) :: B(n,n) + integer(kind=INT32) :: i, j, ii, jj, it, jt + !$acc data pcopyin(W,A) pcopy(B) + if (is_star) then + if (.not.tiling) then + !$acc parallel loop collapse(2) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) + enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo + enddo + enddo + else ! 
tiling + !$acc parallel loop gang collapse(2) + do jt=r,n-r-1,tile_size + do it=r,n-r-1,tile_size + !$acc loop vector collapse(2) + do j=jt,min(n-r-1,jt+tile_size-1) + do i=it,min(n-r-1,it+tile_size-1) + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) + enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo + enddo + enddo + enddo + enddo + endif ! tiling + else ! grid + if (.not.tiling) then + !$acc parallel loop collapse(2) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + do ii=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) + enddo + enddo + enddo + enddo + else ! tiling + !$acc parallel loop gang collapse(2) + do jt=r,n-r-1,tile_size + do it=r,n-r-1,tile_size + !$acc loop vector collapse(2) + do j=jt,min(n-r-1,jt+tile_size-1) + do i=it,min(n-r-1,it+tile_size-1) + do jj=-r,r + do ii=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) + enddo + enddo + enddo + enddo + enddo + enddo + endif ! tiling + endif ! star + !$acc end data +end subroutine apply_stencil + program main use iso_fortran_env implicit none @@ -121,7 +202,6 @@ program main real(kind=REAL64), parameter :: cx=1.d0, cy=1.d0 ! runtime variables integer(kind=INT32) :: i, j, k - integer(kind=INT32) :: ii, jj, it, jt integer(kind=INT64) :: flops ! floating point ops per iteration real(kind=REAL64) :: norm, reference_norm ! L1 norm of solution integer(kind=INT64) :: active_points ! interior of grid with respect to stencil @@ -224,104 +304,36 @@ program main call initialize_w(is_star,r,W) - ! HOST - !$acc parallel loop gang + !$acc data pcopyin(W,A) pcopy(B) + + !$acc parallel loop collapse(2) do j=1,n - !$acc loop vector do i=1,n A(i,j) = cx*i+cy*j B(i,j) = 0.d0 enddo enddo - t0 = 0.0d0 - - !$acc data pcopyin(W,A) pcopy(B) + t0 = 0 do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() - !call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - if (is_star) then - if (.not.tiling) then - !$acc parallel loop gang collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - !$acc loop vector - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - !$acc loop vector - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - !$acc loop vector - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - enddo - enddo - else ! tiling - !$acc parallel loop gang ! collapse(2) leads to incorrect results - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(3) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - enddo - enddo - enddo - enddo - endif ! tiling - else ! grid - if (.not.tiling) then - !$acc parallel loop gang ! collapse(2) leads to incorrect results - do j=r,n-r-1 - do i=r,n-r-1 - !$acc loop vector collapse(2) - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo - enddo - enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(4) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo - enddo - enddo - enddo - enddo - endif ! tiling - endif ! 
star - !$acc parallel loop gang + ! DEVICE + ! Apply the stencil operator + call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) + + ! DEVICE + ! add constant to solution to force refresh of neighbor data, if any + !$acc parallel loop collapse(2) do j=1,n - !$acc loop vector do i=1,n A(i,j) = A(i,j) + 1.d0 enddo enddo - enddo + + enddo ! iterations t1 = prk_get_wtime() @@ -329,9 +341,8 @@ program main stencil_time = t1 - t0 - !$acc parallel loop reduction(+:norm) + !$acc parallel loop collapse(2) reduction(+:norm) do j=r,n-r - !$acc loop reduction(+:norm) do i=r,n-r norm = norm + abs(B(i,j)) enddo From 3227b610b8980c4fdce91f817dd2a88819852288 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Mar 2021 13:31:21 -0700 Subject: [PATCH 044/325] this appears to be working better --- FORTRAN/stencil-openmp-target.F90 | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/FORTRAN/stencil-openmp-target.F90 b/FORTRAN/stencil-openmp-target.F90 index a95d45809..96ed3462f 100644 --- a/FORTRAN/stencil-openmp-target.F90 +++ b/FORTRAN/stencil-openmp-target.F90 @@ -101,9 +101,10 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) real(kind=REAL64), intent(in) :: A(n,n) real(kind=REAL64), intent(inout) :: B(n,n) integer(kind=INT32) :: i, j, ii, jj, it, jt + !$omp target data use_device_addr(A,B,W) if (is_star) then if (.not.tiling) then - !$omp target teams distribute parallel do simd collapse(2) schedule(static,1) + !$omp target teams distribute parallel do simd collapse(2) GPU_SCHEDULE do j=r,n-r-1 do i=r,n-r-1 do jj=-r,r @@ -122,7 +123,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) !$omp target teams distribute collapse(2) do jt=r,n-r-1,tile_size do it=r,n-r-1,tile_size - !$omp parallel do simd collapse(2) schedule(static,1) + !$omp parallel do simd collapse(2) GPU_SCHEDULE do j=jt,min(n-r-1,jt+tile_size-1) do i=it,min(n-r-1,it+tile_size-1) do jj=-r,r @@ -143,7 +144,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) endif ! tiling else ! grid if (.not.tiling) then - !$omp target teams distribute parallel do simd collapse(2) schedule(static,1) + !$omp target teams distribute parallel do simd collapse(2) GPU_SCHEDULE do j=r,n-r-1 do i=r,n-r-1 do jj=-r,r @@ -158,7 +159,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) !$omp target teams distribute collapse(2) do jt=r,n-r-1,tile_size do it=r,n-r-1,tile_size - !$omp parallel do simd collapse(2) schedule(static,1) + !$omp parallel do simd collapse(2) GPU_SCHEDULE do j=jt,min(n-r-1,jt+tile_size-1) do i=it,min(n-r-1,it+tile_size-1) do jj=-r,r @@ -174,6 +175,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) !$omp end target teams distribute endif ! tiling endif ! star + !$omp end target data end subroutine apply_stencil program main @@ -197,6 +199,7 @@ program main real(kind=REAL64), parameter :: cx=1.d0, cy=1.d0 ! runtime variables integer(kind=INT32) :: i, j, k + integer(kind=INT32) :: ii, jj, it, jt integer(kind=INT64) :: flops ! floating point ops per iteration real(kind=REAL64) :: norm, reference_norm ! L1 norm of solution integer(kind=INT64) :: active_points ! interior of grid with respect to stencil @@ -279,7 +282,6 @@ program main norm = 0.d0 active_points = int(n-2*r,INT64)**2 - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8)') 'Grid size = ', n write(*,'(a,i8)') 'Radius of stencil = ', r @@ -325,7 +327,7 @@ program main ! DEVICE ! 
add constant to solution to force refresh of neighbor data, if any - !$omp target teams distribute parallel do simd collapse(2) schedule(static,1) + !$omp target teams distribute parallel do simd collapse(2) GPU_SCHEDULE do j=1,n do i=1,n A(i,j) = A(i,j) + 1.d0 @@ -336,10 +338,11 @@ program main enddo ! iterations t1 = omp_get_wtime() - stencil_time = t1 - t0 !$omp end target data + stencil_time = t1 - t0 + ! HOST ! compute L1 norm in parallel !$omp parallel do collapse(2) & From 8ad44411a7203b06be36374e265deadca6695223 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 2 Apr 2021 10:17:06 -0700 Subject: [PATCH 045/325] fix issue with missing class on Ubuntu error: Compile server encountered fatal condition: javax/tools/DiagnosticListener java.lang.ClassNotFoundException: javax.tools.DiagnosticListener https://stackoverflow.com/a/58253778 Signed-off-by: Jeff Hammond --- SCALA/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SCALA/README.md b/SCALA/README.md index 51f1641ee..638507ace 100644 --- a/SCALA/README.md +++ b/SCALA/README.md @@ -5,10 +5,10 @@ Just type `make` # How to run ``` -JAVA_OPTS="-Xmx4G" scala nstream 10 $((1024*1024*64)) +JAVA_OPTS="-Xmx4G" scala -nc nstream 10 $((1024*1024*64)) ``` -Note that the environmental variable JAVA_OPTS sets the maximum memory +Note that the environmental variable `JAVA_OPTS` sets the maximum memory used by Java to 4G, which is probably acceptable for most use cases. The default is quite low and will not allow you to run nstream with more than ~16MW. @@ -17,6 +17,6 @@ If you're interested in running in a script mode, simple specify the file name of the source code. ``` -JAVA_OPTS="-Xmx4G" scala nstream.scala 10 $((1024*1024*64)) +JAVA_OPTS="-Xmx4G" scala -nc nstream.scala 10 $((1024*1024*64)) ``` From c7263cf91f71c4dec6e4c078738e44eb99645c2c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 1 Apr 2021 12:03:48 -0700 Subject: [PATCH 046/325] remove Coriander workarounds i haven't used Coriander in years and working around it makes the code ugly. 
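The workarounds dropped in the next diff are mostly #ifndef __CORIANDERCC__ guards that fell back to plain heap allocation and skipped cudaDeviceSynchronize. For context only, a rough sketch of the two host-allocation paths those guards used to select between; the helper names here are hypothetical, and after this patch the real code simply calls cudaMallocHost unconditionally.

#include <cstddef>
#include <cuda_runtime.h>

// Pinned (page-locked) host memory can be transferred to and from the device
// faster than ordinary pageable memory; the Coriander fallback used new[].
double * allocate_host(size_t n, bool pinned)
{
    double * p = nullptr;
    if (pinned) {
        cudaMallocHost((void**)&p, n * sizeof(double));  // page-locked allocation
    } else {
        p = new double[n];                               // old fallback path
    }
    return p;
}

void release_host(double * p, bool pinned)
{
    if (pinned) cudaFreeHost(p);
    else        delete [] p;
}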
Signed-off-by: Jeff Hammond --- Cxx11/nstream-cuda.cu | 15 ++------------- Cxx11/prk_cuda.h | 7 ------- Cxx11/stencil-cuda.cu | 10 ++-------- Cxx11/transpose-cuda.cu | 19 +++---------------- 4 files changed, 7 insertions(+), 44 deletions(-) diff --git a/Cxx11/nstream-cuda.cu b/Cxx11/nstream-cuda.cu index f4076ed6d..610b1f9fb 100644 --- a/Cxx11/nstream-cuda.cu +++ b/Cxx11/nstream-cuda.cu @@ -142,15 +142,11 @@ int main(int argc, char * argv[]) prk_float * h_A; prk_float * h_B; prk_float * h_C; -#ifndef __CORIANDERCC__ + prk::CUDA::check( cudaMallocHost((void**)&h_A, bytes) ); prk::CUDA::check( cudaMallocHost((void**)&h_B, bytes) ); prk::CUDA::check( cudaMallocHost((void**)&h_C, bytes) ); -#else - h_A = new prk_float[length]; - h_B = new prk_float[length]; - h_C = new prk_float[length]; -#endif + for (int i=0; i(0); h_B[i] = static_cast(2); @@ -178,10 +174,7 @@ int main(int argc, char * argv[]) } else { nstream<<>>(static_cast(length), scalar, d_A, d_B, d_C); } -#ifndef __CORIANDERCC__ - // silence "ignoring cudaDeviceSynchronize for now" warning prk::CUDA::check( cudaDeviceSynchronize() ); -#endif } nstream_time = prk::wtime() - nstream_time; } @@ -192,10 +185,8 @@ int main(int argc, char * argv[]) prk::CUDA::check( cudaFree(d_B) ); prk::CUDA::check( cudaFree(d_A) ); -#ifndef __CORIANDERCC__ prk::CUDA::check( cudaFreeHost(h_B) ); prk::CUDA::check( cudaFreeHost(h_C) ); -#endif ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -215,9 +206,7 @@ int main(int argc, char * argv[]) asum += prk::abs(h_A[i]); } -#ifndef __CORIANDERCC__ prk::CUDA::check( cudaFreeHost(h_A) ); -#endif double epsilon=1.e-8; if (prk::abs(ar-asum)/asum > epsilon) { diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h index 65941cd02..eb08fd486 100644 --- a/Cxx11/prk_cuda.h +++ b/Cxx11/prk_cuda.h @@ -24,12 +24,7 @@ #endif #endif -#ifdef __CORIANDERCC__ -// Coriander does not support double -typedef float prk_float; -#else typedef double prk_float; -#endif namespace prk { @@ -105,7 +100,6 @@ namespace prk void print() { for (int i=0; i>>(n, d_in); -#ifndef __CORIANDERCC__ - // silence "ignoring cudaDeviceSynchronize for now" warning prk::CUDA::check( cudaDeviceSynchronize() ); -#endif } stencil_time = prk::wtime() - stencil_time; diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index a5822d143..2c62f3ef8 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -136,12 +136,6 @@ int main(int argc, char * argv[]) std::cout << "The results are probably going to be wrong; use tile_size<=32.\n"; } } -#endif -#ifdef __CORIANDERCC__ - // This has not been analyzed, but it is an empirical fact. 
- if (order > 1234) { - std::cout << "The results are probably going to be wrong, because order>1234.\n"; - } #endif } catch (const char * e) { @@ -175,13 +169,10 @@ int main(int argc, char * argv[]) const size_t bytes = nelems * sizeof(prk_float); prk_float * h_a; prk_float * h_b; -#ifndef __CORIANDERCC__ + prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) ); prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) ); -#else - h_a = new prk_float[nelems]; - h_b = new prk_float[nelems]; -#endif + // fill A with the sequence 0 to order^2-1 for (int j=0; j>>(order, d_a, d_b); -#ifndef __CORIANDERCC__ - // silence "ignoring cudaDeviceSynchronize for now" warning + prk::CUDA::check( cudaDeviceSynchronize() ); -#endif } trans_time = prk::wtime() - trans_time; @@ -242,10 +231,8 @@ int main(int argc, char * argv[]) std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif -#ifndef __CORIANDERCC__ prk::CUDA::check( cudaFreeHost(h_b) ); prk::CUDA::check( cudaFreeHost(h_a) ); -#endif const auto epsilon = 1.0e-8; if (abserr < epsilon) { From 596684f4725e834dbe6fb081fecf436952d3f659 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 3 Apr 2021 22:46:51 -0700 Subject: [PATCH 047/325] use wrappers on runtime functions in nstream CU versions --- Cxx11/nstream-cublas.cu | 55 ++++++++++++---------------- Cxx11/nstream-cuda.cu | 69 ++++++++++++++++------------------- Cxx11/nstream-managed-cuda.cu | 61 +++++++++++++++++-------------- Cxx11/prk_cuda.h | 57 +++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 97 deletions(-) diff --git a/Cxx11/nstream-cublas.cu b/Cxx11/nstream-cublas.cu index 5d02e8f43..b858b17c2 100644 --- a/Cxx11/nstream-cublas.cu +++ b/Cxx11/nstream-cublas.cu @@ -1,5 +1,6 @@ /// /// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -111,14 +112,9 @@ int main(int argc, char * argv[]) double nstream_time(0); - const size_t bytes = length * sizeof(double); - - double * h_A; - double * h_B; - double * h_C; - prk::CUDA::check( cudaMallocHost((void**)&h_A, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_B, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_C, bytes) ); + double * h_A = prk::CUDA::malloc_host(length); + double * h_B = prk::CUDA::malloc_host(length); + double * h_C = prk::CUDA::malloc_host(length); for (size_t i=0; i(length); + double * d_B = prk::CUDA::malloc_device(length); + double * d_C = prk::CUDA::malloc_device(length); + + prk::CUDA::copyH2D(d_A, h_A, length); + prk::CUDA::copyH2D(d_B, h_B, length); + prk::CUDA::copyH2D(d_C, h_C, length); double scalar(3); { for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) nstream_time = prk::wtime(); + if (iter==1) { + prk::CUDA::sync(); + nstream_time = prk::wtime(); + } double one(1); prk::CUDA::check( cublasDaxpy(h, length, @@ -151,23 +148,16 @@ int main(int argc, char * argv[]) &scalar, // alpha d_C, 1, // x, incx d_A, 1) ); // y, incy - - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } nstream_time = prk::wtime() - nstream_time; } - prk::CUDA::check( cudaMemcpy(&(h_A[0]), d_A, bytes, cudaMemcpyDeviceToHost) ); + prk::CUDA::copyD2H(h_A, d_A, length); - prk::CUDA::check( cudaFree(d_C) ); - prk::CUDA::check( cudaFree(d_B) ); - prk::CUDA::check( cudaFree(d_A) ); - - prk::CUDA::check( cudaFreeHost(h_B) ); - prk::CUDA::check( cudaFreeHost(h_C) ); - - prk::CUDA::check( cublasDestroy(h) ); - 
//prk::CUDA::check( cublasShutdown() ); + prk::CUDA::free(d_A); + prk::CUDA::free(d_B); + prk::CUDA::free(d_C); ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -179,7 +169,6 @@ int main(int argc, char * argv[]) for (int i=0; i<=iterations; i++) { ar += br + scalar * cr; } - ar *= length; double asum(0); @@ -187,7 +176,9 @@ int main(int argc, char * argv[]) asum += prk::abs(h_A[i]); } - prk::CUDA::check( cudaFreeHost(h_A) ); + prk::CUDA::free_host(h_A); + prk::CUDA::free_host(h_B); + prk::CUDA::free_host(h_C); double epsilon=1.e-8; if (prk::abs(ar-asum)/asum > epsilon) { diff --git a/Cxx11/nstream-cuda.cu b/Cxx11/nstream-cuda.cu index 610b1f9fb..974fda1ab 100644 --- a/Cxx11/nstream-cuda.cu +++ b/Cxx11/nstream-cuda.cu @@ -1,5 +1,6 @@ /// /// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -64,7 +65,7 @@ #include "prk_util.h" #include "prk_cuda.h" -__global__ void nstream(const unsigned n, const prk_float scalar, prk_float * A, const prk_float * B, const prk_float * C) +__global__ void nstream(const unsigned n, const double scalar, double * A, const double * B, const double * C) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -72,7 +73,7 @@ __global__ void nstream(const unsigned n, const prk_float scalar, prk_float * A, } } -__global__ void nstream2(const unsigned n, const prk_float scalar, prk_float * A, const prk_float * B, const prk_float * C) +__global__ void nstream2(const unsigned n, const double scalar, double * A, const double * B, const double * C) { for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { A[i] += B[i] + scalar * C[i]; @@ -138,55 +139,48 @@ int main(int argc, char * argv[]) double nstream_time(0); - const size_t bytes = length * sizeof(prk_float); - prk_float * h_A; - prk_float * h_B; - prk_float * h_C; + double * h_A = prk::CUDA::malloc_host(length); + double * h_B = prk::CUDA::malloc_host(length); + double * h_C = prk::CUDA::malloc_host(length); - prk::CUDA::check( cudaMallocHost((void**)&h_A, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_B, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_C, bytes) ); - - for (int i=0; i(0); - h_B[i] = static_cast(2); - h_C[i] = static_cast(2); + for (size_t i=0; i(length); + double * d_B = prk::CUDA::malloc_device(length); + double * d_C = prk::CUDA::malloc_device(length); + + prk::CUDA::copyH2D(d_A, h_A, length); + prk::CUDA::copyH2D(d_B, h_B, length); + prk::CUDA::copyH2D(d_C, h_C, length); + + double scalar(3); { for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) nstream_time = prk::wtime(); + if (iter==1) { + prk::CUDA::sync(); + nstream_time = prk::wtime(); + } if (grid_stride) { nstream2<<>>(static_cast(length), scalar, d_A, d_B, d_C); } else { nstream<<>>(static_cast(length), scalar, d_A, d_B, d_C); } - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } nstream_time = prk::wtime() - nstream_time; } - prk::CUDA::check( cudaMemcpy(&(h_A[0]), d_A, bytes, cudaMemcpyDeviceToHost) ); - - prk::CUDA::check( cudaFree(d_C) ); - prk::CUDA::check( cudaFree(d_B) ); - prk::CUDA::check( cudaFree(d_A) ); + prk::CUDA::copyD2H(h_A, d_A, length); - prk::CUDA::check( cudaFreeHost(h_B) ); - prk::CUDA::check( cudaFreeHost(h_C) ); + prk::CUDA::free(d_A); + prk::CUDA::free(d_B); + prk::CUDA::free(d_C); 
////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -198,7 +192,6 @@ int main(int argc, char * argv[]) for (int i=0; i<=iterations; i++) { ar += br + scalar * cr; } - ar *= length; double asum(0); @@ -206,7 +199,9 @@ int main(int argc, char * argv[]) asum += prk::abs(h_A[i]); } - prk::CUDA::check( cudaFreeHost(h_A) ); + prk::CUDA::free_host(h_A); + prk::CUDA::free_host(h_B); + prk::CUDA::free_host(h_C); double epsilon=1.e-8; if (prk::abs(ar-asum)/asum > epsilon) { @@ -219,7 +214,7 @@ int main(int argc, char * argv[]) } else { std::cout << "Solution validates" << std::endl; double avgtime = nstream_time/iterations; - double nbytes = 4.0 * length * sizeof(prk_float); + double nbytes = 4.0 * length * sizeof(double); std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime << " Avg time (s): " << avgtime << std::endl; } diff --git a/Cxx11/nstream-managed-cuda.cu b/Cxx11/nstream-managed-cuda.cu index c95769c72..d79b113c2 100644 --- a/Cxx11/nstream-managed-cuda.cu +++ b/Cxx11/nstream-managed-cuda.cu @@ -1,6 +1,9 @@ /// /// Copyright (c) 2017, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// + +// Copyright (c) 2021, NVIDIA /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions /// are met: @@ -64,7 +67,7 @@ #include "prk_util.h" #include "prk_cuda.h" -__global__ void nstream(const unsigned n, const prk_float scalar, prk_float * A, const prk_float * B, const prk_float * C) +__global__ void nstream(const unsigned n, const double scalar, double * A, const double * B, const double * C) { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -72,21 +75,21 @@ __global__ void nstream(const unsigned n, const prk_float scalar, prk_float * A, } } -__global__ void nstream2(const unsigned n, const prk_float scalar, prk_float * A, const prk_float * B, const prk_float * C) +__global__ void nstream2(const unsigned n, const double scalar, double * A, const double * B, const double * C) { for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { A[i] += B[i] + scalar * C[i]; } } -__global__ void fault_pages(const unsigned n, prk_float * A, prk_float * B, prk_float * C) +__global__ void fault_pages(const unsigned n, double * A, double * B, double * C) { - //const unsigned inc = 4096/sizeof(prk_float); + //const unsigned inc = 4096/sizeof(double); //for (unsigned int i = 0; i < n; i += inc) { for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { - A[i] = (prk_float)0; - B[i] = (prk_float)2; - C[i] = (prk_float)2; + A[i] = (double)0; + B[i] = (double)2; + C[i] = (double)2; } } @@ -149,51 +152,53 @@ int main(int argc, char * argv[]) double nstream_time(0); - prk_float * A; - prk_float * B; - prk_float * C; + double * A; + double * B; + double * C; - const size_t bytes = length * sizeof(prk_float); if (system_memory) { A = new double[length]; B = new double[length]; C = new double[length]; } else { - prk::CUDA::check( cudaMallocManaged((void**)&A, bytes) ); - prk::CUDA::check( cudaMallocManaged((void**)&B, bytes) ); - prk::CUDA::check( cudaMallocManaged((void**)&C, bytes) ); + A = prk::CUDA::malloc_managed(length); + B = prk::CUDA::malloc_managed(length); + C = prk::CUDA::malloc_managed(length); } // initialize on CPU to ensure pages are faulted there for (int i=0; i(0); - B[i] = static_cast(2); - C[i] = static_cast(2); + A[i] = static_cast(0); + B[i] = static_cast(2); + C[i] = 
static_cast(2); } if (ordered_fault) { fault_pages<<<1,1>>>(static_cast(length), A, B, C); - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } if (prefetch) { - prk::CUDA::check( cudaMemPrefetchAsync(A, bytes, 0) ); - prk::CUDA::check( cudaMemPrefetchAsync(B, bytes, 0) ); - prk::CUDA::check( cudaMemPrefetchAsync(C, bytes, 0) ); + prk::CUDA::prefetch(A, length); + prk::CUDA::prefetch(B, length); + prk::CUDA::prefetch(C, length); } - prk_float scalar(3); + double scalar(3); { for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) nstream_time = prk::wtime(); + if (iter==1) { + prk::CUDA::sync(); + nstream_time = prk::wtime(); + } if (grid_stride) { nstream2<<>>(static_cast(length), scalar, A, B, C); } else { nstream<<>>(static_cast(length), scalar, A, B, C); } - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } nstream_time = prk::wtime() - nstream_time; } @@ -220,9 +225,9 @@ int main(int argc, char * argv[]) free(B); free(C); } else { - prk::CUDA::check( cudaFree(A) ); - prk::CUDA::check( cudaFree(B) ); - prk::CUDA::check( cudaFree(C) ); + prk::CUDA::free(A); + prk::CUDA::free(B); + prk::CUDA::free(C); } double epsilon=1.e-8; @@ -236,7 +241,7 @@ int main(int argc, char * argv[]) } else { std::cout << "Solution validates" << std::endl; double avgtime = nstream_time/iterations; - double nbytes = 4.0 * length * sizeof(prk_float); + double nbytes = 4.0 * length * sizeof(double); std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime << " Avg time (s): " << avgtime << std::endl; } diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h index eb08fd486..4b0f71892 100644 --- a/Cxx11/prk_cuda.h +++ b/Cxx11/prk_cuda.h @@ -142,6 +142,63 @@ namespace prk } }; + template + T * malloc_device(size_t n) { + T * ptr; + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMalloc((void**)&ptr, bytes) ); + return ptr; + } + + template + T * malloc_host(size_t n) { + T * ptr; + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMallocHost((void**)&ptr, bytes) ); + return ptr; + } + + template + T * malloc_managed(size_t n) { + T * ptr; + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMallocManaged((void**)&ptr, bytes) ); + return ptr; + } + + template + void free(T * ptr) { + prk::CUDA::check( cudaFree((void*)ptr) ); + } + + template + void free_host(T * ptr) { + prk::CUDA::check( cudaFreeHost((void*)ptr) ); + } + + template + void copyD2H(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMemcpy(output, input, bytes, cudaMemcpyDeviceToHost) ); + } + + template + void copyH2D(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMemcpy(output, input, bytes, cudaMemcpyHostToDevice) ); + } + + template + void prefetch(T * ptr, size_t n, int device = 0) { + size_t bytes = n * sizeof(T); + //std::cout << "device=" << device << "\n"; + prk::CUDA::check( cudaMemPrefetchAsync(ptr, bytes, device) ); + } + + void sync(void) { + prk::CUDA::check( cudaDeviceSynchronize() ); + } + } // CUDA namespace } // prk namespace From c4000ee1021acba16cf69cc369222f7e84dbdd3e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 3 Apr 2021 23:07:31 -0700 Subject: [PATCH 048/325] use new stuff --- Cxx11/transpose-cublas.cu | 66 +++++++++++++++------------------------ Cxx11/transpose-cuda.cu | 53 +++++++++++++++---------------- 2 files changed, 51 insertions(+), 68 deletions(-) diff --git a/Cxx11/transpose-cublas.cu b/Cxx11/transpose-cublas.cu index 8e0ddfa5b..7f050ca28 100644 --- 
a/Cxx11/transpose-cublas.cu +++ b/Cxx11/transpose-cublas.cu @@ -105,63 +105,52 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// const size_t nelems = (size_t)order * (size_t)order; - const size_t bytes = nelems * sizeof(double); - double * h_a; - double * h_b; - prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) ); + double * h_a = prk::CUDA::malloc_host(nelems); + double * h_b = prk::CUDA::malloc_host(nelems); - // fill A with the sequence 0 to order^2-1 as doubles + // fill A with the sequence 0 to order^2-1 for (int j=0; j(order*j+i); + h_b[j*order+i] = static_cast(0); } } // copy input from host to device - double * d_a; - double * d_b; - prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) ); - prk::CUDA::check( cudaMemcpy(d_a, &(h_a[0]), bytes, cudaMemcpyHostToDevice) ); - prk::CUDA::check( cudaMemcpy(d_b, &(h_b[0]), bytes, cudaMemcpyHostToDevice) ); + double * d_a = prk::CUDA::malloc_device(nelems); + double * d_b = prk::CUDA::malloc_device(nelems); + + prk::CUDA::copyH2D(d_a, h_a, nelems); + prk::CUDA::copyH2D(d_b, h_b, nelems); #if CUBLAS_AXPY_BUG // We need a vector of ones because CUBLAS daxpy does not // correctly implement incx=0. - double * h_o; - prk::CUDA::check( cudaMallocHost((void**)&h_o, bytes) ); + double * h_o = prk::CUDA::malloc_host(nelems); for (int j=0; j(nelems); + prk::CUDA::copyH2D(d_o, h_o, nelems); #endif -#ifdef USE_HOST_BUFFERS - double p_a = h_a; - double p_b = h_b; -#if CUBLAS_AXPY_BUG - double p_o = h_o; -#endif -#else double * p_a = d_a; double * p_b = d_b; #if CUBLAS_AXPY_BUG double * p_o = d_o; -#endif #endif double trans_time{0}; for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) trans_time = prk::wtime(); + if (iter==1) { + prk::CUDA::sync(); + trans_time = prk::wtime(); + } double one(1); // B += trans(A) i.e. B = trans(A) + B @@ -191,22 +180,19 @@ int main(int argc, char * argv[]) // (Host buffer version) // The performance is ~10% better if this is done every iteration, // instead of only once before the timer is stopped. - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } trans_time = prk::wtime() - trans_time; - // copy output back to host - prk::CUDA::check( cudaMemcpy(&(h_b[0]), d_b, bytes, cudaMemcpyDeviceToHost) ); + prk::CUDA::copyD2H(h_b, d_b, nelems); #if CUBLAS_AXPY_BUG - prk::CUDA::check( cudaFree(d_o) ); - prk::CUDA::check( cudaFreeHost(h_o) ); + prk::CUDA::free(d_o); + prk::CUDA::free_host(h_o); #endif - prk::CUDA::check( cudaFree(d_b) ); - prk::CUDA::check( cudaFree(d_a) ); - - prk::CUDA::check( cudaFreeHost(h_a) ); + prk::CUDA::free(d_a); + prk::CUDA::free(d_b); prk::CUDA::check( cublasDestroy(h) ); //prk::CUDA::check( cublasShutdown() ); @@ -215,7 +201,6 @@ int main(int argc, char * argv[]) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - // TODO: replace with std::generate, std::accumulate, or similar const double addit = (iterations+1.) 
* (iterations/2.); double abserr(0); for (int j=0; j(nelems); + double * h_b = prk::CUDA::malloc_host(nelems); // fill A with the sequence 0 to order^2-1 for (int j=0; j(order*j+i); - h_b[j*order+i] = static_cast(0); + h_a[j*order+i] = static_cast(order*j+i); + h_b[j*order+i] = static_cast(0); } } // copy input from host to device - prk_float * d_a; - prk_float * d_b; - prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) ); - prk::CUDA::check( cudaMemcpy(d_a, &(h_a[0]), bytes, cudaMemcpyHostToDevice) ); - prk::CUDA::check( cudaMemcpy(d_b, &(h_b[0]), bytes, cudaMemcpyHostToDevice) ); + double * d_a = prk::CUDA::malloc_device(nelems); + double * d_b = prk::CUDA::malloc_device(nelems); + + prk::CUDA::copyH2D(d_a, h_a, nelems); + prk::CUDA::copyH2D(d_b, h_b, nelems); double trans_time{0}; for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) trans_time = prk::wtime(); + if (iter==1) { + prk::CUDA::sync(); + trans_time = prk::wtime(); + } transpose<<>>(order, d_a, d_b); - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } trans_time = prk::wtime() - trans_time; - // copy output back to host - prk::CUDA::check( cudaMemcpy(&(h_b[0]), d_b, bytes, cudaMemcpyDeviceToHost) ); + prk::CUDA::copyD2H(h_b, d_b, nelems); #ifdef VERBOSE - // copy input back to host - debug only - prk::CUDA::check( cudaMemcpy(&(h_a[0]), d_a, bytes, cudaMemcpyDeviceToHost) ); + prk::CUDA::copyD2H(h_a, d_a, nelems); #endif - prk::CUDA::check( cudaFree(d_b) ); - prk::CUDA::check( cudaFree(d_a) ); + prk::CUDA::free(d_a); + prk::CUDA::free(d_b); ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -231,14 +228,14 @@ int main(int argc, char * argv[]) std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - prk::CUDA::check( cudaFreeHost(h_b) ); - prk::CUDA::check( cudaFreeHost(h_a) ); + prk::CUDA::free_host(h_a); + prk::CUDA::free_host(h_b); const auto epsilon = 1.0e-8; if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; auto avgtime = trans_time/iterations; - auto bytes = (size_t)order * (size_t)order * sizeof(prk_float); + auto bytes = (size_t)order * (size_t)order * sizeof(double); std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime << " Avg time (s): " << avgtime << std::endl; } else { From a4c22c2a399d58f6814928968a3700b39000a9d4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 08:28:39 -0700 Subject: [PATCH 049/325] use double not auto --- Cxx11/transpose-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index 0f90075a6..e2e2c1260 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -231,7 +231,7 @@ int main(int argc, char * argv[]) prk::CUDA::free_host(h_a); prk::CUDA::free_host(h_b); - const auto epsilon = 1.0e-8; + const double epsilon = 1.0e-8; if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; auto avgtime = trans_time/iterations; From 96c1160015d186377402a1341ba38661c097616c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 08:28:51 -0700 Subject: [PATCH 050/325] add copyright --- Cxx11/transpose-cublas.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/Cxx11/transpose-cublas.cu b/Cxx11/transpose-cublas.cu index 7f050ca28..6e43c0b6a 100644 --- a/Cxx11/transpose-cublas.cu +++ b/Cxx11/transpose-cublas.cu @@ -1,5 +1,6 @@ /// /// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// 
/// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions From ca2cfb077ce1609eec54fd3a817fd0ff8ae05f81 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 08:28:59 -0700 Subject: [PATCH 051/325] add async copy --- Cxx11/prk_cuda.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h index 4b0f71892..a87aa68e3 100644 --- a/Cxx11/prk_cuda.h +++ b/Cxx11/prk_cuda.h @@ -188,6 +188,18 @@ namespace prk prk::CUDA::check( cudaMemcpy(output, input, bytes, cudaMemcpyHostToDevice) ); } + template + void copyD2Hasync(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMemcpyAsync(output, input, bytes, cudaMemcpyDeviceToHost) ); + } + + template + void copyH2Dasync(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::CUDA::check( cudaMemcpyAsync(output, input, bytes, cudaMemcpyHostToDevice) ); + } + template void prefetch(T * ptr, size_t n, int device = 0) { size_t bytes = n * sizeof(T); From 01dfac3e9334d98cd1c6e7511b2044af40aa13cd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 08:29:13 -0700 Subject: [PATCH 052/325] use prk::CUDA:: stuff --- Cxx11/dgemm-cublas.cu | 60 +++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu index a46b678a3..afbc7f071 100644 --- a/Cxx11/dgemm-cublas.cu +++ b/Cxx11/dgemm-cublas.cu @@ -1,5 +1,6 @@ /// /// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -135,7 +136,7 @@ void prk_dgemm(const cublasHandle_t & h, &beta, // beta pC, order) ); // C, ldc } - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); } void prk_bgemm(const cublasHandle_t & h, @@ -157,7 +158,7 @@ void prk_bgemm(const cublasHandle_t & h, &beta, C, order, order*order, batches) ); - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); // cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, // cublasOperation_t transa, @@ -252,23 +253,16 @@ int main(int argc, char * argv[]) const size_t bytes = nelems * sizeof(double); // host buffers - double * h_a; - double * h_b; - double * h_c; - prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_c, matrices*bytes) ); + double * h_a = prk::CUDA::malloc_host(nelems); + double * h_b = prk::CUDA::malloc_host(nelems); + double * h_c = prk::CUDA::malloc_host(matrices*nelems); // device buffers - double * d_a; - double * d_b; - double * d_c; - prk::CUDA::check( cudaMalloc((void**)&d_a, matrices*bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_b, matrices*bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_c, matrices*bytes) ); + double * d_a = prk::CUDA::malloc_device(matrices*nelems); + double * d_b = prk::CUDA::malloc_device(matrices*nelems); + double * d_c = prk::CUDA::malloc_device(matrices*nelems); if (input_copy) { - for (int i=0; i>>(order, matrices, d_c); @@ -289,22 +283,25 @@ int main(int argc, char * argv[]) init<<>>(order, matrices, d_a, d_b, d_c); } - prk::CUDA::check( cudaDeviceSynchronize() ); + prk::CUDA::sync(); double xfer(0); double comp(0); { for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) dgemm_time = 
prk::wtime(); + if (iter==1) { + prk::CUDA::sync(); + dgemm_time = prk::wtime(); + } if (input_copy) { double t0 = prk::wtime(); for (int b=0; b Date: Mon, 5 Apr 2021 08:42:28 -0700 Subject: [PATCH 053/325] more prk::CUDA wrapping; add set_device --- Cxx11/dgemm-cublas.cu | 2 -- Cxx11/dgemm-mpi-cublas.cu | 4 +-- Cxx11/dgemm-multigpu-cublas.cu | 54 ++++++++++++++++++---------------- Cxx11/prk_cuda.h | 4 +++ 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu index afbc7f071..d001e5a7c 100644 --- a/Cxx11/dgemm-cublas.cu +++ b/Cxx11/dgemm-cublas.cu @@ -223,7 +223,6 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; - if (batches == 0) { std::cout << "No batching" << std::endl; } else if (batches < 0) { @@ -250,7 +249,6 @@ int main(int argc, char * argv[]) const int matrices = (batches==0 ? 1 : abs(batches)); const size_t nelems = (size_t)order * (size_t)order; - const size_t bytes = nelems * sizeof(double); // host buffers double * h_a = prk::CUDA::malloc_host(nelems); diff --git a/Cxx11/dgemm-mpi-cublas.cu b/Cxx11/dgemm-mpi-cublas.cu index 061a5c16d..8f785b007 100644 --- a/Cxx11/dgemm-mpi-cublas.cu +++ b/Cxx11/dgemm-mpi-cublas.cu @@ -169,8 +169,8 @@ int main(int argc, char * argv[]) double * d_a; double * d_b; double * d_c; - prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) ); + d_a = prk::CUDA::malloc_device(order*order); + d_b = prk::CUDA::malloc_device(order*order); prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) ); init<<>>(order, d_a, d_b, d_c); diff --git a/Cxx11/dgemm-multigpu-cublas.cu b/Cxx11/dgemm-multigpu-cublas.cu index 2b58d5302..160a9d12c 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -1,5 +1,6 @@ /// /// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -132,6 +133,7 @@ void prk_bgemm(const cublasHandle_t & h, &beta, C, order, order*order, batches) ); + prk::CUDA::sync(); // cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, // cublasOperation_t transa, @@ -191,14 +193,14 @@ int main(int argc, char * argv[]) return 1; } - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; if (batches == 0) { std::cout << "No batching" << std::endl; } else if (batches < 0) { - std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl; + std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl; } else if (batches > 0) { - std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; } std::cout << "Number of GPUs to use = " << use_ngpu << std::endl; @@ -213,7 +215,7 @@ int main(int argc, char * argv[]) std::vector contexts(ngpus); for (int i=0; i h_c(ngpus,nullptr); for (int i=0; i(matrices*nelems); } // device buffers @@ -244,23 +245,26 @@ int main(int argc, char * argv[]) std::vector d_b(ngpus,nullptr); std::vector d_c(ngpus,nullptr); for (int i=0; i(matrices*nelems); + d_b[i] = prk::CUDA::malloc_device(matrices*nelems); + d_c[i] = 
prk::CUDA::malloc_device(matrices*nelems); init<<>>(order, matrices, d_a[i], d_b[i], d_c[i]); } for (int i=0; i Date: Mon, 5 Apr 2021 08:56:45 -0700 Subject: [PATCH 054/325] work around lack of exscan --- Cxx11/prk_mpi.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Cxx11/prk_mpi.h b/Cxx11/prk_mpi.h index 55f5785cd..15750b51e 100644 --- a/Cxx11/prk_mpi.h +++ b/Cxx11/prk_mpi.h @@ -207,8 +207,17 @@ namespace prk MPI_Datatype dt = (std::is_signed() ? MPI_INT64_T : MPI_UINT64_T); std::vector global_sizes(np_); // in global_offsets_.resize(np_); // out + // there is probably a better way to do this. i should be able to MPI_Exscan then MPI_Allgather instead. prk::MPI::check( MPI_Allgather(&local_size_, 1, dt, global_sizes.data(), 1, dt, comm_) ); +#if 0 std::exclusive_scan( global_sizes.cbegin(), global_sizes.cend(), global_offsets_.begin(), 0); +#else + global_offsets_[0] = 0; + for ( size_t i = 1 ; i < global_sizes.size() ; ++i ) { + global_offsets_[i] = global_sizes[i-1]; + + } +#endif } my_global_offset_begin_ = global_offsets_[me_]; my_global_offset_end_ = (me_ != np_-1) ? global_offsets_[me_+1] : global_size_; @@ -294,7 +303,7 @@ namespace prk return local_pointer_[local_offset]; } - constexpr T * data(void) noexcept + T * data(void) noexcept { return local_pointer_; } From 81eee24e9bf3ee8957ef9a1b5953f536f9e19197 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 08:56:53 -0700 Subject: [PATCH 055/325] add copyright --- Cxx11/dgemm-mpi-cublas.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/Cxx11/dgemm-mpi-cublas.cu b/Cxx11/dgemm-mpi-cublas.cu index 8f785b007..93cb7456a 100644 --- a/Cxx11/dgemm-mpi-cublas.cu +++ b/Cxx11/dgemm-mpi-cublas.cu @@ -1,5 +1,6 @@ /// /// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions From 1afb7f8e7e2e790850ddc25ca2bae69ecee13d77 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 10:38:27 -0700 Subject: [PATCH 056/325] add example Mac Homebrew build --- PYTHON/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PYTHON/README.md b/PYTHON/README.md index 05b08d11a..33354905f 100644 --- a/PYTHON/README.md +++ b/PYTHON/README.md @@ -4,3 +4,10 @@ mpiexec -n 4 python -m mpi4py nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 python -m mpi4py transpose-numpy-mpi.py 10 1000 ``` + +On Mac with Homebrew, this might work better: + +``` + mpiexec -n 4 ./nstream-numpy-mpi.py 10 10000000 + mpiexec -n 4 ./transpose-numpy-mpi.py 10 1000 +``` From 437aa73cb765beb1573ed2f62d2209be243bc0f0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 11:35:51 -0700 Subject: [PATCH 057/325] NVHPC C++ stdpar (#564) * update NVHPC example build * add C++ stdpar nstream and transpose * use Thrust zip_iterator in (nstream) * hacking around cartesian_product not being there (transpose) --- .gitignore | 4 + Cxx11/Makefile | 7 ++ Cxx11/nstream-stdpar.cc | 192 ++++++++++++++++++++++++++++++++++++++ Cxx11/prk_util.h | 6 +- Cxx11/transpose-stdpar.cc | 174 ++++++++++++++++++++++++++++++++++ common/make.defs.gcc | 10 +- common/make.defs.nvhpc | 70 +------------- 7 files changed, 388 insertions(+), 75 deletions(-) create mode 100644 Cxx11/nstream-stdpar.cc create mode 100644 Cxx11/transpose-stdpar.cc diff --git a/.gitignore b/.gitignore index 77276add0..81f301457 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,7 @@ Cxx11/dgemm-onemkl 
Cxx11/xgemm-onemkl Cxx11/dgemm-openmp Cxx11/dgemm-raja +Cxx11/dgemm-stdpar Cxx11/dgemm-sycl Cxx11/dgemm-vector Cxx11/dgemm-vector-raja @@ -145,6 +146,7 @@ Cxx11/nstream-openmp-target Cxx11/nstream-pstl Cxx11/nstream-raja Cxx11/nstream-ranges +Cxx11/nstream-stdpar Cxx11/nstream-stl Cxx11/nstream-sycl Cxx11/nstream-sycl-explicit @@ -211,6 +213,7 @@ Cxx11/stencil-openmp-target Cxx11/stencil-pstl Cxx11/stencil-raja Cxx11/stencil-ranges +Cxx11/stencil-stdpar Cxx11/stencil-stl Cxx11/stencil-sycl Cxx11/stencil-sycl-explicit @@ -240,6 +243,7 @@ Cxx11/transpose-openmp-target Cxx11/transpose-pstl Cxx11/transpose-raja Cxx11/transpose-ranges +Cxx11/transpose-stdpar Cxx11/transpose-stl Cxx11/transpose-sycl Cxx11/transpose-sycl-explicit diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 973abc3bb..584c66500 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,6 +50,7 @@ THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) SYCLFLAGS = $(SYCLFLAG) OPENACCFLAGS = $(OPENACCFLAG) +STDPARFLAGS = $(STDPARFLAG) #$(RANGEFLAGS) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -138,6 +139,8 @@ occa: transpose-occa nstream-occa openacc: p2p-hyperplane-openacc +stdpar: nstream-stdpar transpose-stdpar #stencil-stdpar p2p-stdpar + boost-compute: nstream-boost-compute # busted #nstream-valarray-boost-compute @@ -272,6 +275,9 @@ endif %-openacc: %-openacc.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(OPENACCFLAGS) -o $@ +%-stdpar: %-stdpar.cc prk_util.h + $(CXX) $(CXXFLAGS) $< $(STDPARFLAGS) -o $@ + %: %.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ @@ -308,6 +314,7 @@ clean: -rm -f *-stl -rm -f *-pstl -rm -f *-ranges + -rm -f *-stdpar -rm -f *-raja -rm -f *-kokkos -rm -f *-thrust diff --git a/Cxx11/nstream-stdpar.cc b/Cxx11/nstream-stdpar.cc new file mode 100644 index 000000000..4723f0a93 --- /dev/null +++ b/Cxx11/nstream-stdpar.cc @@ -0,0 +1,192 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors and +/// the length of the vectors. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#include +#include +#include + +#include + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/STDPAR STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time{0}; + + std::vector A(length); + std::vector B(length); + std::vector C(length); + + //auto range = prk::range(static_cast(0), length); + + double scalar(3); + + { + std::fill( std::begin(A), std::end(A), 0.0 ); + std::fill( std::begin(B), std::end(B), 2.0 ); + std::fill( std::begin(C), std::end(C), 2.0 ); + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + +#if 0 + // stupid version + std::transform( std::execution::par_unseq, + std::begin(A), std::end(A), std::begin(B), std::begin(A), + [](auto&& x, auto&& y) { + return x + y; // A[i] += B[i] + } + ); + std::transform( std::execution::par_unseq, + std::begin(A), std::end(A), std::begin(C), std::begin(A), + [scalar](auto&& x, auto&& y) { + return x + scalar * y; // A[i] += scalar * C[i] + } + ); +#else + auto nstream = [=] (thrust::tuple t) { + return thrust::get<0>(t) + thrust::get<1>(t) + scalar * thrust::get<2>(t); + }; + std::transform( thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())), + thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())), + 
A.begin(), + nstream); +#endif + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (int i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index b897df7ef..d316ed14c 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -48,6 +48,11 @@ # error You need a C++11 compiler or a newer C++ standard library. #endif +// weird issue with NVC++ 21.2 and GCC 10.2.1 (not officially supported) +#ifndef __GCC_ATOMIC_CHAR8_T_LOCK_FREE +#define __GCC_ATOMIC_CHAR8_T_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +#endif + #include #include #include // std::setprecision @@ -58,7 +63,6 @@ #include #include #include -#include #include #include #include // std::thread::hardware_concurrency diff --git a/Cxx11/transpose-stdpar.cc b/Cxx11/transpose-stdpar.cc new file mode 100644 index 000000000..8744aadb4 --- /dev/null +++ b/Cxx11/transpose-stdpar.cc @@ -0,0 +1,174 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#include +#include +#include +//#include +//#include + +#include + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/STDPAR Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + std::vector A(order*order); + std::vector B(order*order,0.0); + + // fill A with the sequence 0 to order^2-1 as doubles + std::iota(A.begin(), A.end(), 0.0); + + //auto range = std::views::iota(0,order); + + //std::vector range(order); + //std::iota(range.begin(), range.end(), 0); + + thrust::counting_iterator begin(0); + thrust::counting_iterator end(order*order); + + double trans_time{0}; + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + double * const pA = A.data(); + double * const pB = B.data(); + std::for_each( std::execution::par_unseq, + begin, end, + [order,pA,pB] __device__ (int idx) { + auto i = idx / order; + auto j = idx % order; + pB[i*order+j] += pA[j*order+i]; + pA[j*order+i] += 1.0; + }); + } + trans_time = prk::wtime() - trans_time; + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double addit = (iterations+1.) 
* (iterations/2.); + double abserr(0); + // TODO: replace with std::generate, std::accumulate, or similar + for (int j=0; j(ij)*(1.+iterations)+addit; + abserr += prk::abs(B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + diff --git a/common/make.defs.gcc b/common/make.defs.gcc index c8b14d6f2..36de132e8 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,16 +4,14 @@ # # Base compilers and language options # -LLVM_ROOT=/opt/homebrew/Cellar/llvm/11.1.0 -LLVM_PATH=${LLVM_ROOT}/bin/ -#CLANG_VERSION=-9 +VERSION=-10 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. FC=gfortran${VERSION} -std=f2008 -cpp -fexternal-blas -fblas-matmul-limit=0 # C++11 may not be required but does no harm here. -CXX=g++${VERSION} -std=gnu++17 -pthread +CXX=g++${VERSION} -std=gnu++20 -pthread # # Compiler flags # @@ -136,8 +134,8 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include #BOOSTFLAG=-I/usr/include/boost169 BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} #PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/gcc diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index ee1c86999..863119eec 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -22,10 +22,11 @@ OPENMPFLAG=-mp #OPENMPFLAG+=-Minfo=mp,vect OPENMPSIMDFLAG= OFFLOADFLAG=-mp -target=gpu -#OFFLOADFLAG+=-Minfo=accel +OFFLOADFLAG+=-Minfo=accel OFFLOADFLAG+=-DGPU_SCHEDULE="schedule(static,1)" OPENACCFLAG=-acc -target=gpu OPENACCFLAG+=-Mlarge_arrays +OPENACCFLAG+=-Minfo=accel STDPARFLAG=-stdpar -Minfo=accel # # OpenCL flags @@ -35,73 +36,6 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces # -# Metal (MacOS-only, unused) -# -#METALFLAG=-framework MetalPerformanceShaders -# -# OCCA -# -#OCCADIR=${HOME}/prk-repo/Cxx11/occa -# -# SYCL flags -# -# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -# -#SYCLDIR=/opt/isycl -#SYCLDIR=${HOME}/ISYCL/llvm/build -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-g -std=c++17 -O3 -#SYCLFLAG+=-fsycl -fsycl-unnamed-lambda -DDPCPP -#SYCLFLAG+=-L${SYCLDIR}/lib -lsycl -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-fsycl-targets=nvptx64-nvidia-cuda-sycldevice -DDPCPP_CUDA -###SYCLFLAG+=-Wno-unknown-cuda-version -# -# CodePlay ComputeCpp -# -#SYCLDIR=/opt/sycl/latest -#SYCLDIR=/opt/codeplay/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include 
-L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -#SYCLFLAG+=-std=c++14 -O3 -# This makes a huge difference in e.g. nstream... -#SYCLFLAG+=-no-serial-memop -# CentOS7 and Ubuntu14 built for this -#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -# PRK header rejects GCC4 -#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 -# If not found automatically -#SYCLFLAG+=${OPENCLFLAG} -# NVIDIA target -#SYCLFLAG+=-sycl-target ptx64 -#SYCLFLAG+=-DPRK_NO_OPENCL_GPU -# -# hipSYCL -# -#SYCLDIR=/opt/hipsycl/usr/local # if installed with DESTDIR -#SYCLDIR=/opt/hipSYCL -#SYCLDIR=/opt/spack/spack/opt/spack/linux-ubuntu18.04-haswell/gcc-8.3.0/hipsycl-master-appurj662qod4y4z5zxipr2fwthl66k7 -#SYCLCXX=${SYCLDIR}/bin/syclcc-clang -#SYCLFLAG=-std=c++17 -O3 -#SYCLFLAG+=-DHIPSYCL -# CPU platform -#SYCLFLAG+=--hipsycl-platform=cpu -#SYCLFLAG+=--hipsycl-platform=cuda -#SYCLFLAG+=--hipsycl-gpu-arch=sm_60 -#SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib # wrong? -#SYCLFLAG+=-Wl,-rpath=${SYCLDIR}/lib -# -#CELERITYDIR=${SYCLDIR} -#CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor -#CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime -# -# OCCA -# -#OCCADIR=${HOME}/prk-repo/Cxx11/occa -# -# TBB -# -TBBDIR= -TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # From 1b8f22ebf356972fa3ee302afdfa726fd354ba33 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 13:30:17 -0700 Subject: [PATCH 058/325] Fix CUDA transpose (#566) * fixed CUDA transpose * variant selection --- Cxx11/transpose-cuda.cu | 116 +++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index e2e2c1260..d7e0fcf64 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -1,6 +1,6 @@ /// /// Copyright (c) 2013, Intel Corporation -/// Copyright (c) 2015, NVIDIA CORPORATION. +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -56,38 +56,70 @@ #include "prk_util.h" #include "prk_cuda.h" -#define TILED 0 - -#if TILED -// The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu, -// which is the reason for the additional copyright noted above. 
+// The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu const int tile_dim = 32; const int block_rows = 8; -__global__ void transpose(int order, double * A, double * B) +__global__ void transposeNoBankConflict(int order, double * A, double * B) { + __shared__ double tile[tile_dim][tile_dim+1]; + auto x = blockIdx.x * tile_dim + threadIdx.x; auto y = blockIdx.y * tile_dim + threadIdx.y; auto width = gridDim.x * tile_dim; + for (int j = 0; j < tile_dim; j += block_rows) { + tile[threadIdx.y+j][threadIdx.x] = A[(y+j)*width + x]; + A[(y+j)*width + x] += (double)1; + } + + __syncthreads(); + + x = blockIdx.y * tile_dim + threadIdx.x; + y = blockIdx.x * tile_dim + threadIdx.y; + for (int j = 0; j < tile_dim; j+= block_rows) { - B[x*width + (y+j)] += A[(y+j)*width + x]; - A[(y+j)*width + x] += (double)1; + B[(y+j)*width + x] += tile[threadIdx.x][threadIdx.y + j]; } } -#else -__global__ void transpose(unsigned order, double * A, double * B) + +__global__ void transposeCoalesced(int order, double * A, double * B) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto j = blockIdx.y * blockDim.y + threadIdx.y; + __shared__ double tile[tile_dim][tile_dim]; + + auto x = blockIdx.x * tile_dim + threadIdx.x; + auto y = blockIdx.y * tile_dim + threadIdx.y; + auto width = gridDim.x * tile_dim; - if ((i vnames = {"naive", "coalesced", "no bank conflicts"}; int main(int argc, char * argv[]) { @@ -102,10 +134,10 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// int iterations; - int order, tile_size; + int order, variant; try { if (argc < 3) { - throw "Usage: <# iterations> "; + throw "Usage: <# iterations> [variant (0/1/2)]"; } iterations = std::atoi(argv[1]); @@ -120,23 +152,13 @@ int main(int argc, char * argv[]) throw "ERROR: matrix dimension too large - overflow risk"; } -#if TILED - if (order % tile_dim != 0) { - std::cout << "Sorry, but order (" << order << ") must be evenly divible by " << tile_dim - << " or the results are going to be wrong.\n"; - } -#else - // default tile size for tiling of local transpose - tile_size = 32; + variant = 2; // transposeNoBankConflicts if (argc > 3) { - tile_size = std::atoi(argv[3]); - if (tile_size <= 0) tile_size = order; - if (tile_size > order) tile_size = order; - if (tile_size > 32) { - std::cout << "The results are probably going to be wrong; use tile_size<=32.\n"; - } + variant = std::atoi(argv[3]); + } + if (variant < 0 || variant > 2) { + throw "Please select a valid variant (0: naive 1: coalesced, 2: no bank conflicts)"; } -#endif } catch (const char * e) { std::cout << e << std::endl; @@ -145,19 +167,10 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; -#if TILED - std::cout << "Tile size = " << tile_dim << std::endl; -#else - std::cout << "Tile size = " << tile_size << std::endl; -#endif + std::cout << "Variant = " << vnames[variant] << std::endl; -#if TILED dim3 dimGrid(order/tile_dim, order/tile_dim, 1); dim3 dimBlock(tile_dim, block_rows, 1); -#else - dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); - dim3 dimBlock(tile_size, tile_size, 1); -#endif info.checkDims(dimBlock, dimGrid); @@ -194,8 +207,13 @@ int main(int argc, char * argv[]) trans_time = prk::wtime(); } - transpose<<>>(order, d_a, d_b); - + if (variant==0) { + transposeNaive<<>>(order, d_a, d_b); + } else if (variant==1) 
{ + transposeCoalesced<<>>(order, d_a, d_b); + } else if (variant==2) { + transposeNoBankConflict<<>>(order, d_a, d_b); + } prk::CUDA::sync(); } trans_time = prk::wtime() - trans_time; @@ -228,9 +246,6 @@ int main(int argc, char * argv[]) std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - prk::CUDA::free_host(h_a); - prk::CUDA::free_host(h_b); - const double epsilon = 1.0e-8; if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; @@ -251,6 +266,9 @@ int main(int argc, char * argv[]) return 1; } + prk::CUDA::free_host(h_a); + prk::CUDA::free_host(h_b); + return 0; } From 04c06b7a16b4bf8e41bacba03fdf9e33ca7ff54d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 6 Apr 2021 08:39:35 -0700 Subject: [PATCH 059/325] fix bug in DAXPY with INC=0 (#567) --- Cxx11/transpose-cublas.cu | 53 ++++++++------------------------------- 1 file changed, 11 insertions(+), 42 deletions(-) diff --git a/Cxx11/transpose-cublas.cu b/Cxx11/transpose-cublas.cu index 6e43c0b6a..fd8f628a4 100644 --- a/Cxx11/transpose-cublas.cu +++ b/Cxx11/transpose-cublas.cu @@ -109,6 +109,7 @@ int main(int argc, char * argv[]) double * h_a = prk::CUDA::malloc_host(nelems); double * h_b = prk::CUDA::malloc_host(nelems); + double * h_o = prk::CUDA::malloc_host(1); // fill A with the sequence 0 to order^2-1 for (int j=0; j(0); } } + h_o[0] = 1; // copy input from host to device double * d_a = prk::CUDA::malloc_device(nelems); double * d_b = prk::CUDA::malloc_device(nelems); + double * d_o = prk::CUDA::malloc_device(1); prk::CUDA::copyH2D(d_a, h_a, nelems); prk::CUDA::copyH2D(d_b, h_b, nelems); - -#if CUBLAS_AXPY_BUG - // We need a vector of ones because CUBLAS daxpy does not - // correctly implement incx=0. - double * h_o = prk::CUDA::malloc_host(nelems); - for (int j=0; j(nelems); - prk::CUDA::copyH2D(d_o, h_o, nelems); -#endif - - double * p_a = d_a; - double * p_b = d_b; -#if CUBLAS_AXPY_BUG - double * p_o = d_o; -#endif + prk::CUDA::copyH2D(d_o, h_o, 1); double trans_time{0}; @@ -158,42 +143,26 @@ int main(int argc, char * argv[]) prk::CUDA::check( cublasDgeam(h, CUBLAS_OP_T, CUBLAS_OP_N, // opA, opB order, order, // m, n - &one, p_a, order, // alpha, A, lda - &one, p_b, order, // beta, B, ldb - p_b, order) ); // C, ldc (in-place for B) + &one, d_a, order, // alpha, A, lda + &one, d_b, order, // beta, B, ldb + d_b, order) ); // C, ldc (in-place for B) // A += 1.0 i.e. A = 1.0 * 1.0 + A -#if CUBLAS_AXPY_BUG - // THIS IS CORRECT - prk::CUDA::check( cublasDaxpy(h, - order*order, // n - &one, // alpha - p_o, 1, // x, incx - p_a, 1) ); // y, incy -#else - // THIS IS BUGGY prk::CUDA::check( cublasDaxpy(h, order*order, // n &one, // alpha - &one, 0, // x, incx - p_a, 1) ); // y, incy -#endif - // (Host buffer version) - // The performance is ~10% better if this is done every iteration, - // instead of only once before the timer is stopped. 
+ d_o, 0, // x, incx + d_a, 1) ); // y, incy prk::CUDA::sync(); } trans_time = prk::wtime() - trans_time; prk::CUDA::copyD2H(h_b, d_b, nelems); -#if CUBLAS_AXPY_BUG - prk::CUDA::free(d_o); - prk::CUDA::free_host(h_o); -#endif - prk::CUDA::free(d_a); prk::CUDA::free(d_b); + prk::CUDA::free(d_o); + prk::CUDA::free_host(h_o); prk::CUDA::check( cublasDestroy(h) ); //prk::CUDA::check( cublasShutdown() ); From b94fdb45dcfa0ce3d57e468ea7b85885b40e8cc4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 8 Apr 2021 16:00:10 -0700 Subject: [PATCH 060/325] tiling implementation is bad --- FORTRAN/transpose-openmp-target.F90 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index fa464130e..344a4c34a 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -65,7 +65,7 @@ program main integer(kind=INT32) :: order ! order of a the matrix real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix - real(kind=REAL64) :: T(32,32) ! Tile + !real(kind=REAL64) :: T(32,32) ! Tile integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k @@ -165,18 +165,18 @@ program main !$omp target teams distribute collapse(2) do jt=1,order,tile_size do it=1,order,tile_size + !!$omp parallel do simd collapse(2) schedule(static,1) + !do j=1,tile_size + ! do i=1,tile_size + ! T(i,j) = A(it+i-1,jt+j-1) + ! enddo + !enddo + !!$omp end parallel do simd !$omp parallel do simd collapse(2) schedule(static,1) do j=1,tile_size do i=1,tile_size - T(i,j) = A(it+i-1,jt+j-1) - enddo - enddo - !$omp end parallel do simd - !$omp parallel do simd collapse(2) schedule(static,1) - do j=1,tile_size - do i=1,tile_size - !B(jt+j-1,it+i-1) = B(jt+j-1,it+i-1) + A(it+i-1,jt+j-1) - B(jt+j-1,it+i-1) = B(jt+j-1,it+i-1) + T(i,j) + B(jt+j-1,it+i-1) = B(jt+j-1,it+i-1) + A(it+i-1,jt+j-1) + !B(jt+j-1,it+i-1) = B(jt+j-1,it+i-1) + T(i,j) A(it+i-1,jt+j-1) = A(it+i-1,jt+j-1) + 1.0 enddo enddo From 576297fa373d0bafa1e5877954b49a9a6c8df985 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 9 Apr 2021 10:13:51 -0700 Subject: [PATCH 061/325] add Cray temps (#568) --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 81f301457..eb075285b 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,11 @@ octave-workspace # Octave crashes *.patch # patch files *.dbg # Flang +# Cray GPU compiler +*.lst +*.cub +*.ptx + common/make.defs scripts/small/runfgmpi scripts/wide/runfgmpi From 8cc665554bf4ff98ec18a7b2fbe3a55405560581 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 08:18:45 -0700 Subject: [PATCH 062/325] updates for CUDA --- Cxx11/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 584c66500..a9d5667e9 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,7 +50,7 @@ THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) SYCLFLAGS = $(SYCLFLAG) OPENACCFLAGS = $(OPENACCFLAG) -STDPARFLAGS = $(STDPARFLAG) #$(RANGEFLAGS) +STDPARFLAGS = $(STDPARFLAG) $(RANGEFLAGS) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -251,10 +251,10 @@ endif $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@ %-mpi-cublas: %-mpi-cublas.cu prk_util.h prk_cuda.h prk_mpi.h - $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(MPIINC) -DPRK_USE_CUBLAS $< -lcublas 
$(MPILIB) -o $@ + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(MPIINC) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt $(MPILIB) -o $@ %-cublas: %-cublas.cu prk_util.h prk_cuda.h - $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) -DPRK_USE_CUBLAS $< -lcublas -o $@ + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt -o $@ %-hip: %-hip.cc prk_util.h prk_hip.h $(HIPCC) $(HIPFLAGS) $(CPPFLAGS) $< -o $@ From 6c4ff221aaff71fb6211b5554287f9061c22828f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 08:19:07 -0700 Subject: [PATCH 063/325] allow no tiling --- FORTRAN/transpose-stdpar.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 02ce93352..3d5ffefd2 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -124,11 +124,11 @@ program main tile_size = order ! no tiling endif - if (mod(order,tile_size).ne.0) then + if ((tile_size.gt.0).and.(mod(order,tile_size).ne.0)) then write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' stop 1 endif - if (tile_size.gt.32) then + if ((tile_size.ne.order).and.(tile_size.gt.32)) then write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' stop 1 endif From dd0c72a84530a2b522fbbbb1d981448e238236ba Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 08:26:41 -0700 Subject: [PATCH 064/325] T should be team private --- FORTRAN/transpose-openmp-target.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index fa464130e..20656ddc4 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -162,7 +162,7 @@ program main if (k.eq.1) t0 = omp_get_wtime() if (tile_size.lt.order) then - !$omp target teams distribute collapse(2) + !$omp target teams distribute collapse(2) private(T) do jt=1,order,tile_size do it=1,order,tile_size !$omp parallel do simd collapse(2) schedule(static,1) From 4996457044f2c481e34070f5ddf0b658d3dfe3e1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 08:18:45 -0700 Subject: [PATCH 065/325] updates for CUDA --- Cxx11/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 584c66500..a9d5667e9 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,7 +50,7 @@ THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) SYCLFLAGS = $(SYCLFLAG) OPENACCFLAGS = $(OPENACCFLAG) -STDPARFLAGS = $(STDPARFLAG) #$(RANGEFLAGS) +STDPARFLAGS = $(STDPARFLAG) $(RANGEFLAGS) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -251,10 +251,10 @@ endif $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@ %-mpi-cublas: %-mpi-cublas.cu prk_util.h prk_cuda.h prk_mpi.h - $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(MPIINC) -DPRK_USE_CUBLAS $< -lcublas $(MPILIB) -o $@ + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(MPIINC) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt $(MPILIB) -o $@ %-cublas: %-cublas.cu prk_util.h prk_cuda.h - $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) -DPRK_USE_CUBLAS $< -lcublas -o $@ + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt -o $@ %-hip: %-hip.cc prk_util.h prk_hip.h $(HIPCC) $(HIPFLAGS) $(CPPFLAGS) $< -o $@ From 2dce5da0ddd5a44fb1b7b6889306e68f3802d096 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 08:19:07 -0700 Subject: [PATCH 066/325] allow no tiling --- FORTRAN/transpose-stdpar.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 02ce93352..3d5ffefd2 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -124,11 +124,11 @@ program main tile_size = order ! no tiling endif - if (mod(order,tile_size).ne.0) then + if ((tile_size.gt.0).and.(mod(order,tile_size).ne.0)) then write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' stop 1 endif - if (tile_size.gt.32) then + if ((tile_size.ne.order).and.(tile_size.gt.32)) then write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' stop 1 endif From e2f0b74ca8362e9847e48b606f0896edec10ee64 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 08:26:41 -0700 Subject: [PATCH 067/325] T should be team private --- FORTRAN/transpose-openmp-target.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 344a4c34a..5e953e8f8 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -162,7 +162,7 @@ program main if (k.eq.1) t0 = omp_get_wtime() if (tile_size.lt.order) then - !$omp target teams distribute collapse(2) + !$omp target teams distribute collapse(2) private(T) do jt=1,order,tile_size do it=1,order,tile_size !!$omp parallel do simd collapse(2) schedule(static,1) From f29ecc6bd1a8cfccdf9298b93ca154c866250fa1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 11:38:11 -0700 Subject: [PATCH 068/325] add CUDA Fortran --- FORTRAN/Makefile | 6 + FORTRAN/nstream-cufortran.cuf | 245 ++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 FORTRAN/nstream-cufortran.cuf diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 87c1b9981..b5abd0cc5 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -78,6 +78,9 @@ target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgem openacc: p2p-openacc p2p-innerloop-openacc stencil-openacc transpose-openacc nstream-openacc +cuf: cufortran +cufortran: nstream-cufortran + stdpar: nstream-stdpar stencil-stdpar transpose-stdpar blas: dgemm-blas @@ -122,6 +125,9 @@ dgemm-blas: dgemm-blas.F90 %-openacc: %-openacc.F90 $(FC) $(FCFLAGS) $(OPENACCFLAG) $< -o $@ +%-cufortran: %-cufortran.cuf + $(FC) $(FCFLAGS) $(CUFORTFLAG) $< -o $@ + %-stdpar: %-stdpar.F90 $(FC) $(FCFLAGS) $(STDPARFLAG) $< -o $@ diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf new file mode 100644 index 000000000..c8c18190b --- /dev/null +++ b/FORTRAN/nstream-cufortran.cuf @@ -0,0 +1,245 @@ +! +! Copyright (c) 2017, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! 
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: nstream +! +! PURPOSE: To compute memory bandwidth when adding a vector of a given +! number of double precision values to the scalar multiple of +! another vector of the same length, and storing the result in +! a third vector. +! +! USAGE: The program takes as input the number +! of iterations to loop over the triad vectors, the length of the +! vectors, and the block_size for the GPU. +! +! <# iterations> +! +! The output consists of diagnostics to make sure the +! algorithm worked, and of timing statistics. +! +! NOTES: Bandwidth is determined as the number of words read, plus the +! number of words written, times the size of the words, divided +! by the execution time. For a vector length of N, the total +! number of words read and written is 4*N*sizeof(double). +! +! +! HISTORY: This code is loosely based on the Stream benchmark by John +! McCalpin, but does not follow all the Stream rules. Hence, +! reported results should not be associated with Stream in +! external publications +! +! Converted to C++11 by Jeff Hammond, November May 2017. +! +! ******************************************************************* + +function prk_get_wtime() result(t) + use iso_fortran_env + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +end function prk_get_wtime + +module kernels +contains + attributes(global) subroutine nstream(scalar, A, B, C) + use iso_fortran_env + implicit none + real(kind=REAL64), intent(inout) :: A(:) + real(kind=REAL64), intent(in) :: B(:), C(:) + real(kind=REAL64), intent(in), value :: scalar + integer :: i, n + n = size(A) + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= n) then + A(i) = A(i) + B(i) + scalar * C(i) + endif + end subroutine nstream +end module kernels + +program main + use iso_fortran_env + use cudafor + use kernels + implicit none + real(kind=REAL64) :: prk_get_wtime + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations, block_size + integer(kind=INT64) :: length + real(kind=REAL64), allocatable, managed :: A(:) + real(kind=REAL64), allocatable, managed :: B(:) + real(kind=REAL64), allocatable, managed :: C(:) + real(kind=REAL64) :: scalar + integer(kind=INT64) :: bytes + ! runtime variables + integer(kind=INT64) :: i + integer(kind=INT32) :: k + real(kind=REAL64) :: asum, ar, br, cr + real(kind=REAL64) :: t0, t1, nstream_time, avgtime + real(kind=REAL64), parameter :: epsilon=1.D-8 + ! CUDA stuff + type(dim3) :: grid, tblock + + ! ******************************************************************** + ! read and test input parameters + ! 
******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a45)') 'CUDA Fortran STREAM triad: A = B + scalar * C' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a64)') 'Usage: ./transpose <# iterations> []' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + length = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') length + if (length .lt. 1) then + write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length + stop 1 + endif + + block_size = 256 + if (command_argument_count().gt.2) then + call get_command_argument(3,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') block_size + if (block_size .lt. 0) then + write(*,'(a,i5)') 'ERROR: block_size must be positive : ', block_size + stop 1 + endif + endif + + write(*,'(a,i12)') 'Number of iterations = ', iterations + write(*,'(a,i12)') 'Vector length = ', length + write(*,'(a,i12)') 'GPU block size = ', block_size + + tblock = dim3(block_size,1,1) + grid = dim3(ceiling(real(length)/tblock%x),1,1) + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! ******************************************************************** + + allocate( A(length), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of A returned ',err + stop 1 + endif + + allocate( B(length), stat=err ) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of B returned ',err + stop 1 + endif + + allocate( C(length), stat=err ) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of C returned ',err + stop 1 + endif + + scalar = 3 + + t0 = 0 + + do i=1,length + A(i) = 0 + B(i) = 2 + C(i) = 2 + enddo + + do k=0,iterations + + if (k.eq.1) t0 = prk_get_wtime() + + call nstream<<>>(scalar, A, B, C) + + enddo ! iterations + + t1 = prk_get_wtime() + + nstream_time = t1 - t0 + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + ar = 0 + br = 2 + cr = 2 + do k=0,iterations + ar = ar + br + scalar * cr; + enddo + + asum = 0 + do i=1,length + asum = asum + abs(A(i)-ar) + enddo + + deallocate( C ) + deallocate( B ) + deallocate( A ) + + if (abs(asum) .gt. 
epsilon) then + write(*,'(a35)') 'Failed Validation on output array' + write(*,'(a30,f30.15)') ' Expected value: ', ar + write(*,'(a30,f30.15)') ' Observed value: ', A(1) + write(*,'(a35)') 'ERROR: solution did not validate' + stop 1 + else + write(*,'(a17)') 'Solution validates' + avgtime = nstream_time/iterations; + bytes = 4 * int(length,INT64) * storage_size(A)/8 + write(*,'(a12,f15.3,1x,a12,e15.6)') & + 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & + 'Avg time (s): ', avgtime + endif + +end program main + From 4ddd109b6258324addd55478c494006451e2d8f3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 11:38:54 -0700 Subject: [PATCH 069/325] add CUDA Fortran --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index eb075285b..2aa16a256 100644 --- a/.gitignore +++ b/.gitignore @@ -261,6 +261,7 @@ Cxx11/transpose-thread Cxx11/transpose-valarray Cxx11/transpose-vector Cxx11/transpose-vector-raja +FORTRAN/*.mod FORTRAN/dgemm FORTRAN/dgemm-ga FORTRAN/dgemm-openmp @@ -269,6 +270,7 @@ FORTRAN/dgemm-pretty FORTRAN/dgemm-taskloop-openmp FORTRAN/nstream FORTRAN/nstream-coarray +FORTRAN/nstream-cufortran FORTRAN/nstream-ga FORTRAN/nstream-openmp FORTRAN/nstream-mpi From 8bfd72855ea7481faae5805fc7fad09047640cee Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 14:41:27 -0700 Subject: [PATCH 070/325] OpenACC collapse works now --- FORTRAN/transpose-openacc.F90 | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 907cf5111..5745e1b5c 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -145,9 +145,8 @@ program main t0 = 0 if (tile_size.lt.order) then - !$acc parallel loop gang ! collapse(2) leads to incorrect results + !$acc parallel loop gang collapse(2) !leads to incorrect results do jt=1,order,tile_size - !$acc loop do it=1,order,tile_size !$acc loop vector collapse(2) do j=jt,min(order,jt+tile_size-1) @@ -159,9 +158,8 @@ program main enddo enddo else - !$acc parallel loop gang + !$acc parallel loop collapse(2) do j=1,order - !$acc loop vector do i=1,order A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 @@ -176,9 +174,8 @@ program main ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then - !$acc parallel loop gang ! collapse(2) leads to incorrect results + !$acc parallel loop gang collapse(2) !leads to incorrect results do jt=1,order,tile_size - !$acc loop do it=1,order,tile_size !$acc loop vector collapse(2) do j=jt,min(order,jt+tile_size-1) @@ -190,9 +187,8 @@ program main enddo enddo else - !$acc parallel loop gang + !$acc parallel loop collapse(2) do j=1,order - !$acc loop vector do i=1,order B(j,i) = B(j,i) + A(i,j) A(i,j) = A(i,j) + 1.0 @@ -215,9 +211,8 @@ program main abserr = 0.0 ! 
this will overflow if iterations>>1000 addit = (0.5*iterations) * (iterations+1) - !$acc parallel loop reduction(+:abserr) + !$acc parallel loop collapse(2) reduction(+:abserr) do j=1,order - !$acc loop reduction(+:abserr) do i=1,order temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & * real(iterations+1,REAL64) From 3cc2c5cb509f90fa1f66714bbddf9d1bd82a60f0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 11:48:23 -0700 Subject: [PATCH 071/325] set default tilesize to 16 and fix untiled --- FORTRAN/transpose-stdpar.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 3d5ffefd2..4f9a93aa3 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -113,7 +113,7 @@ program main endif ! same default as the C implementation - tile_size = 32 + tile_size = 16 if (command_argument_count().gt.2) then call get_command_argument(3,argtmp,arglen,err) if (err.eq.0) read(argtmp,'(i32)') tile_size @@ -128,7 +128,7 @@ program main write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' stop 1 endif - if ((tile_size.ne.order).and.(tile_size.gt.32)) then + if ((tile_size.ne.order) .and. (tile_size.gt.32)) then write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' stop 1 endif From 4fabf07adf14e212d6c7d92abb9a68600bed4d15 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 5 Apr 2021 11:51:53 -0700 Subject: [PATCH 072/325] bug has been fixed --- FORTRAN/transpose-openacc.F90 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 5745e1b5c..af6798745 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -145,7 +146,7 @@ program main t0 = 0 if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) !leads to incorrect results + !$acc parallel loop gang collapse(2) do jt=1,order,tile_size do it=1,order,tile_size !$acc loop vector collapse(2) @@ -174,7 +175,7 @@ program main ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) !leads to incorrect results + !$acc parallel loop gang collapse(2) do jt=1,order,tile_size do it=1,order,tile_size !$acc loop vector collapse(2) From 0d4e61ccea76a06bbea6d17d52367dc1b9d9a50b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 11:47:48 -0700 Subject: [PATCH 073/325] update toolchain example --- common/make.defs.nvhpc | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 863119eec..a1b2040af 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,15 +1,16 @@ # # This file shows the NVHPC toolchain options. -NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_x86_64/2021 +#NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_x86_64/2021 +#NVHPC_CBIN=${NVHPC_PATH}/compilers/bin # # Base compilers and language options # # C99 is required in some implementations. -CC=${NVHPC_PATH}/compilers/bin/nvc -c11 +CC=${NVHPC_CBIB}nvc -c11 # All of the Fortran code is written for the 2008 standard and requires preprocessing. 
-FC=${NVHPC_PATH}/compilers/bin/nvfortran -acc -gpu=managed -cuda -cudalib -target=gpu -DNVHPC +FC=${NVHPC_CBIN}nvfortran -DNVHPC # C++11 may not be required but does no harm here. -CXX=${NVHPC_PATH}/compilers/bin/nvc++ --c++17 +CXX=${NVHPC_CBIN}nvc++ -std=gnu++20 # # Compiler flags # @@ -21,13 +22,17 @@ DEFAULT_OPT_FLAGS+=-Wall #-Werror OPENMPFLAG=-mp #OPENMPFLAG+=-Minfo=mp,vect OPENMPSIMDFLAG= -OFFLOADFLAG=-mp -target=gpu +OFFLOADFLAG=-mp -target=gpu -gpu=managed OFFLOADFLAG+=-Minfo=accel OFFLOADFLAG+=-DGPU_SCHEDULE="schedule(static,1)" OPENACCFLAG=-acc -target=gpu OPENACCFLAG+=-Mlarge_arrays OPENACCFLAG+=-Minfo=accel -STDPARFLAG=-stdpar -Minfo=accel +STDPARFLAG=-stdpar -gpu=managed +STDPARFLAG+=-Minfo=accel +STDPARFLAG+=-cudalib=cublas,cutensor +CUFORTFLAG=-cuda -gpu=managed -acc # ACC required for CUF+managed +CUFORTFLAG+=-Minfo=accel # # OpenCL flags # @@ -39,16 +44,16 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL # # Parallel STL, Boost, etc. # -BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +BOOSTFLAG= +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} KOKKOSDIR= KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR= RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} -THRUSTDIR=/opt/nvidia/thrust -THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +THRUSTDIR=../deps/thrust +THRUSTFLAG=-I${THRUSTDIR} # # CBLAS for C++ DGEMM # @@ -59,12 +64,16 @@ CBLASFLAG= # # Linux w/ NVIDIA CUDA # Use appropriate arch or code is compiled to ancient features. -NVCC=${NVHPC_PATH}/compilers/bin/nvcc -CUDAFLAGS=-g -O3 -std=c++11 +#NVCC=${NVHPC_CBIN}nvc++ +NVCC=${NVHPC_CBIN}nvcc +CUDAFLAGS=-g -O3 -std=c++14 +CUDAFLAGS+=--extended-lambda CUDAFLAGS+=--gpu-architecture=sm_70 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' +CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/11.2/targets/x86_64-linux/include +CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/11.2/targets/x86_64-linux/lib # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 # heavy hammer: CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED @@ -92,6 +101,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
MPIDIR=${NVHPC_PATH}/comm_libs/openmpi/openmpi-3.1.5 +#MPIDIR=${NVHPC_PATH}/comm_libs/openmpi4/openmpi-4.0.5 MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIINC=-I${MPIDIR}/include From f4c91524552658c12a0193fe7c8010afeefb30a9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 10:43:35 -0700 Subject: [PATCH 074/325] remove unnecessary argument --- Cxx11/transpose-cuda.cu | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index d7e0fcf64..2495a7117 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -67,11 +67,10 @@ __global__ void transposeNoBankConflict(int order, double * A, double * B) auto x = blockIdx.x * tile_dim + threadIdx.x; auto y = blockIdx.y * tile_dim + threadIdx.y; - auto width = gridDim.x * tile_dim; for (int j = 0; j < tile_dim; j += block_rows) { - tile[threadIdx.y+j][threadIdx.x] = A[(y+j)*width + x]; - A[(y+j)*width + x] += (double)1; + tile[threadIdx.y+j][threadIdx.x] = A[(y+j)*order + x]; + A[(y+j)*order + x] += (double)1; } __syncthreads(); @@ -80,7 +79,7 @@ __global__ void transposeNoBankConflict(int order, double * A, double * B) y = blockIdx.x * tile_dim + threadIdx.y; for (int j = 0; j < tile_dim; j+= block_rows) { - B[(y+j)*width + x] += tile[threadIdx.x][threadIdx.y + j]; + B[(y+j)*order + x] += tile[threadIdx.x][threadIdx.y + j]; } } @@ -90,11 +89,10 @@ __global__ void transposeCoalesced(int order, double * A, double * B) auto x = blockIdx.x * tile_dim + threadIdx.x; auto y = blockIdx.y * tile_dim + threadIdx.y; - auto width = gridDim.x * tile_dim; for (int j = 0; j < tile_dim; j += block_rows) { - tile[threadIdx.y+j][threadIdx.x] = A[(y+j)*width + x]; - A[(y+j)*width + x] += (double)1; + tile[threadIdx.y+j][threadIdx.x] = A[(y+j)*order + x]; + A[(y+j)*order + x] += (double)1; } __syncthreads(); @@ -103,7 +101,7 @@ __global__ void transposeCoalesced(int order, double * A, double * B) y = blockIdx.x * tile_dim + threadIdx.y; for (int j = 0; j < tile_dim; j+= block_rows) { - B[(y+j)*width + x] += tile[threadIdx.x][threadIdx.y + j]; + B[(y+j)*order + x] += tile[threadIdx.x][threadIdx.y + j]; } } @@ -111,11 +109,10 @@ __global__ void transposeNaive(int order, double * A, double * B) { auto x = blockIdx.x * tile_dim + threadIdx.x; auto y = blockIdx.y * tile_dim + threadIdx.y; - auto width = gridDim.x * tile_dim; for (int j = 0; j < tile_dim; j+= block_rows) { - B[x*width + (y+j)] += A[(y+j)*width + x]; - A[(y+j)*width + x] += (double)1; + B[x*order + (y+j)] += A[(y+j)*order + x]; + A[(y+j)*order + x] += (double)1; } } From 19e5bbf2d672412abe6a255460d5b205b1f8de08 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 20:50:56 -0700 Subject: [PATCH 075/325] fix comments --- FORTRAN/nstream-coarray.F90 | 2 +- FORTRAN/nstream-cufortran.cuf | 2 +- FORTRAN/nstream-ga.F90 | 2 +- FORTRAN/nstream-mpi.F90 | 2 +- FORTRAN/nstream-openacc.F90 | 2 +- FORTRAN/nstream-openmp-target.F90 | 2 +- FORTRAN/nstream-openmp.F90 | 2 +- FORTRAN/nstream-pretty.F90 | 2 +- FORTRAN/nstream-stdpar.F90 | 2 +- FORTRAN/nstream-taskloop-openmp.F90 | 2 +- FORTRAN/nstream.F90 | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/FORTRAN/nstream-coarray.F90 b/FORTRAN/nstream-coarray.F90 index f113ff566..78060d587 100644 --- a/FORTRAN/nstream-coarray.F90 +++ b/FORTRAN/nstream-coarray.F90 @@ -152,7 +152,7 @@ program main endif ! ******************************************************************** - ! 
** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length)[*], stat=err) diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf index c8c18190b..173d438ca 100644 --- a/FORTRAN/nstream-cufortran.cuf +++ b/FORTRAN/nstream-cufortran.cuf @@ -163,7 +163,7 @@ program main grid = dim3(ceiling(real(length)/tblock%x),1,1) ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-ga.F90 b/FORTRAN/nstream-ga.F90 index eaa3ef97a..bc22cc311 100644 --- a/FORTRAN/nstream-ga.F90 +++ b/FORTRAN/nstream-ga.F90 @@ -181,7 +181,7 @@ program main call ga_sync() ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** t0 = 0.0d0 diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 1b6abc63b..fad8550b4 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -158,7 +158,7 @@ program main call MPI_Bcast(length, 1, MPI_INT64_T, 0, MPI_COMM_WORLD) ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-openacc.F90 b/FORTRAN/nstream-openacc.F90 index e3981bde2..6281e718c 100644 --- a/FORTRAN/nstream-openacc.F90 +++ b/FORTRAN/nstream-openacc.F90 @@ -139,7 +139,7 @@ program main write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-openmp-target.F90 b/FORTRAN/nstream-openmp-target.F90 index cf5249d0c..9d45ec335 100644 --- a/FORTRAN/nstream-openmp-target.F90 +++ b/FORTRAN/nstream-openmp-target.F90 @@ -120,7 +120,7 @@ program main write(*,'(a,i12)') 'Matrix length = ', length ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-openmp.F90 b/FORTRAN/nstream-openmp.F90 index e84f2d471..0057ec3ea 100644 --- a/FORTRAN/nstream-openmp.F90 +++ b/FORTRAN/nstream-openmp.F90 @@ -131,7 +131,7 @@ program main write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! 
******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-pretty.F90 b/FORTRAN/nstream-pretty.F90 index c0068148d..5f414a933 100644 --- a/FORTRAN/nstream-pretty.F90 +++ b/FORTRAN/nstream-pretty.F90 @@ -138,7 +138,7 @@ program main write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-stdpar.F90 b/FORTRAN/nstream-stdpar.F90 index d715368ea..4857fb466 100644 --- a/FORTRAN/nstream-stdpar.F90 +++ b/FORTRAN/nstream-stdpar.F90 @@ -139,7 +139,7 @@ program main write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream-taskloop-openmp.F90 b/FORTRAN/nstream-taskloop-openmp.F90 index 476f772ae..0a235444c 100644 --- a/FORTRAN/nstream-taskloop-openmp.F90 +++ b/FORTRAN/nstream-taskloop-openmp.F90 @@ -131,7 +131,7 @@ program main write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! ******************************************************************** allocate( A(length), stat=err) diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90 index 0d0aafb6a..963b2f996 100644 --- a/FORTRAN/nstream.F90 +++ b/FORTRAN/nstream.F90 @@ -139,7 +139,7 @@ program main write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** - ! ** Allocate space for the input and transpose matrix + ! ** Allocate space and perform the computation ! 
******************************************************************** allocate( A(length), stat=err) From 7d888e424bf7b89cedc87e57245a092c46e2e861 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 20:55:23 -0700 Subject: [PATCH 076/325] fix comments and target tile bug --- FORTRAN/nstream-coarray.F90 | 2 +- FORTRAN/nstream-cufortran.cuf | 2 +- FORTRAN/nstream-ga.F90 | 2 +- FORTRAN/nstream-mpi.F90 | 2 +- FORTRAN/nstream-openacc.F90 | 2 +- FORTRAN/nstream-openmp-target.F90 | 2 +- FORTRAN/nstream-openmp.F90 | 2 +- FORTRAN/nstream-pretty.F90 | 2 +- FORTRAN/nstream-stdpar.F90 | 2 +- FORTRAN/nstream-taskloop-openmp.F90 | 2 +- FORTRAN/nstream.F90 | 2 +- FORTRAN/transpose-openmp-target.F90 | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/FORTRAN/nstream-coarray.F90 b/FORTRAN/nstream-coarray.F90 index 78060d587..5e2fa9259 100644 --- a/FORTRAN/nstream-coarray.F90 +++ b/FORTRAN/nstream-coarray.F90 @@ -112,7 +112,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a49)') 'Usage: ./transpose <# iterations> ' + write(*,'(a49)') 'Usage: ./nstream <# iterations> ' error stop 1 endif diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf index 173d438ca..ac716c1ff 100644 --- a/FORTRAN/nstream-cufortran.cuf +++ b/FORTRAN/nstream-cufortran.cuf @@ -125,7 +125,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a64)') 'Usage: ./transpose <# iterations> []' + write(*,'(a64)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream-ga.F90 b/FORTRAN/nstream-ga.F90 index bc22cc311..1838dd594 100644 --- a/FORTRAN/nstream-ga.F90 +++ b/FORTRAN/nstream-ga.F90 @@ -114,7 +114,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index fad8550b4..43849e4ee 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -127,7 +127,7 @@ program main if (command_argument_count().lt.2) then if (me.eq.0) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a49)') 'Usage: ./transpose <# iterations> ' + write(*,'(a49)') 'Usage: ./nstream <# iterations> ' endif call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) endif diff --git a/FORTRAN/nstream-openacc.F90 b/FORTRAN/nstream-openacc.F90 index 6281e718c..c04364a96 100644 --- a/FORTRAN/nstream-openacc.F90 +++ b/FORTRAN/nstream-openacc.F90 @@ -104,7 +104,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream-openmp-target.F90 b/FORTRAN/nstream-openmp-target.F90 index 9d45ec335..8b16e5a71 100644 --- a/FORTRAN/nstream-openmp-target.F90 +++ b/FORTRAN/nstream-openmp-target.F90 @@ -95,7 +95,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + write(*,'(a62)') 'Usage: ./nstream <# iterations> ' stop 1 endif diff --git a/FORTRAN/nstream-openmp.F90 b/FORTRAN/nstream-openmp.F90 index 
0057ec3ea..052b3005d 100644 --- a/FORTRAN/nstream-openmp.F90 +++ b/FORTRAN/nstream-openmp.F90 @@ -95,7 +95,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream-pretty.F90 b/FORTRAN/nstream-pretty.F90 index 5f414a933..59205a6e7 100644 --- a/FORTRAN/nstream-pretty.F90 +++ b/FORTRAN/nstream-pretty.F90 @@ -103,7 +103,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream-stdpar.F90 b/FORTRAN/nstream-stdpar.F90 index 4857fb466..43c0e442f 100644 --- a/FORTRAN/nstream-stdpar.F90 +++ b/FORTRAN/nstream-stdpar.F90 @@ -104,7 +104,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream-taskloop-openmp.F90 b/FORTRAN/nstream-taskloop-openmp.F90 index 0a235444c..716d5a83a 100644 --- a/FORTRAN/nstream-taskloop-openmp.F90 +++ b/FORTRAN/nstream-taskloop-openmp.F90 @@ -95,7 +95,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90 index 963b2f996..86bc57814 100644 --- a/FORTRAN/nstream.F90 +++ b/FORTRAN/nstream.F90 @@ -104,7 +104,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + write(*,'(a62)') 'Usage: ./nstream <# iterations> []' stop 1 endif diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 5e953e8f8..7663876de 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -162,7 +162,7 @@ program main if (k.eq.1) t0 = omp_get_wtime() if (tile_size.lt.order) then - !$omp target teams distribute collapse(2) private(T) + !$omp target teams distribute collapse(2) !!! private(T) do jt=1,order,tile_size do it=1,order,tile_size !!$omp parallel do simd collapse(2) schedule(static,1) From 5d8c4008fbb27468fd7b26f9c81e5650a053b13b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 10 Apr 2021 21:44:31 -0700 Subject: [PATCH 077/325] compiles but untested --- FORTRAN/transpose-cufortran.cuf | 211 ++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 FORTRAN/transpose-cufortran.cuf diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf new file mode 100644 index 000000000..f9bf68673 --- /dev/null +++ b/FORTRAN/transpose-cufortran.cuf @@ -0,0 +1,211 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 
* Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! +! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! ******************************************************************* + +function prk_get_wtime() result(t) + use iso_fortran_env + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +end function prk_get_wtime + +module kernels + use iso_fortran_env + integer(kind=INT32), parameter :: tile_dim = 32 + integer(kind=INT32), parameter :: block_rows = 8 + contains + attributes(global) subroutine transposeNaive(order, A, B) + use iso_fortran_env + implicit none + real(kind=REAL64), intent(inout) :: A(:,:), B(:,:) + integer(kind=INT32), intent(in), value :: order + integer :: x, y, j + x = blockIdx%x * tile_dim + threadIdx%x; + y = blockIdx%y * tile_dim + threadIdx%y; + do j = 1,tile_dim,block_rows + B(x,y+j) = B(x,y+j) + A(y+j,x); + A(y+j,x) = A(y+j,x) + 1; + end do + end subroutine transposeNaive +end module kernels + +program main + use iso_fortran_env + use cudafor + use kernels + implicit none + real(kind=REAL64) :: prk_get_wtime + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations ! number of times to do the transpose + integer(kind=INT32) :: order ! order of a the matrix + real(kind=REAL64), allocatable, managed :: A(:,:)! buffer to hold original matrix + real(kind=REAL64), allocatable, managed :: B(:,:)! buffer to hold transposed matrix + integer(kind=INT64) :: bytes ! combined size of matrices + ! runtime variables + integer(kind=INT32) :: i, j, k + real(kind=REAL64) :: abserr, addit, temp ! 
squared error + real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters + real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance + ! CUDA stuff + type(dim3) :: grid, tblock + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a41)') 'CUDA Fortran Matrix transpose: B = A^T' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + stop 1 + endif + + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8)') 'Matrix order = ', order + + tblock = dim3(order/tile_dim, order/tile_dim, 1) + grid = dim3(tile_dim, block_rows, 1) + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! ******************************************************************** + + allocate( A(order,order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of A returned ',err + stop 1 + endif + + allocate( B(order,order), stat=err ) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of B returned ',err + stop 1 + endif + + t0 = 0 + + do j=1,order + do i=1,order + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0.0 + enddo + enddo + + do k=0,iterations + + if (k.eq.1) t0 = prk_get_wtime() + + call transposeNaive<<>>(order, A, B) + + enddo ! iterations + + t1 = prk_get_wtime() + + trans_time = t1 - t0 + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + abserr = 0.0 + ! this will overflow if iterations>>1000 + addit = (0.5*iterations) * (iterations+1) + do j=1,order + do i=1,order + temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & + * real(iterations+1,REAL64) + abserr = abserr + abs(B(i,j) - (temp+addit)) + enddo + enddo + + deallocate( B ) + deallocate( A ) + + if (abserr .lt. 
epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + stop 1 + endif + +end program main + From 7d796ace9496c56e0f429e9629274e129871ed53 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 11 Apr 2021 20:12:10 -0700 Subject: [PATCH 078/325] update makefile --- FORTRAN/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index b5abd0cc5..30c1ba9fc 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -79,7 +79,7 @@ target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgem openacc: p2p-openacc p2p-innerloop-openacc stencil-openacc transpose-openacc nstream-openacc cuf: cufortran -cufortran: nstream-cufortran +cufortran: nstream-cufortran transpose-cufortran stdpar: nstream-stdpar stencil-stdpar transpose-stdpar @@ -151,4 +151,5 @@ clean: -rm -f *-target -rm -f *-openacc -rm -f *-stdpar + -rm -f *-cufortran -rm -f pic pic_soa From a1700dc7cd9c0eec9de32f2255a8e3a00f0b6f80 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 10:44:14 -0700 Subject: [PATCH 079/325] fix issue with parallel build of fortran modules of same name --- FORTRAN/nstream-cufortran.cuf | 14 ++++++------ FORTRAN/transpose-cufortran.cuf | 39 +++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf index ac716c1ff..6962cb610 100644 --- a/FORTRAN/nstream-cufortran.cuf +++ b/FORTRAN/nstream-cufortran.cuf @@ -72,10 +72,10 @@ function prk_get_wtime() result(t) t = real(c,REAL64) / real(r,REAL64) end function prk_get_wtime -module kernels +module nstream +use iso_fortran_env contains - attributes(global) subroutine nstream(scalar, A, B, C) - use iso_fortran_env + attributes(global) subroutine kernel(scalar, A, B, C) implicit none real(kind=REAL64), intent(inout) :: A(:) real(kind=REAL64), intent(in) :: B(:), C(:) @@ -86,13 +86,13 @@ contains if (i <= n) then A(i) = A(i) + B(i) + scalar * C(i) endif - end subroutine nstream -end module kernels + end subroutine kernel +end module nstream program main use iso_fortran_env use cudafor - use kernels + use nstream implicit none real(kind=REAL64) :: prk_get_wtime ! for argument parsing @@ -198,7 +198,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() - call nstream<<>>(scalar, A, B, C) + call kernel<<>>(scalar, A, B, C) enddo ! 
iterations diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index f9bf68673..6c887814c 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -61,30 +61,30 @@ function prk_get_wtime() result(t) t = real(c,REAL64) / real(r,REAL64) end function prk_get_wtime -module kernels +module transpose use iso_fortran_env integer(kind=INT32), parameter :: tile_dim = 32 integer(kind=INT32), parameter :: block_rows = 8 contains - attributes(global) subroutine transposeNaive(order, A, B) - use iso_fortran_env + attributes(global) subroutine naive(order, A, B) implicit none - real(kind=REAL64), intent(inout) :: A(:,:), B(:,:) + real(kind=REAL64), intent(inout) :: A(order,order) + real(kind=REAL64), intent(inout) :: B(order,order) integer(kind=INT32), intent(in), value :: order integer :: x, y, j - x = blockIdx%x * tile_dim + threadIdx%x; - y = blockIdx%y * tile_dim + threadIdx%y; - do j = 1,tile_dim,block_rows - B(x,y+j) = B(x,y+j) + A(y+j,x); - A(y+j,x) = A(y+j,x) + 1; + x = (blockIdx%x-1) * tile_dim + (threadIdx%x-1); + y = (blockIdx%y-1) * tile_dim + (threadIdx%y-1); + do j = 0,tile_dim-1,block_rows + B(y+j,x) = B(y+j,x) + A(x,y+j); + A(x,y+j) = A(x,y+j) + 1.0d0; end do - end subroutine transposeNaive -end module kernels + end subroutine naive +end module transpose program main use iso_fortran_env use cudafor - use kernels + use transpose implicit none real(kind=REAL64) :: prk_get_wtime ! for argument parsing @@ -169,7 +169,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() - call transposeNaive<<>>(order, A, B) + call naive<<>>(order, A, B) enddo ! iterations @@ -192,9 +192,6 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) - if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' avgtime = trans_time/iterations @@ -202,10 +199,18 @@ program main write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & ' Avg time (s): ', avgtime else - write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + write(*,'(a,e30.15,a,e30.15)') 'ERROR: Aggregate squared error ',abserr, & 'exceeds threshold ',epsilon + do j=1,order + do i=1,order + print*,i,j,A(i,j),B(i,j) + enddo + enddo stop 1 endif + deallocate( A ) + deallocate( B ) + end program main From 1e4fecd55c173929c3ab7d1821f5c004fb5ec417 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 12:50:01 -0700 Subject: [PATCH 080/325] add support for variants after making correct --- FORTRAN/transpose-cufortran.cuf | 35 +++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index 6c887814c..283cebccb 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -72,8 +72,8 @@ module transpose real(kind=REAL64), intent(inout) :: B(order,order) integer(kind=INT32), intent(in), value :: order integer :: x, y, j - x = (blockIdx%x-1) * tile_dim + (threadIdx%x-1); - y = (blockIdx%y-1) * tile_dim + (threadIdx%y-1); + x = (blockIdx%x-1) * tile_dim + (threadIdx%x); + y = (blockIdx%y-1) * tile_dim + (threadIdx%y); do j = 0,tile_dim-1,block_rows B(y+j,x) = B(y+j,x) + A(x,y+j); A(x,y+j) = A(x,y+j) + 1.0d0; @@ -104,17 +104,22 @@ program main real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance ! CUDA stuff type(dim3) :: grid, tblock + integer :: variant + character(len=16), dimension(3) :: variant_name + variant_name(1) = 'naive' + variant_name(2) = 'coalesced' + variant_name(3) = 'no bank conflicts' ! 
******************************************************************** ! read and test input parameters ! ******************************************************************** write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a41)') 'CUDA Fortran Matrix transpose: B = A^T' + write(*,'(a38)') 'CUDA Fortran Matrix transpose: B = A^T' if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + write(*,'(a66)') 'Usage: ./transpose <# iterations> [variant (0/1/2)]' stop 1 endif @@ -134,11 +139,22 @@ program main stop 1 endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order + variant = 2 + if (command_argument_count().gt.2) then + call get_command_argument(3,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') variant + endif + if ((variant .lt. 0).or.(variant.gt.2)) then + write(*,'(a,i5)') 'ERROR: variant must be 0, 1 or 2 : ', variant + stop 1 + endif + + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8)') 'Matrix order = ', order + write(*,'(a,a16)') 'Variant = ', variant_name(variant+1) - tblock = dim3(order/tile_dim, order/tile_dim, 1) - grid = dim3(tile_dim, block_rows, 1) + grid = dim3(order/tile_dim, order/tile_dim, 1) + tblock = dim3(tile_dim, block_rows, 1) ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -170,6 +186,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() call naive<<>>(order, A, B) + err = cudaDeviceSynchronize() enddo ! iterations @@ -201,11 +218,13 @@ program main else write(*,'(a,e30.15,a,e30.15)') 'ERROR: Aggregate squared error ',abserr, & 'exceeds threshold ',epsilon + call flush(0) do j=1,order do i=1,order print*,i,j,A(i,j),B(i,j) enddo enddo + call flush(0) stop 1 endif From cd79868342dcd4fc1ba19aa4caafbc9a51cc5b9d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 12:52:55 -0700 Subject: [PATCH 081/325] add more variant skeleton --- FORTRAN/transpose-cufortran.cuf | 34 ++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index 283cebccb..9c31fac87 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -79,6 +79,32 @@ module transpose A(x,y+j) = A(x,y+j) + 1.0d0; end do end subroutine naive + attributes(global) subroutine coalesced(order, A, B) + implicit none + real(kind=REAL64), intent(inout) :: A(order,order) + real(kind=REAL64), intent(inout) :: B(order,order) + integer(kind=INT32), intent(in), value :: order + integer :: x, y, j + x = (blockIdx%x-1) * tile_dim + (threadIdx%x); + y = (blockIdx%y-1) * tile_dim + (threadIdx%y); + do j = 0,tile_dim-1,block_rows + B(y+j,x) = B(y+j,x) + A(x,y+j); + A(x,y+j) = A(x,y+j) + 1.0d0; + end do + end subroutine coalesced + attributes(global) subroutine nobankconflicts(order, A, B) + implicit none + real(kind=REAL64), intent(inout) :: A(order,order) + real(kind=REAL64), intent(inout) :: B(order,order) + integer(kind=INT32), intent(in), value :: order + integer :: x, y, j + x = (blockIdx%x-1) * tile_dim + (threadIdx%x); + y = (blockIdx%y-1) * tile_dim + (threadIdx%y); + do j = 0,tile_dim-1,block_rows + B(y+j,x) = B(y+j,x) + A(x,y+j); + A(x,y+j) = A(x,y+j) + 1.0d0; + end do + end subroutine nobankconflicts end module transpose program main @@ -185,7 +211,13 @@ program main if (k.eq.1) t0 = prk_get_wtime() - 
call naive<<>>(order, A, B) + if (variant.eq.0) then + call naive<<>>(order, A, B) + else if (variant.eq.1) then + call coalesced<<>>(order, A, B) + else if (variant.eq.2) then + call nobankconflicts<<>>(order, A, B) + endif err = cudaDeviceSynchronize() enddo ! iterations From 047c6460af9924c7d453451c7c530c4c34700533 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 14:12:01 -0700 Subject: [PATCH 082/325] coalesced working --- FORTRAN/transpose-cufortran.cuf | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index 9c31fac87..f359403bf 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -84,13 +84,20 @@ module transpose real(kind=REAL64), intent(inout) :: A(order,order) real(kind=REAL64), intent(inout) :: B(order,order) integer(kind=INT32), intent(in), value :: order + real(kind=REAL64), shared :: tile(32,32) integer :: x, y, j x = (blockIdx%x-1) * tile_dim + (threadIdx%x); y = (blockIdx%y-1) * tile_dim + (threadIdx%y); do j = 0,tile_dim-1,block_rows - B(y+j,x) = B(y+j,x) + A(x,y+j); + tile(threadIdx%y+j,threadIdx%x) = A(x,y+j); A(x,y+j) = A(x,y+j) + 1.0d0; end do + call syncThreads() + x = (blockIdx%y-1) * tile_dim + (threadIdx%x); + y = (blockIdx%x-1) * tile_dim + (threadIdx%y); + do j = 0,tile_dim-1,block_rows + B(x,y+j) = B(x,y+j) + tile(threadIdx%x,threadIdx%y+j) + end do end subroutine coalesced attributes(global) subroutine nobankconflicts(order, A, B) implicit none @@ -253,7 +260,9 @@ program main call flush(0) do j=1,order do i=1,order - print*,i,j,A(i,j),B(i,j) + temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & + * real(iterations+1,REAL64) + write(*,'(i4,1x,i4,1x,e10.5,1x,e10.5,1x,e10.5)') i,j,A(i,j),B(i,j),temp+addit enddo enddo call flush(0) From 76f82bb222f5e4e1336bd8b55f15dee2cc674596 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 14:15:45 -0700 Subject: [PATCH 083/325] implement nobankconflicts and align with C++ --- FORTRAN/transpose-cufortran.cuf | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index f359403bf..2495de2b9 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -89,14 +89,14 @@ module transpose x = (blockIdx%x-1) * tile_dim + (threadIdx%x); y = (blockIdx%y-1) * tile_dim + (threadIdx%y); do j = 0,tile_dim-1,block_rows - tile(threadIdx%y+j,threadIdx%x) = A(x,y+j); + tile(threadIdx%x,threadIdx%y+j) = A(x,y+j); A(x,y+j) = A(x,y+j) + 1.0d0; end do call syncThreads() x = (blockIdx%y-1) * tile_dim + (threadIdx%x); y = (blockIdx%x-1) * tile_dim + (threadIdx%y); do j = 0,tile_dim-1,block_rows - B(x,y+j) = B(x,y+j) + tile(threadIdx%x,threadIdx%y+j) + B(x,y+j) = B(x,y+j) + tile(threadIdx%y+j,threadIdx%x) end do end subroutine coalesced attributes(global) subroutine nobankconflicts(order, A, B) @@ -104,13 +104,20 @@ module transpose real(kind=REAL64), intent(inout) :: A(order,order) real(kind=REAL64), intent(inout) :: B(order,order) integer(kind=INT32), intent(in), value :: order + real(kind=REAL64), shared :: tile(33,32) integer :: x, y, j x = (blockIdx%x-1) * tile_dim + (threadIdx%x); y = (blockIdx%y-1) * tile_dim + (threadIdx%y); do j = 0,tile_dim-1,block_rows - B(y+j,x) = B(y+j,x) + A(x,y+j); + tile(threadIdx%x,threadIdx%y+j) = A(x,y+j); A(x,y+j) = A(x,y+j) + 1.0d0; end do + call syncThreads() + x = (blockIdx%y-1) * tile_dim + (threadIdx%x); + y = 
(blockIdx%x-1) * tile_dim + (threadIdx%y); + do j = 0,tile_dim-1,block_rows + B(x,y+j) = B(x,y+j) + tile(threadIdx%y+j,threadIdx%x) + end do end subroutine nobankconflicts end module transpose From 246b25d19b9ed39205fb132309f14f6234b423ac Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 15:01:53 -0700 Subject: [PATCH 084/325] fix incorrect use C constants --- FORTRAN/nstream-mpi.F90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 43849e4ee..57f1342c1 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -154,8 +154,8 @@ program main write(*,'(a,i12)') 'Number of iterations = ', iterations write(*,'(a,i12)') 'Vector length = ', length endif - call MPI_Bcast(iterations, 1, MPI_INT32_T, 0, MPI_COMM_WORLD) - call MPI_Bcast(length, 1, MPI_INT64_T, 0, MPI_COMM_WORLD) + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(length, 1, MPI_INTEGER8, 0, MPI_COMM_WORLD) ! ******************************************************************** ! ** Allocate space and perform the computation @@ -289,7 +289,7 @@ program main asum = asum + abs(A(i)-ar) enddo #endif - call MPI_Allreduce(MPI_IN_PLACE, asum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) + call MPI_Allreduce(MPI_IN_PLACE, asum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD) deallocate( C ) deallocate( B ) From b5107b14b9600a67bc5119c504062da0abb7353a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 12 Apr 2021 15:02:03 -0700 Subject: [PATCH 085/325] update makefile target choices --- FORTRAN/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 30c1ba9fc..b23925327 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -22,7 +22,7 @@ ifeq ($(findstring ifx,$(FC)),ifx) BLASFLAGS += -heap-arrays endif -.PHONY: all clean serial pretty openmp coarray target openacc ga +.PHONY: all clean serial pretty openmp coarray target stdpar blas ga # Intel ifeq ($(findstring ifort,$(FC)),ifort) @@ -41,14 +41,18 @@ ifeq ($(findstring gfortran,$(FC)),gfortran) EXTRA = target coarray taskloop openacc endif # PGI and LLVM Flang -ifeq ($(findstring pgf,$(FC)),pgf) +ifeq ($(findstring flang,$(FC)),flang) EXTRA = target openacc FCFLAGS += -DPGI endif -ifeq ($(findstring flang,$(FC)),flang) - EXTRA = target openacc +ifeq ($(findstring pgf,$(FC)),pgf) + EXTRA = target openacc cufortran FCFLAGS += -DPGI endif +ifeq ($(findstring nvf,$(FC)),nvf) + EXTRA = target openacc cufortran + FCFLAGS += -DNVHPC +endif ifeq ($(findstring xlf,$(FC)),xlf) EXTRA = target FCFLAGS += $(XLFPP)-DXLF From 9087d0c361d76a33dd94403e20ef057a8e50f5f5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 6 Apr 2021 16:23:13 -0700 Subject: [PATCH 086/325] need SGEMM to measure some GPU properly --- Cxx11/Makefile | 2 +- Cxx11/sgemm-cublas.cu | 372 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 Cxx11/sgemm-cublas.cu diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a9d5667e9..270e1effb 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -125,7 +125,7 @@ hipblas: nstream-hipblas dgemm-hipblas thrust: nstream-host-thrust nstream-device-thrust \ transpose-host-thrust transpose-device-thrust -cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas +cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas sgemm-cublas cblas: transpose-cblas dgemm-cblas 
diff --git a/Cxx11/sgemm-cublas.cu b/Cxx11/sgemm-cublas.cu new file mode 100644 index 000000000..3b8984acf --- /dev/null +++ b/Cxx11/sgemm-cublas.cu @@ -0,0 +1,372 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: sgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblassgemm() +/// cublassgemmStridedBatched() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_cuda.h" + +#if 0 +__global__ void init(unsigned order, float * A, float * B, float * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i [] []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + batches = std::atoi(argv[3]); + } + + if (argc > 4) { + input_copy = std::atoi(argv[4]); + if (input_copy != 0 && input_copy != 1) { + throw "ERROR: input_copy was not 0 or 1"; + } + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches < 0) { + std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl; + } else if (batches > 0) { + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; + } + std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl; + + cublasHandle_t h; + prk::CUDA::check( cublasCreate(&h) ); + + const int tile_size = 32; + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + + info.checkDims(dimBlock, dimGrid); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double sgemm_time(0); + + const int matrices = (batches==0 ? 
1 : abs(batches)); + const size_t nelems = (size_t)order * (size_t)order; + + // host buffers + float * h_a = prk::CUDA::malloc_host(nelems); + float * h_b = prk::CUDA::malloc_host(nelems); + float * h_c = prk::CUDA::malloc_host(matrices*nelems); + + // device buffers + float * d_a = prk::CUDA::malloc_device(matrices*nelems); + float * d_b = prk::CUDA::malloc_device(matrices*nelems); + float * d_c = prk::CUDA::malloc_device(matrices*nelems); + + if (input_copy) { + for (int i=0; i>>(order, matrices, d_c); + + } else { + + init<<>>(order, matrices, d_a, d_b, d_c); + + } + prk::CUDA::sync(); + + float xfer(0); + float comp(0); + { + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) { + prk::CUDA::sync(); + sgemm_time = prk::wtime(); + } + + if (input_copy) { + float t0 = prk::wtime(); + for (int b=0; b 0) { + prk_bgemm(h, order, matrices, d_a, d_b, d_c); + } else { + prk_sgemm(h, order, matrices, d_a, d_b, d_c); + } + float t1 = prk::wtime(); + if (iter==1) comp += (t1-t0); + } + } + prk::CUDA::sync(); + sgemm_time = prk::wtime() - sgemm_time; + } + std::cout << "xfer, comp = " << xfer << "," << comp << std::endl; + + // copy output back to host + prk::CUDA::copyD2H(h_c, d_c, matrices*nelems); + + prk::CUDA::free(d_a); + prk::CUDA::free(d_b); + prk::CUDA::free(d_c); + + prk::CUDA::free_host(h_a); + prk::CUDA::free_host(h_b); + + prk::CUDA::check( cublasDestroy(h) ); + + prk::CUDA::sync(); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const auto epsilon = 1.0e-8; + const auto forder = static_cast(order); + const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double residuum(0); + for (int b=0; b(nelems); - float * h_b = prk::CUDA::malloc_host(nelems); - float * h_c = prk::CUDA::malloc_host(matrices*nelems); + auto h_a = prk::CUDA::malloc_host(nelems); + auto h_b = prk::CUDA::malloc_host(nelems); + auto h_c = prk::CUDA::malloc_host(matrices*nelems); // device buffers - float * d_a = prk::CUDA::malloc_device(matrices*nelems); - float * d_b = prk::CUDA::malloc_device(matrices*nelems); - float * d_c = prk::CUDA::malloc_device(matrices*nelems); + auto d_a = prk::CUDA::malloc_device(matrices*nelems); + auto d_b = prk::CUDA::malloc_device(matrices*nelems); + auto d_c = prk::CUDA::malloc_device(matrices*nelems); if (input_copy) { for (int i=0; i 0) { prk_bgemm(h, order, matrices, d_a, d_b, d_c); } else { prk_sgemm(h, order, matrices, d_a, d_b, d_c); } - float t1 = prk::wtime(); + double t1 = prk::wtime(); if (iter==1) comp += (t1-t0); } } prk::CUDA::sync(); - sgemm_time = prk::wtime() - sgemm_time; + gemm_time = prk::wtime() - gemm_time; } std::cout << "xfer, comp = " << xfer << "," << comp << std::endl; @@ -339,7 +337,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// const auto epsilon = 1.0e-8; - const auto forder = static_cast(order); + const auto forder = static_cast(order); const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); double residuum(0); for (int b=0; b Date: Sun, 11 Apr 2021 20:25:39 -0700 Subject: [PATCH 088/325] fix typo --- common/make.defs.nvhpc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index a1b2040af..401d77f92 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -6,7 +6,7 @@ # Base compilers and language 
options # # C99 is required in some implementations. -CC=${NVHPC_CBIB}nvc -c11 +CC=${NVHPC_CBIN}nvc -c11 # All of the Fortran code is written for the 2008 standard and requires preprocessing. FC=${NVHPC_CBIN}nvfortran -DNVHPC # C++11 may not be required but does no harm here. From b2f7299f90d1117f05e40216405e0206af599075 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 11 Apr 2021 20:39:13 -0700 Subject: [PATCH 089/325] just link cublas because CUDA 10.0 on AGX doesnt have it --- Cxx11/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 270e1effb..a68f8a3f8 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -251,10 +251,10 @@ endif $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@ %-mpi-cublas: %-mpi-cublas.cu prk_util.h prk_cuda.h prk_mpi.h - $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(MPIINC) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt $(MPILIB) -o $@ + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(MPIINC) -DPRK_USE_CUBLAS $< -lcublas $(MPILIB) -o $@ %-cublas: %-cublas.cu prk_util.h prk_cuda.h - $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt -o $@ + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) -DPRK_USE_CUBLAS $< -lcublas -o $@ %-hip: %-hip.cc prk_util.h prk_hip.h $(HIPCC) $(HIPFLAGS) $(CPPFLAGS) $< -o $@ From c9a340a5c18bf36920033e016e868997315f86e3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 11 Apr 2021 20:42:08 -0700 Subject: [PATCH 090/325] use generic MPI --- common/make.defs.cuda | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index e7fb5163c..ec460964c 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -167,14 +167,10 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED # # MPI-3 # -# We assume you have Intel MPI and have setup your environment with e.g. -# . /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh -# in your .bashrc. -# -# mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
-MPIDIR=/opt/intel/inteloneapi/mpi/2021.1-beta06 -MPICC=${MPIDIR}/bin/mpiicc -MPICXX=${MPIDIR}/bin/mpiicpc +MPIDIR=/usr/lib/aarch64-linux-gnu/openmpi +MPICC=mpicc +MPICXX=mpicxx +MPIFC=mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi From 7527d7a3e50337a4cc62e5a3bd68049e51894689 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 7 Apr 2021 11:44:28 -0700 Subject: [PATCH 091/325] ignore AMD GPU compiler temps --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitignore b/.gitignore index 2aa16a256..651549971 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,16 @@ octave-workspace # Octave crashes *.cub *.ptx +# AMD GPU compiler +*.cmdx +*.cmod +*.ilm +*.stb +*.cmdx +*.cmod +*.ilm +*.stb + common/make.defs scripts/small/runfgmpi scripts/wide/runfgmpi From 2eb5aff437a12fae2217f51d0d92b361ace63590 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 7 Apr 2021 11:44:52 -0700 Subject: [PATCH 092/325] add CUDA wrappers into HIP header too --- Cxx11/prk_hip.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/Cxx11/prk_hip.h b/Cxx11/prk_hip.h index e9b186608..f2d78005b 100644 --- a/Cxx11/prk_hip.h +++ b/Cxx11/prk_hip.h @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -18,7 +17,6 @@ #include "prk_ranges.h" #endif -//typedef float prk_float; typedef double prk_float; namespace prk @@ -133,6 +131,78 @@ namespace prk } }; + template + T * malloc_device(size_t n) { + T * ptr; + size_t bytes = n * sizeof(T); + prk::HIP::check( hipMalloc((void**)&ptr, bytes) ); + return ptr; + } + + template + T * malloc_host(size_t n) { + T * ptr; + size_t bytes = n * sizeof(T); + prk::HIP::check( hipHostMalloc((void**)&ptr, bytes) ); + return ptr; + } + + template + T * malloc_managed(size_t n) { + T * ptr; + size_t bytes = n * sizeof(T); + prk::HIP::check( hipMallocManaged((void**)&ptr, bytes) ); + return ptr; + } + + template + void free(T * ptr) { + prk::HIP::check( hipFree((void*)ptr) ); + } + + template + void free_host(T * ptr) { + prk::HIP::check( hipHostFree((void*)ptr) ); + } + + template + void copyD2H(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::HIP::check( hipMemcpy(output, input, bytes, hipMemcpyDeviceToHost) ); + } + + template + void copyH2D(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::HIP::check( hipMemcpy(output, input, bytes, hipMemcpyHostToDevice) ); + } + + template + void copyD2Hasync(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::HIP::check( hipMemcpyAsync(output, input, bytes, hipMemcpyDeviceToHost) ); + } + + template + void copyH2Dasync(T * output, T * const input, size_t n) { + size_t bytes = n * sizeof(T); + prk::HIP::check( hipMemcpyAsync(output, input, bytes, hipMemcpyHostToDevice) ); + } + + template + void prefetch(T * ptr, size_t n, int device = 0) { + //size_t bytes = n * sizeof(T); + //std::cout << "device=" << device << "\n"; + } + + void sync(void) { + prk::HIP::check( hipDeviceSynchronize() ); + } + + void set_device(int i) { + prk::HIP::check( hipSetDevice(i) ); + } + } // HIP namespace } // prk namespace From d56cc3be5c76171587340a9b9cb296f8de2aace6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 7 Apr 2021 11:45:10 -0700 Subject: [PATCH 093/325] match CUDA impl --- Cxx11/transpose-hip.cc | 161 +++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 70 
deletions(-) diff --git a/Cxx11/transpose-hip.cc b/Cxx11/transpose-hip.cc index f10f23e17..c77c9a83c 100644 --- a/Cxx11/transpose-hip.cc +++ b/Cxx11/transpose-hip.cc @@ -1,6 +1,6 @@ /// /// Copyright (c) 2013, Intel Corporation -/// Copyright (c) 2015, NVIDIA CORPORATION. +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -56,38 +56,70 @@ #include "prk_util.h" #include "prk_hip.h" -#define TILED 1 +// The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu -#if TILED -// The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu, -// which is the reason for the additional copyright noted above. - -const int tile_dim = 32; +const int tile_dim = 64; const int block_rows = 8; -__global__ void transpose(int order, prk_float * A, prk_float * B) +__global__ void transposeNoBankConflict(int order, double * A, double * B) { + __shared__ double tile[tile_dim][tile_dim+1]; + auto x = blockIdx.x * tile_dim + threadIdx.x; auto y = blockIdx.y * tile_dim + threadIdx.y; auto width = gridDim.x * tile_dim; + for (int j = 0; j < tile_dim; j += block_rows) { + tile[threadIdx.y+j][threadIdx.x] = A[(y+j)*width + x]; + A[(y+j)*width + x] += (double)1; + } + + __syncthreads(); + + x = blockIdx.y * tile_dim + threadIdx.x; + y = blockIdx.x * tile_dim + threadIdx.y; + for (int j = 0; j < tile_dim; j+= block_rows) { - B[x*width + (y+j)] += A[(y+j)*width + x]; - A[(y+j)*width + x] += (prk_float)1; + B[(y+j)*width + x] += tile[threadIdx.x][threadIdx.y + j]; } } -#else -__global__ void transpose(unsigned order, prk_float * A, prk_float * B) + +__global__ void transposeCoalesced(int order, double * A, double * B) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto j = blockIdx.y * blockDim.y + threadIdx.y; + __shared__ double tile[tile_dim][tile_dim]; - if ((i vnames = {"naive", "coalesced", "no bank conflicts"}; int main(int argc, char * argv[]) { @@ -102,10 +134,10 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// int iterations; - int order, tile_size; + int order, variant; try { if (argc < 3) { - throw "Usage: <# iterations> "; + throw "Usage: <# iterations> [variant (0/1/2)]"; } iterations = std::atoi(argv[1]); @@ -120,20 +152,13 @@ int main(int argc, char * argv[]) throw "ERROR: matrix dimension too large - overflow risk"; } -#if TILED - if (order % tile_dim != 0) { - std::cout << "Sorry, but order (" << order << ") must be evenly divible by " << tile_dim - << " or the results are going to be wrong.\n"; - } -#else - // default tile size for tiling of local transpose - tile_size = 32; + variant = 2; // transposeNoBankConflicts if (argc > 3) { - tile_size = std::atoi(argv[3]); - if (tile_size <= 0) tile_size = order; - if (tile_size > order) tile_size = order; + variant = std::atoi(argv[3]); + } + if (variant < 0 || variant > 2) { + throw "Please select a valid variant (0: naive 1: coalesced, 2: no bank conflicts)"; } -#endif } catch (const char * e) { std::cout << e << std::endl; @@ -142,19 +167,10 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; -#if TILED - std::cout << "Tile size = " << tile_dim << std::endl; -#else - std::cout << "Tile size = " << tile_size << std::endl; 
-#endif + std::cout << "Variant = " << vnames[variant] << std::endl; -#if TILED dim3 dimGrid(order/tile_dim, order/tile_dim, 1); dim3 dimBlock(tile_dim, block_rows, 1); -#else - dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); - dim3 dimBlock(tile_size, tile_size, 1); -#endif info.checkDims(dimBlock, dimGrid); @@ -163,48 +179,53 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// const size_t nelems = (size_t)order * (size_t)order; - const size_t bytes = nelems * sizeof(prk_float); - prk_float * h_a; - prk_float * h_b; - prk::HIP::check( hipHostMalloc((void**)&h_a, bytes) ); - prk::HIP::check( hipHostMalloc((void**)&h_b, bytes) ); + + double * h_a = prk::HIP::malloc_host(nelems); + double * h_b = prk::HIP::malloc_host(nelems); + // fill A with the sequence 0 to order^2-1 for (int j=0; j(order*j+i); - h_b[j*order+i] = static_cast(0); + h_a[j*order+i] = static_cast(order*j+i); + h_b[j*order+i] = static_cast(0); } } // copy input from host to device - prk_float * d_a; - prk_float * d_b; - prk::HIP::check( hipMalloc((void**)&d_a, bytes) ); - prk::HIP::check( hipMalloc((void**)&d_b, bytes) ); - prk::HIP::check( hipMemcpy(d_a, &(h_a[0]), bytes, hipMemcpyHostToDevice) ); - prk::HIP::check( hipMemcpy(d_b, &(h_b[0]), bytes, hipMemcpyHostToDevice) ); + double * d_a = prk::HIP::malloc_device(nelems); + double * d_b = prk::HIP::malloc_device(nelems); - auto trans_time = 0.0; + prk::HIP::copyH2D(d_a, h_a, nelems); + prk::HIP::copyH2D(d_b, h_b, nelems); + + double trans_time{0}; for (int iter = 0; iter<=iterations; iter++) { - if (iter==1) trans_time = prk::wtime(); + if (iter==1) { + prk::HIP::sync(); + trans_time = prk::wtime(); + } - hipLaunchKernelGGL(transpose, dim3(dimGrid), dim3(dimBlock), 0, 0, order, d_a, d_b); - prk::HIP::check( hipDeviceSynchronize() ); + if (variant==0) { + hipLaunchKernelGGL(transposeNaive, dim3(dimGrid), dim3(dimBlock), 0, 0, order, d_a, d_b); + } else if (variant==1) { + hipLaunchKernelGGL(transposeCoalesced, dim3(dimGrid), dim3(dimBlock), 0, 0, order, d_a, d_b); + } else if (variant==2) { + hipLaunchKernelGGL(transposeNoBankConflict, dim3(dimGrid), dim3(dimBlock), 0, 0, order, d_a, d_b); + } + prk::HIP::sync(); } trans_time = prk::wtime() - trans_time; - // copy output back to host - prk::HIP::check( hipMemcpy(&(h_b[0]), d_b, bytes, hipMemcpyDeviceToHost) ); + prk::HIP::copyD2H(h_b, d_b, nelems); #ifdef VERBOSE - // copy input back to host - debug only - prk::HIP::check( hipMemcpy(&(h_a[0]), d_a, bytes, hipMemcpyDeviceToHost) ); + prk::HIP::copyD2H(h_a, d_a, nelems); #endif - prk::HIP::check( hipFree(d_b) ); - prk::HIP::check( hipFree(d_a) ); + prk::HIP::free(d_a); + prk::HIP::free(d_b); ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -225,14 +246,11 @@ int main(int argc, char * argv[]) std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - prk::HIP::check( hipHostFree(h_b) ); - prk::HIP::check( hipHostFree(h_a) ); - - const auto epsilon = 1.0e-8; + const double epsilon = 1.0e-8; if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; auto avgtime = trans_time/iterations; - auto bytes = (size_t)order * (size_t)order * sizeof(prk_float); + auto bytes = (size_t)order * (size_t)order * sizeof(double); std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime << " Avg time (s): " << avgtime << std::endl; } else { @@ -248,6 +266,9 @@ int main(int argc, char * argv[]) return 1; } + 
prk::HIP::free_host(h_a); + prk::HIP::free_host(h_b); + return 0; } From aae9eab4852a76bad69255f1030ef0dcd52f3a27 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 7 Apr 2021 11:46:50 -0700 Subject: [PATCH 094/325] match CUDA version --- Cxx11/transpose-hipblas.cc | 213 +++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 Cxx11/transpose-hipblas.cc diff --git a/Cxx11/transpose-hipblas.cc b/Cxx11/transpose-hipblas.cc new file mode 100644 index 000000000..bdc5ba2f9 --- /dev/null +++ b/Cxx11/transpose-hipblas.cc @@ -0,0 +1,213 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_hip.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/HIPBLAS Matrix transpose: B = A^T" << std::endl; + + prk::HIP::info info; + info.print(); + + ////////////////////////////////////////////////////////////////////// + // Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + hipblasHandle_t h; + //prk::HIP::check( cublasInit() ); + prk::HIP::check( hipblasCreate(&h) ); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + const size_t nelems = (size_t)order * (size_t)order; + + double * h_a = prk::HIP::malloc_host(nelems); + double * h_b = prk::HIP::malloc_host(nelems); + double * h_o = prk::HIP::malloc_host(1); + + // fill A with the sequence 0 to order^2-1 + for (int j=0; j(order*j+i); + h_b[j*order+i] = static_cast(0); + } + } + h_o[0] = 1; + + // copy input from host to device + double * d_a = prk::HIP::malloc_device(nelems); + double * d_b = prk::HIP::malloc_device(nelems); + double * d_o = prk::HIP::malloc_device(1); + + prk::HIP::copyH2D(d_a, h_a, nelems); + prk::HIP::copyH2D(d_b, h_b, nelems); + prk::HIP::copyH2D(d_o, h_o, 1); + + double trans_time{0}; + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) { + prk::HIP::sync(); + trans_time = prk::wtime(); + } + + double one(1); + // B += trans(A) i.e. B = trans(A) + B + prk::HIP::check( hipblasDgeam(h, + HIPBLAS_OP_T, HIPBLAS_OP_N, // opA, opB + order, order, // m, n + &one, d_a, order, // alpha, A, lda + &one, d_b, order, // beta, B, ldb + d_b, order) ); // C, ldc (in-place for B) + + // A += 1.0 i.e. A = 1.0 * 1.0 + A + prk::HIP::check( hipblasDaxpy(h, + order*order, // n + &one, // alpha + d_o, 0, // x, incx + d_a, 1) ); // y, incy + prk::HIP::sync(); + } + trans_time = prk::wtime() - trans_time; + + prk::HIP::copyD2H(h_b, d_b, nelems); + + prk::HIP::free(d_a); + prk::HIP::free(d_b); + prk::HIP::free(d_o); + prk::HIP::free_host(h_o); + + prk::HIP::check( hipblasDestroy(h) ); + //prk::HIP::check( cublasShutdown() ); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double addit = (iterations+1.) 
* (iterations/2.); + double abserr(0); + for (int j=0; j(ij)*(1.+iterations)+addit; + abserr += prk::abs(h_b[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + prk::HIP::free_host(h_a); + prk::HIP::free_host(h_b); + + const double epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { +#ifdef VERBOSE + for (int i=0; i Date: Wed, 7 Apr 2021 11:46:59 -0700 Subject: [PATCH 095/325] add transpose-hipblas --- Cxx11/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a68f8a3f8..ebc20d9ce 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -120,7 +120,7 @@ hip: nstream-hip transpose-hip stencil-hip nstream-managed-hip hipstl: nstream-hipstl -hipblas: nstream-hipblas dgemm-hipblas +hipblas: nstream-hipblas dgemm-hipblas transpose-hipblas thrust: nstream-host-thrust nstream-device-thrust \ transpose-host-thrust transpose-device-thrust From 6b7e1abf6165debe265620ac24e10ac1d844699c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 13 Apr 2021 16:53:54 -0700 Subject: [PATCH 096/325] updates to match CUBLAS version --- Cxx11/dgemm-hipblas.cc | 66 +++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/Cxx11/dgemm-hipblas.cc b/Cxx11/dgemm-hipblas.cc index a0edd4a42..b41f389ce 100644 --- a/Cxx11/dgemm-hipblas.cc +++ b/Cxx11/dgemm-hipblas.cc @@ -1,5 +1,6 @@ /// /// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -135,7 +136,6 @@ void prk_dgemm(const hipblasHandle_t & h, &beta, // beta pC, order) ); // C, ldc } - prk::HIP::check( hipDeviceSynchronize() ); } void prk_bgemm(const hipblasHandle_t & h, @@ -157,7 +157,6 @@ void prk_bgemm(const hipblasHandle_t & h, &beta, C, order, order*order, batches) ); - prk::HIP::check( hipDeviceSynchronize() ); // hipblasStatus_t hipblasDgemmBatched(hipblasHandle_t handle, // hipblasOperation_t transa, @@ -222,7 +221,6 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; - if (batches == 0) { std::cout << "No batching" << std::endl; } else if (batches < 0) { @@ -245,30 +243,22 @@ int main(int argc, char * argv[]) // Allocate space for matrices ////////////////////////////////////////////////////////////////////// - double dgemm_time(0); + double gemm_time(0); const int matrices = (batches==0 ? 
1 : abs(batches)); const size_t nelems = (size_t)order * (size_t)order; - const size_t bytes = nelems * sizeof(double); // host buffers - double * h_a; - double * h_b; - double * h_c; - prk::HIP::check( hipHostMalloc((void**)&h_a, bytes) ); - prk::HIP::check( hipHostMalloc((void**)&h_b, bytes) ); - prk::HIP::check( hipHostMalloc((void**)&h_c, matrices*bytes) ); + auto h_a = prk::HIP::malloc_host(nelems); + auto h_b = prk::HIP::malloc_host(nelems); + auto h_c = prk::HIP::malloc_host(matrices*nelems); // device buffers - double * d_a; - double * d_b; - double * d_c; - prk::HIP::check( hipMalloc((void**)&d_a, matrices*bytes) ); - prk::HIP::check( hipMalloc((void**)&d_b, matrices*bytes) ); - prk::HIP::check( hipMalloc((void**)&d_c, matrices*bytes) ); + auto d_a = prk::HIP::malloc_device(matrices*nelems); + auto d_b = prk::HIP::malloc_device(matrices*nelems); + auto d_c = prk::HIP::malloc_device(matrices*nelems); if (input_copy) { - for (int i=0; i Date: Tue, 13 Apr 2021 17:06:53 -0700 Subject: [PATCH 097/325] add SGEMM HIP BLAS tester --- Cxx11/Makefile | 2 +- Cxx11/sgemm-hipblas.cc | 375 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 376 insertions(+), 1 deletion(-) create mode 100644 Cxx11/sgemm-hipblas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ebc20d9ce..7c55484f7 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -120,7 +120,7 @@ hip: nstream-hip transpose-hip stencil-hip nstream-managed-hip hipstl: nstream-hipstl -hipblas: nstream-hipblas dgemm-hipblas transpose-hipblas +hipblas: nstream-hipblas sgemm-hipblas dgemm-hipblas transpose-hipblas thrust: nstream-host-thrust nstream-device-thrust \ transpose-host-thrust transpose-device-thrust diff --git a/Cxx11/sgemm-hipblas.cc b/Cxx11/sgemm-hipblas.cc new file mode 100644 index 000000000..36736a00d --- /dev/null +++ b/Cxx11/sgemm-hipblas.cc @@ -0,0 +1,375 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: sgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblasSgemm() +/// hipblasSgemmStridedBatched() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_hip.h" + +#if 0 +__global__ void init(unsigned order, float * A, float * B, float * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i [] []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + batches = std::atoi(argv[3]); + } + + if (argc > 4) { + input_copy = std::atoi(argv[4]); + if (input_copy != 0 && input_copy != 1) { + throw "ERROR: input_copy was not 0 or 1"; + } + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches < 0) { + std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl; + } else if (batches > 0) { + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; + } + std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl; + + hipblasHandle_t h; + prk::HIP::check( hipblasCreate(&h) ); + + const int tile_size = 32; + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + + info.checkDims(dimBlock, dimGrid); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double gemm_time(0); + + const int matrices = (batches==0 ? 
1 : abs(batches)); + const size_t nelems = (size_t)order * (size_t)order; + + // host buffers + auto h_a = prk::HIP::malloc_host(nelems); + auto h_b = prk::HIP::malloc_host(nelems); + auto h_c = prk::HIP::malloc_host(matrices*nelems); + + // device buffers + auto d_a = prk::HIP::malloc_device(matrices*nelems); + auto d_b = prk::HIP::malloc_device(matrices*nelems); + auto d_c = prk::HIP::malloc_device(matrices*nelems); + + if (input_copy) { + for (int i=0; i 0) { + prk_bgemm(h, order, matrices, d_a, d_b, d_c); + } else { + prk_sgemm(h, order, matrices, d_a, d_b, d_c); + } + double t1 = prk::wtime(); + if (iter==1) comp += (t1-t0); + } + } + prk::HIP::sync(); + gemm_time = prk::wtime() - gemm_time; + } + std::cout << "xfer, comp = " << xfer << "," << comp << std::endl; + + // copy output back to host + prk::HIP::copyD2H(h_c, d_c, matrices*nelems); + + prk::HIP::free(d_a); + prk::HIP::free(d_b); + prk::HIP::free(d_c); + + prk::HIP::free_host(h_a); + prk::HIP::free_host(h_b); + + prk::HIP::check( hipblasDestroy(h) ); + + prk::HIP::sync(); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double residuum{0}; + for (int b=0; b prk::get_max_matrix_size()) { throw "ERROR: matrix dimension too large - overflow risk"; + } else if (order % TILE_DIM) { + throw "ERROR: matrix dimension not divisible by tile size"; } variant = 2; // transposeNoBankConflicts From b6fa63e572377850e5db3e525753d0ed5385cba8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 15 Apr 2021 16:23:44 -0700 Subject: [PATCH 101/325] cleanup cuda --- Cxx11/prk_cuda.h | 14 -------------- Cxx11/transpose-cuda.cu | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h index d147eb0e0..45a531940 100644 --- a/Cxx11/prk_cuda.h +++ b/Cxx11/prk_cuda.h @@ -8,21 +8,11 @@ #include #include -#ifndef __NVCC__ -#warning Please compile CUDA code with CC=nvcc. #include #include #include #include -#endif - -#if defined(PRK_USE_CUBLAS) -#if defined(__NVCC__) #include -#else -#error Sorry, no CUBLAS without NVCC. -#endif -#endif typedef double prk_float; @@ -40,9 +30,6 @@ namespace prk } } -#if defined(PRK_USE_CUBLAS) - // It seems that Coriander defines cublasStatus_t to cudaError_t - // because the compiler complains that this is a redefinition. 
void check(cublasStatus_t rc) { if (rc==CUBLAS_STATUS_SUCCESS) { @@ -52,7 +39,6 @@ namespace prk std::abort(); } } -#endif class info { diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index d995b01f8..36f1c9777 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -147,7 +147,7 @@ int main(int argc, char * argv[]) throw "ERROR: Matrix Order must be greater than 0"; } else if (order > prk::get_max_matrix_size()) { throw "ERROR: matrix dimension too large - overflow risk"; - } else if (order % TILE_DIM) { + } else if (order % tile_dim) { throw "ERROR: matrix dimension not divisible by tile size"; } From 71e03994b0982784ae1d94240ab197214c5dc955 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 15 Apr 2021 16:26:51 -0700 Subject: [PATCH 102/325] add mpifort --- common/make.defs.nvhpc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 401d77f92..a0f47f74e 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,5 +1,6 @@ # # This file shows the NVHPC toolchain options. +#NVHPC_PATH=/proj/nv/Linux_x86_64/21.3 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_x86_64/2021 #NVHPC_CBIN=${NVHPC_PATH}/compilers/bin # @@ -104,6 +105,7 @@ MPIDIR=${NVHPC_PATH}/comm_libs/openmpi/openmpi-3.1.5 #MPIDIR=${NVHPC_PATH}/comm_libs/openmpi4/openmpi-4.0.5 MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi From 3f9f8e444c88b29df1cc1a55b122887884e4e6b2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 22 Apr 2021 17:59:50 -0700 Subject: [PATCH 103/325] test all devices, not just the first GPU --- Cxx11/nstream-opencl.cc | 88 +++++++++++++++++++---------------------- Cxx11/nstream32.cl | 8 ++++ Cxx11/nstream64.cl | 10 +++++ 3 files changed, 58 insertions(+), 48 deletions(-) create mode 100644 Cxx11/nstream32.cl create mode 100644 Cxx11/nstream64.cl diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 9d2a6d6a1..1c302f9e7 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -68,8 +68,9 @@ template void run(cl::Context context, int iterations, size_t length) { auto precision = (sizeof(T)==8) ? 64 : 32; + auto kfile = "nstream"+std::to_string(precision)+".cl"; - cl::Program program(context, prk::opencl::loadProgram("nstream.cl"), true); + cl::Program program(context, prk::opencl::loadProgram(kfile), true); auto function = (precision==64) ? 
"nstream64" : "nstream32"; @@ -186,54 +187,45 @@ int main(int argc, char* argv[]) /// Setup OpenCL environment ////////////////////////////////////////////////////////////////////// - prk::opencl::listPlatforms(); + std::vector platforms; + cl::Platform::get(&platforms); + for (auto i : platforms) { + std::cout << "CL_PLATFORM_NAME=" << i.getInfo() << "\n"; + std::cout << "CL_PLATFORM_VENDOR=" << i.getInfo() << "\n"; + + std::vector devices; + i.getDevices(CL_DEVICE_TYPE_ALL, &devices); + for (auto j : devices) { + std::cout << " CL_DEVICE_NAME=" << j.getInfo() << "\n"; + std::cout << " CL_DEVICE_VENDOR=" << j.getInfo() << "\n"; + std::cout << " CL_DEVICE_AVAILABLE=" << j.getInfo() << "\n"; + auto t = j.getInfo(); + std::string s; + switch (t) { + case CL_DEVICE_TYPE_CPU: s="CPU"; break; + case CL_DEVICE_TYPE_GPU: s="GPU"; break; + case CL_DEVICE_TYPE_ACCELERATOR: s="ACCELERATOR"; break; + //case CL_DEVICE_TYPE_CUSTOM: s="CUSTOM"; break; + default: s="UNKNOWN"; break; + } + std::cout << " CL_DEVICE_TYPE=" << s << "\n"; + //std::cout << " CL_DEVICE_MAX_COMPUTE_UNITS=" << j.getInfo() << "\n"; + //std::cout << " CL_DEVICE_GLOBAL_MEM_SIZE=" << j.getInfo() << "\n"; + //std::cout << " CL_DEVICE_MAX_CLOCK_FREQUENCY=" << j.getInfo() << "\n"; + //std::cout << " CL_DEVICE_MAX_MEM_ALLOC_SIZE=" << j.getInfo() << "\n"; + //std::cout << " CL_DEVICE_LOCAL_MEM_SIZE=" << j.getInfo() << "\n"; + auto e = j.getInfo(); + auto has64 = prk::stringContains(e,"cl_khr_fp64"); + std::cout << " CL_DEVICE_EXTENSIONS " << (has64 ? "contains" : "does not contain") << " cl_khr_fp64\n"; + std::cout << std::endl; + + cl::Context ctx(j); + if (has64) { + run(ctx, iterations, length); + } + run(ctx, iterations, length); - cl_int err = CL_SUCCESS; - - cl::Context cpu(CL_DEVICE_TYPE_CPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(cpu) ) - { - const int precision = prk::opencl::precision(cpu); - - std::cout << "CPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(cpu, iterations, length); - } - run(cpu, iterations, length); - } else { - std::cerr << "No CPU" << std::endl; - } - - cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(gpu) ) - { - const int precision = prk::opencl::precision(gpu); - - std::cout << "GPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(gpu, iterations, length); - } - run(gpu, iterations, length); - } else { - std::cerr << "No GPU" << std::endl; - } - - cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(acc) ) - { - - const int precision = prk::opencl::precision(acc); - - std::cout << "ACC Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(acc, iterations, length); - } - run(acc, iterations, length); - } else { - std::cerr << "No ACC" << std::endl; + } } return 0; diff --git a/Cxx11/nstream32.cl b/Cxx11/nstream32.cl new file mode 100644 index 000000000..9a43d88ff --- /dev/null +++ b/Cxx11/nstream32.cl @@ -0,0 +1,8 @@ +__kernel void nstream32(const int length, const float scalar, __global float * A, __global float * B, __global float * C) +{ + const int i = get_global_id(0); + + if (i Date: Sat, 24 Apr 2021 14:18:35 -0700 Subject: [PATCH 104/325] test all CPU and GPU nicely --- Cxx11/nstream-opencl.cc | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/Cxx11/nstream-opencl.cc 
b/Cxx11/nstream-opencl.cc index 1c302f9e7..603bb52c7 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -190,41 +190,20 @@ int main(int argc, char* argv[]) std::vector platforms; cl::Platform::get(&platforms); for (auto i : platforms) { - std::cout << "CL_PLATFORM_NAME=" << i.getInfo() << "\n"; - std::cout << "CL_PLATFORM_VENDOR=" << i.getInfo() << "\n"; - std::vector devices; i.getDevices(CL_DEVICE_TYPE_ALL, &devices); for (auto j : devices) { - std::cout << " CL_DEVICE_NAME=" << j.getInfo() << "\n"; - std::cout << " CL_DEVICE_VENDOR=" << j.getInfo() << "\n"; - std::cout << " CL_DEVICE_AVAILABLE=" << j.getInfo() << "\n"; auto t = j.getInfo(); - std::string s; - switch (t) { - case CL_DEVICE_TYPE_CPU: s="CPU"; break; - case CL_DEVICE_TYPE_GPU: s="GPU"; break; - case CL_DEVICE_TYPE_ACCELERATOR: s="ACCELERATOR"; break; - //case CL_DEVICE_TYPE_CUSTOM: s="CUSTOM"; break; - default: s="UNKNOWN"; break; - } - std::cout << " CL_DEVICE_TYPE=" << s << "\n"; - //std::cout << " CL_DEVICE_MAX_COMPUTE_UNITS=" << j.getInfo() << "\n"; - //std::cout << " CL_DEVICE_GLOBAL_MEM_SIZE=" << j.getInfo() << "\n"; - //std::cout << " CL_DEVICE_MAX_CLOCK_FREQUENCY=" << j.getInfo() << "\n"; - //std::cout << " CL_DEVICE_MAX_MEM_ALLOC_SIZE=" << j.getInfo() << "\n"; - //std::cout << " CL_DEVICE_LOCAL_MEM_SIZE=" << j.getInfo() << "\n"; - auto e = j.getInfo(); - auto has64 = prk::stringContains(e,"cl_khr_fp64"); - std::cout << " CL_DEVICE_EXTENSIONS " << (has64 ? "contains" : "does not contain") << " cl_khr_fp64\n"; - std::cout << std::endl; - - cl::Context ctx(j); - if (has64) { - run(ctx, iterations, length); + if (t == CL_DEVICE_TYPE_CPU || t == CL_DEVICE_TYPE_GPU) { + std::cout << "\n" << "CL_DEVICE_NAME=" << j.getInfo() << "\n"; + auto e = j.getInfo(); + auto has64 = prk::stringContains(e,"cl_khr_fp64"); + cl::Context ctx(j); + run(ctx, iterations, length); + if (has64) { + run(ctx, iterations, length); + } } - run(ctx, iterations, length); - } } From 2b18872f9d3ed00a343b24b40b82cb8c1df64ad4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 24 Apr 2021 14:22:58 -0700 Subject: [PATCH 105/325] improve OpenCL --- Cxx11/nstream-opencl.cc | 2 +- Cxx11/nstream.cl | 19 -------- Cxx11/transpose-opencl.cc | 66 ++++++++------------------ Cxx11/transpose32.cl | 18 +++++++ Cxx11/{transpose.cl => transpose64.cl} | 11 ----- 5 files changed, 40 insertions(+), 76 deletions(-) delete mode 100644 Cxx11/nstream.cl create mode 100644 Cxx11/transpose32.cl rename Cxx11/{transpose.cl => transpose64.cl} (70%) diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 603bb52c7..42f860ae7 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -70,7 +70,7 @@ void run(cl::Context context, int iterations, size_t length) auto precision = (sizeof(T)==8) ? 64 : 32; auto kfile = "nstream"+std::to_string(precision)+".cl"; - cl::Program program(context, prk::opencl::loadProgram(kfile), true); + cl::Program program(context, prk::opencl::loadProgram("transpose.cl"), true); auto function = (precision==64) ? "nstream64" : "nstream32"; diff --git a/Cxx11/nstream.cl b/Cxx11/nstream.cl deleted file mode 100644 index ec6cfb1b2..000000000 --- a/Cxx11/nstream.cl +++ /dev/null @@ -1,19 +0,0 @@ -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -__kernel void nstream32(const int length, const float scalar, __global float * A, __global float * B, __global float * C) -{ - const int i = get_global_id(0); - - if (i void run(cl::Context context, int iterations, int order) { auto precision = (sizeof(T)==8) ? 
64 : 32; + auto kfile = "transpose"+std::to_string(precision)+".cl"; - cl::Program program(context, prk::opencl::loadProgram("transpose.cl"), true); + cl::Program program(context, prk::opencl::loadProgram(kfile), true); auto function = (precision==64) ? "transpose64" : "transpose32"; @@ -142,7 +143,7 @@ int main(int argc, char* argv[]) std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; std::cout << "C++11/OpenCL Matrix transpose: B = A^T" << std::endl; - prk::opencl::listPlatforms(); + //prk::opencl::listPlatforms(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -181,49 +182,24 @@ int main(int argc, char* argv[]) /// Setup OpenCL environment ////////////////////////////////////////////////////////////////////// - cl_int err = CL_SUCCESS; - - cl::Context cpu(CL_DEVICE_TYPE_CPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(cpu) ) - { - const int precision = prk::opencl::precision(cpu); - - std::cout << "CPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(cpu, iterations, order); - } else { - run(cpu, iterations, order); - } - } - - cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(gpu) ) - { - const int precision = prk::opencl::precision(gpu); - - std::cout << "GPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(gpu, iterations, order); - } else { - run(gpu, iterations, order); - } - } - - cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(acc) ) - { - - const int precision = prk::opencl::precision(acc); - - std::cout << "ACC Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(acc, iterations, order); - } else { - run(acc, iterations, order); - } + std::vector platforms; + cl::Platform::get(&platforms); + for (auto i : platforms) { + std::vector devices; + i.getDevices(CL_DEVICE_TYPE_ALL, &devices); + for (auto j : devices) { + auto t = j.getInfo(); + if (t == CL_DEVICE_TYPE_CPU || t == CL_DEVICE_TYPE_GPU) { + std::cout << "\n" << "CL_DEVICE_NAME=" << j.getInfo() << "\n"; + auto e = j.getInfo(); + auto has64 = prk::stringContains(e,"cl_khr_fp64"); + cl::Context ctx(j); + run(ctx, iterations, order); + if (has64) { + run(ctx, iterations, order); + } + } + } } return 0; diff --git a/Cxx11/transpose32.cl b/Cxx11/transpose32.cl new file mode 100644 index 000000000..4db1c162d --- /dev/null +++ b/Cxx11/transpose32.cl @@ -0,0 +1,18 @@ +// +// This is a NAIVE implementation that may perform badly. 
+// +// Examples of better implementations include: +// - https://developer.apple.com/library/content/samplecode/OpenCL_Matrix_Transpose_Example/Introduction/Intro.html +// - https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/oclTranspose/transpose.cl +// + +__kernel void transpose32(const int order, __global float * a, __global float * b) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + if ((i Date: Sat, 24 Apr 2021 14:28:43 -0700 Subject: [PATCH 106/325] improve stencil OpenCL --- Cxx11/add32.cl | 9 +++++ Cxx11/{add.cl => add64.cl} | 10 ----- Cxx11/generate-opencl-stencil.py | 4 +- Cxx11/stencil-opencl.cc | 67 ++++++++++---------------------- 4 files changed, 32 insertions(+), 58 deletions(-) create mode 100644 Cxx11/add32.cl rename Cxx11/{add.cl => add64.cl} (55%) diff --git a/Cxx11/add32.cl b/Cxx11/add32.cl new file mode 100644 index 000000000..f155df8a8 --- /dev/null +++ b/Cxx11/add32.cl @@ -0,0 +1,9 @@ +__kernel void add32(const int n, __global float * inout) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + if ( (i(cpu, iterations, n, radius, star); - } else { - run(cpu, iterations, n, radius, star); - } - } - - cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(gpu) ) - { - const int precision = prk::opencl::precision(gpu); - - std::cout << "GPU Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(gpu, iterations, n, radius, star); - } else { - run(gpu, iterations, n, radius, star); - } - } - - cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err); - if ( err == CL_SUCCESS && prk::opencl::available(acc) ) - { - - const int precision = prk::opencl::precision(acc); - - std::cout << "ACC Precision = " << precision << "-bit" << std::endl; - - if (precision==64) { - run(acc, iterations, n, radius, star); - } else { - run(acc, iterations, n, radius, star); - } + std::vector platforms; + cl::Platform::get(&platforms); + for (auto i : platforms) { + std::vector devices; + i.getDevices(CL_DEVICE_TYPE_ALL, &devices); + for (auto j : devices) { + auto t = j.getInfo(); + if (t == CL_DEVICE_TYPE_CPU || t == CL_DEVICE_TYPE_GPU) { + std::cout << "\n" << "CL_DEVICE_NAME=" << j.getInfo() << "\n"; + auto e = j.getInfo(); + auto has64 = prk::stringContains(e,"cl_khr_fp64"); + cl::Context ctx(j); + run(ctx, iterations, n, radius, star); + if (has64) { + run(ctx, iterations, n, radius, star); + } + } + } } return 0; From 4e7b5555e1ac181a3fd6bb22cdd6d40d04b3bf64 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 24 Apr 2021 14:28:54 -0700 Subject: [PATCH 107/325] ignore better --- .gitignore | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 138a7b278..0716e97b6 100644 --- a/.gitignore +++ b/.gitignore @@ -128,15 +128,7 @@ Cxx11/dgemm-vector Cxx11/dgemm-vector-raja Cxx11/sgemm-cublas Cxx11/sgemm-hipblas -Cxx11/grid1.cl -Cxx11/grid2.cl -Cxx11/grid3.cl -Cxx11/grid4.cl -Cxx11/grid5.cl -Cxx11/grid6.cl -Cxx11/grid7.cl -Cxx11/grid8.cl -Cxx11/grid9.cl +Cxx11/grid*.cl Cxx11/nstream Cxx11/nstream-boost-compute Cxx11/nstream-celerity @@ -207,16 +199,7 @@ Cxx11/pic-taskloop Cxx11/pic-tbb Cxx11/sparse Cxx11/sparse-vector -Cxx11/star1.cl -Cxx11/star10.cl -Cxx11/star2.cl -Cxx11/star3.cl -Cxx11/star4.cl -Cxx11/star5.cl -Cxx11/star6.cl -Cxx11/star7.cl -Cxx11/star8.cl -Cxx11/star9.cl +Cxx11/star*.cl Cxx11/stencil Cxx11/stencil-cilk Cxx11/stencil-cuda From 
de6635e4030887ca6c34aa30ceea1e22e39594ab Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 24 Apr 2021 14:37:46 -0700 Subject: [PATCH 108/325] fix bug and print file load error --- Cxx11/nstream-opencl.cc | 2 +- Cxx11/prk_opencl.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 42f860ae7..603bb52c7 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -70,7 +70,7 @@ void run(cl::Context context, int iterations, size_t length) auto precision = (sizeof(T)==8) ? 64 : 32; auto kfile = "nstream"+std::to_string(precision)+".cl"; - cl::Program program(context, prk::opencl::loadProgram("transpose.cl"), true); + cl::Program program(context, prk::opencl::loadProgram(kfile), true); auto function = (precision==64) ? "nstream64" : "nstream32"; diff --git a/Cxx11/prk_opencl.h b/Cxx11/prk_opencl.h index 880a9f32f..69b2e62ab 100644 --- a/Cxx11/prk_opencl.h +++ b/Cxx11/prk_opencl.h @@ -34,6 +34,7 @@ namespace prk { { std::ifstream stream(input.c_str()); if (!stream.is_open()) { + std::cerr << "loadProgram failed on " << input << std::endl; return std::string("FAIL"); } From d4a45e7c2ec567206e3d5485f3288fe56610ae52 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 29 Mar 2021 14:57:18 -0700 Subject: [PATCH 109/325] fix bug and improve output formatting --- FORTRAN/stencil-stdpar.F90 | 69 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/FORTRAN/stencil-stdpar.F90 b/FORTRAN/stencil-stdpar.F90 index 35be153fd..f0ea2ad3c 100644 --- a/FORTRAN/stencil-stdpar.F90 +++ b/FORTRAN/stencil-stdpar.F90 @@ -112,35 +112,34 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) integer(kind=INT32) :: i, j, ii, jj, it, jt if (is_star) then if (.not.tiling) then + ! this is here to work around a bug + !$acc enter data copyin(a,w) do concurrent (j=r:n-r-1, i=r:n-r-1) - !do j=r,n-r-1 - ! do i=r,n-r-1 - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - ! enddo + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) + enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo enddo else ! tiling - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - enddo + ! 
this is here to work around a bug + !$acc enter data copyin(a,w) + do concurrent (jt=r:n-r-1:tile_size, & + it=r:n-r-1:tile_size) + do concurrent (j=jt:min(n-r-1,jt+tile_size-1), & + i=it:min(n-r-1,it+tile_size-1)) + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) + enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) + enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo enddo enddo @@ -278,22 +277,22 @@ program main norm = 0.d0 active_points = int(n-2*r,INT64)**2 - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' + write(*,'(a22,a8)') 'Type of stencil = ', 'star' stencil_size = 4*r+1 else - write(*,'(a,a)') 'Type of stencil = grid' + write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' + !write(*,'(a)') 'Data type = double precision' + !write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a)') 'Untiled' + write(*,'(a22)') 'Untiled' endif call initialize_w(is_star,r,W) From 7ef65a9d2fe93bface4f084064a7adc050549fce Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 23 Apr 2021 10:29:06 -0700 Subject: [PATCH 110/325] the -mkl flag doesn't work with dpcpp --- common/make.defs.oneapi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index f0ba0b49e..cfd040dc8 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -93,8 +93,8 @@ CBLASFLAG=-DMKL -mkl #MKLROOT=/opt/intel/inteloneapi/mkl/latest #ONEMKLFLAG=-I$(MKLROOT)/include -DMKL_ILP64 ${MKLROOT}/lib/intel64/libmkl_sycl.a -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_tbb_thread ${TBBFLAG} -lmkl_core -lOpenCL -ldl #ONEMKLFLAG+=-I/opt/intel/oneapi/mkl/latest/include/ -#ONEMKLFLAG=-I$(MKLROOT)/include -DMKL_ILP64 ${MKLROOT}/lib/intel64/libmkl_sycl.a -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lOpenCL -ldl -ONEMKLFLAG=-mkl +ONEMKLFLAG=-I$(MKLROOT)/include -DMKL_ILP64 ${MKLROOT}/lib/intel64/libmkl_sycl.a -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lOpenCL -ldl +#ONEMKLFLAG=-mkl # # CUDA flags # From 9aad7aae8419922456bd141278ad19063446d9f2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 23 Apr 2021 10:29:44 -0700 Subject: [PATCH 111/325] use in-order queue for USM oneMKL --- Cxx11/nstream-onemkl.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Cxx11/nstream-onemkl.cc b/Cxx11/nstream-onemkl.cc index 679ccc01e..1afeea839 100644 --- a/Cxx11/nstream-onemkl.cc +++ b/Cxx11/nstream-onemkl.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector{}, sycl::property::queue::in_order{}); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -141,15 +141,15 @@ int main(int argc, char * 
argv[]) double one(1); mkl::blas::axpy(q, length, - one, // alpha - d_B, 1, // x, incx - d_A, 1).wait(); // y, incy + one, // alpha + d_B, 1, // x, incx + d_A, 1); // y, incy mkl::blas::axpy(q, length, - scalar, // alpha - d_C, 1, // x, incx - d_A, 1).wait(); // y, incy - q.wait(); + scalar, // alpha + d_C, 1, // x, incx + d_A, 1); // y, incy } + q.wait(); nstream_time = prk::wtime() - nstream_time; } From 8a0f83815425c503aa0f142ad1efed56c9043fd1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 7 May 2021 14:09:00 -0700 Subject: [PATCH 112/325] Block fortran dgemm (#580) * add stdpar and seperate dgemm and dgemm-openmp * blocking isn't really better here * fix comment --- FORTRAN/Makefile | 5 +- FORTRAN/dgemm-openmp.F90 | 42 ------- FORTRAN/dgemm-stdpar.F90 | 238 +++++++++++++++++++++++++++++++++++++++ FORTRAN/dgemm.F90 | 86 +++----------- 4 files changed, 257 insertions(+), 114 deletions(-) create mode 100644 FORTRAN/dgemm-stdpar.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index b23925327..3cd82fe25 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -82,11 +82,12 @@ target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgem openacc: p2p-openacc p2p-innerloop-openacc stencil-openacc transpose-openacc nstream-openacc + +stdpar: nstream-stdpar stencil-stdpar transpose-stdpar dgemm-stdpar + cuf: cufortran cufortran: nstream-cufortran transpose-cufortran -stdpar: nstream-stdpar stencil-stdpar transpose-stdpar - blas: dgemm-blas %: %.F90 diff --git a/FORTRAN/dgemm-openmp.F90 b/FORTRAN/dgemm-openmp.F90 index a30704357..a2387a535 100644 --- a/FORTRAN/dgemm-openmp.F90 +++ b/FORTRAN/dgemm-openmp.F90 @@ -71,65 +71,41 @@ subroutine prk_dgemm(order, tile_size, A, B, C) integer(kind=INT32) :: i,j,k,it,jt,kt if (tile_size.lt.order) then -#if defined(_OPENMP) !$omp do collapse(3) private(i,j,k,it,jt,kt) do jt=1,order,tile_size do kt=1,order,tile_size do it=1,order,tile_size -#else - do concurrent (jt=1:order:tile_size) - do concurrent (kt=1:order:tile_size) - do concurrent (it=1:order:tile_size) -#endif do j=jt,min(order,jt+tile_size-1) do k=kt,min(order,kt+tile_size-1) -#if defined(_OPENMP) !$omp simd -#endif do i=it,min(order,it+tile_size-1) C(i,j) = C(i,j) + A(i,k) * B(k,j) enddo -#if defined(_OPENMP) !$omp end simd -#endif enddo enddo enddo enddo enddo -#ifdef _OPENMP !$omp end do -#endif else -#if defined(_OPENMP) !$omp do private(i,j,k,it,jt,kt) do j=1,order do k=1,order !$omp simd do i=1,order -#else - do concurrent (j=1:order) - do concurrent (k=1:order) - do concurrent (i=1:order) -#endif C(i,j) = C(i,j) + A(i,k) * B(k,j) enddo -#if defined(_OPENMP) !$omp end simd -#endif enddo enddo -#ifdef _OPENMP !$omp end do -#endif endif end subroutine prk_dgemm program main use iso_fortran_env -#ifdef _OPENMP use omp_lib -#endif implicit none real(kind=REAL64) :: prk_get_wtime ! for argument parsing @@ -156,11 +132,7 @@ program main ! ******************************************************************** write(*,'(a25)') 'Parallel Research Kernels' -#ifdef _OPENMP write(*,'(a61)') 'Fortran OpenMP Dense matrix-matrix multiplication: C += A x B' -#else - write(*,'(a61)') 'Fortran Serial Dense matrix-matrix multiplication: C += A x B' -#endif if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() @@ -195,9 +167,7 @@ program main tile_size = order ! 
no tiling endif -#ifdef _OPENMP write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() -#endif write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8)') 'Matrix order = ', order write(*,'(a,i8)') 'Tile size = ', tile_size @@ -224,12 +194,10 @@ program main stop 1 endif -#ifdef _OPENMP !$omp parallel default(none) & !$omp& shared(A,B,C,t0,t1) & !$omp& firstprivate(order,iterations,tile_size) & !$omp& private(k) -#endif !$omp do private(i) do i=1, order @@ -243,30 +211,20 @@ program main do k=0,iterations if (k.eq.1) then -#ifdef _OPENMP !$omp barrier !$omp master -#endif t0 = prk_get_wtime() -#ifdef _OPENMP !$omp end master -#endif endif call prk_dgemm(order, tile_size, A, B, C) enddo -#ifdef _OPENMP !$omp barrier !$omp master -#endif t1 = prk_get_wtime() -#ifdef _OPENMP !$omp end master -#endif -#ifdef _OPENMP !$omp end parallel -#endif dgemm_time = t1 - t0 diff --git a/FORTRAN/dgemm-stdpar.F90 b/FORTRAN/dgemm-stdpar.F90 new file mode 100644 index 000000000..f55ad5b35 --- /dev/null +++ b/FORTRAN/dgemm-stdpar.F90 @@ -0,0 +1,238 @@ +! +! Copyright (c) 2017, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: dgemm +! +! PURPOSE: This program tests the efficiency with which a dense matrix +! dense multiplication is carried out +! +! USAGE: The program takes as input the matrix order and +! the number of times the matrix-matrix multiplication +! is carried out. +! +! <# iterations> +! +! The output consists of diagnostics to make sure the +! algorithm worked, and of timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to C++11 by Jeff Hammond, December, 2017. +! Converted to Fortran by Jeff Hammond, December, 2017. +! +! 
******************************************************************* + +function prk_get_wtime() result(t) + use iso_fortran_env + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +end function prk_get_wtime + +subroutine prk_dgemm(order, tile_size, A, B, C) + use iso_fortran_env + implicit none + integer(kind=INT32), intent(in) :: order, tile_size + real(kind=REAL64), intent(in) :: A(order,order) + real(kind=REAL64), intent(in) :: B(order,order) + real(kind=REAL64), intent(inout) :: C(order,order) + integer(kind=INT32) :: i,j,k,it,jt,kt + + if (tile_size.lt.order) then + do concurrent (jt=1:order:tile_size) + do concurrent (kt=1:order:tile_size) + do concurrent (it=1:order:tile_size) + do j=jt,min(order,jt+tile_size-1) + do k=kt,min(order,kt+tile_size-1) + do i=it,min(order,it+tile_size-1) + C(i,j) = C(i,j) + A(i,k) * B(k,j) + enddo + enddo + enddo + enddo + enddo + enddo + else + do concurrent (j=1:order) + do concurrent (k=1:order) + do concurrent (i=1:order) + C(i,j) = C(i,j) + A(i,k) * B(k,j) + enddo + enddo + enddo + endif +end subroutine prk_dgemm + +program main + use iso_fortran_env + implicit none + real(kind=REAL64) :: prk_get_wtime + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations ! number of times to do the kernel + integer(kind=INT32) :: order ! order of the matrix + integer(kind=INT32) :: tile_size + real(kind=REAL64) :: forder ! order as a double + real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold input matrix + real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold input matrix + real(kind=REAL64), allocatable :: C(:,:) ! buffer to hold output matrix + integer(kind=INT64) :: nflops + ! runtime variables + integer(kind=INT32) :: i,j,k + real(kind=REAL64) :: checksum, reference, residuum + real(kind=REAL64) :: t0, t1, dgemm_time, avgtime ! timing parameters + real(kind=REAL64), parameter :: epsilon=1.0d-8 ! error tolerance + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a61)') 'Fortran STDPAR Dense matrix-matrix multiplication: C += A x B' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a66)') 'Usage: ./dgemm-pretty <# iterations> []' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + stop 1 + endif + + tile_size = 32 + if (command_argument_count().gt.2) then + call get_command_argument(3,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') tile_size + endif + if ((tile_size.lt.1).or.(tile_size.gt.order)) then + write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size, & + ' must be >= 1 and <= ',order + tile_size = order ! no tiling + endif + + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8)') 'Matrix order = ', order + write(*,'(a,i8)') 'Tile size = ', tile_size + + ! 
******************************************************************** + ! ** Allocate space for the input and output matrices + ! ******************************************************************** + + allocate( A(order,order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of A returned ',err + stop 1 + endif + + allocate( B(order,order), stat=err ) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of B returned ',err + stop 1 + endif + + allocate( C(order,order), stat=err ) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of C returned ',err + stop 1 + endif + + do concurrent (i=1:order) + A(:,i) = real(i-1,REAL64) + B(:,i) = real(i-1,REAL64) + C(:,i) = real(0,REAL64) + enddo + + t0 = 0 + + do k=0,iterations + if (k.eq.1) then + t0 = prk_get_wtime() + endif + call prk_dgemm(order, tile_size, A, B, C) + enddo + + t1 = prk_get_wtime() + + dgemm_time = t1 - t0 + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + deallocate( A ) + deallocate( B ) + + forder = real(order,REAL64) + reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) + checksum = 0.0d0 + do j=1,order + do i=1,order + checksum = checksum + C(i,j) + enddo + enddo + + deallocate( C ) + + residuum = abs(checksum-reference)/reference + if (residuum .lt. epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = dgemm_time/iterations + nflops = 2 * int(order,INT64)**3 + write(*,'(a,f13.3,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,e30.15)') 'Reference checksum = ', reference + write(*,'(a,e30.15)') 'Actual checksum = ', checksum + stop 1 + endif + +end program main + diff --git a/FORTRAN/dgemm.F90 b/FORTRAN/dgemm.F90 index a30704357..ff5afe2ab 100644 --- a/FORTRAN/dgemm.F90 +++ b/FORTRAN/dgemm.F90 @@ -68,68 +68,50 @@ subroutine prk_dgemm(order, tile_size, A, B, C) real(kind=REAL64), intent(in) :: A(order,order) real(kind=REAL64), intent(in) :: B(order,order) real(kind=REAL64), intent(inout) :: C(order,order) + !real(kind=REAL64) :: TA(tile_size,tile_size) + real(kind=REAL64) :: TB(tile_size,tile_size) + !real(kind=REAL64) :: TTB(tile_size,tile_size) + !real(kind=REAL64) :: TC(tile_size,tile_size) integer(kind=INT32) :: i,j,k,it,jt,kt if (tile_size.lt.order) then -#if defined(_OPENMP) - !$omp do collapse(3) private(i,j,k,it,jt,kt) do jt=1,order,tile_size do kt=1,order,tile_size + TB(1:tile_size,1:tile_size) = B(kt:kt+tile_size,jt:jt+tile_size) + !TTB = transpose(TB) + !TTB = transpose(B(kt:kt+tile_size,jt:jt+tile_size)) do it=1,order,tile_size -#else - do concurrent (jt=1:order:tile_size) - do concurrent (kt=1:order:tile_size) - do concurrent (it=1:order:tile_size) -#endif + !TA(1:tile_size,1:tile_size) = A(it:it+tile_size,kt:kt+tile_size) + !!TC = C(it:it+tile_size,jt:jt+tile_size) do j=jt,min(order,jt+tile_size-1) do k=kt,min(order,kt+tile_size-1) -#if defined(_OPENMP) - !$omp simd -#endif do i=it,min(order,it+tile_size-1) - C(i,j) = C(i,j) + A(i,k) * B(k,j) + !C(i,j) = C(i,j) + A(i,k) * B(k,j) ! original + C(i,j) = C(i,j) + A(i,k) * TB(1+k-kt,1+j-jt) ! before TTB + !C(i,j) = C(i,j) + A(i,k) * TTB(1+j-jt,1+k-kt) ! after TT + !C(i,j) = C(i,j) + TA(1+i-it,1+k-kt) * TB(1+k-kt,1+j-jt) ! with TA + !!TC(1+i-it,1+j-jt) = TC(1+i-it,1+j-jt) + TA(1+i-it,1+k-kt) * TB(1+k-kt,1+j-jt) ! 
with TA and TB enddo -#if defined(_OPENMP) - !$omp end simd -#endif enddo enddo + !!C(it:it+tile_size,jt:jt+tile_size) = C(it:it+tile_size,jt:jt+tile_size) + TC + !!C(it:it+tile_size,jt:jt+tile_size) = TC enddo enddo enddo -#ifdef _OPENMP - !$omp end do -#endif else -#if defined(_OPENMP) - !$omp do private(i,j,k,it,jt,kt) do j=1,order do k=1,order - !$omp simd do i=1,order -#else - do concurrent (j=1:order) - do concurrent (k=1:order) - do concurrent (i=1:order) -#endif C(i,j) = C(i,j) + A(i,k) * B(k,j) enddo -#if defined(_OPENMP) - !$omp end simd -#endif enddo enddo -#ifdef _OPENMP - !$omp end do -#endif endif end subroutine prk_dgemm program main use iso_fortran_env -#ifdef _OPENMP - use omp_lib -#endif implicit none real(kind=REAL64) :: prk_get_wtime ! for argument parsing @@ -156,11 +138,7 @@ program main ! ******************************************************************** write(*,'(a25)') 'Parallel Research Kernels' -#ifdef _OPENMP - write(*,'(a61)') 'Fortran OpenMP Dense matrix-matrix multiplication: C += A x B' -#else write(*,'(a61)') 'Fortran Serial Dense matrix-matrix multiplication: C += A x B' -#endif if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() @@ -195,9 +173,6 @@ program main tile_size = order ! no tiling endif -#ifdef _OPENMP - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() -#endif write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8)') 'Matrix order = ', order write(*,'(a,i8)') 'Tile size = ', tile_size @@ -224,49 +199,22 @@ program main stop 1 endif -#ifdef _OPENMP - !$omp parallel default(none) & - !$omp& shared(A,B,C,t0,t1) & - !$omp& firstprivate(order,iterations,tile_size) & - !$omp& private(k) -#endif - - !$omp do private(i) do i=1, order A(:,i) = real(i-1,REAL64) B(:,i) = real(i-1,REAL64) C(:,i) = real(0,REAL64) enddo - !$omp end do t0 = 0 do k=0,iterations if (k.eq.1) then -#ifdef _OPENMP - !$omp barrier - !$omp master -#endif t0 = prk_get_wtime() -#ifdef _OPENMP - !$omp end master -#endif endif call prk_dgemm(order, tile_size, A, B, C) enddo -#ifdef _OPENMP - !$omp barrier - !$omp master -#endif t1 = prk_get_wtime() -#ifdef _OPENMP - !$omp end master -#endif - -#ifdef _OPENMP - !$omp end parallel -#endif dgemm_time = t1 - t0 @@ -280,13 +228,11 @@ program main forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 - !$omp parallel do reduction(+:checksum) do j=1,order do i=1,order checksum = checksum + C(i,j) enddo enddo - !$omp end parallel do deallocate( C ) From 5bbee4788af1473a0ec1f5c4a841dba4cf8e08b2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 7 May 2021 15:18:22 -0700 Subject: [PATCH 113/325] nvcc_wrapper sucks and is not necessary --- Cxx11/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ba1a31ff0..0663ee992 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -232,7 +232,7 @@ pic-sycl: pic-sycl.cc prk_util.h prk_sycl.h random_draw.c ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h prk_kokkos.h - ${KOKKOSDIR}/bin/nvcc_wrapper $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) -DPRK_KOKKOS_BACKEND=Cuda -o $@ + $(NVCC) -x cu $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) -DPRK_KOKKOS_BACKEND=Cuda -o $@ else %-kokkos: %-kokkos.cc prk_util.h prk_kokkos.h $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP) From effe09744ad86d56cdfd684ef8579313599c4fc3 Mon Sep 17 00:00:00 2001 From: Jeff 
Hammond Date: Fri, 7 May 2021 15:18:49 -0700 Subject: [PATCH 114/325] use fabs and fix initializer list issue with nvcc --- Cxx11/nstream-kokkos.cc | 3 ++- Cxx11/transpose-kokkos.cc | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index eed5aec8a..6ec3528de 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -163,7 +163,8 @@ int main(int argc, char * argv[]) double asum(0); Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) { - inner += prk::abs(A(i)); + using Kokkos::Experimental::fabs; + inner += fabs(A(i)); }, asum); Kokkos::fence(); diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc index 49adf66f8..e46d3e179 100644 --- a/Cxx11/transpose-kokkos.cc +++ b/Cxx11/transpose-kokkos.cc @@ -125,11 +125,11 @@ int main(int argc, char * argv[]) auto order2 = {order,order}; auto tile2 = {tile_size,tile_size}; - auto policy = Kokkos::MDRangePolicy>({0,0},order2,tile2); + const auto policy = Kokkos::MDRangePolicy>({0,0}, {order,order}, {tile_size,tile_size}); typedef Kokkos::Rank<2,Kokkos::Iterate::Right,Kokkos::Iterate::Left > rl; typedef Kokkos::Rank<2,Kokkos::Iterate::Left, Kokkos::Iterate::Right> lr; - auto policy_lr = Kokkos::MDRangePolicy({0,0},order2,tile2); - auto policy_rl = Kokkos::MDRangePolicy({0,0},order2,tile2); + const auto policy_lr = Kokkos::MDRangePolicy({0,0}, {order,order}, {tile_size,tile_size}); + const auto policy_rl = Kokkos::MDRangePolicy({0,0}, {order,order}, {tile_size,tile_size}); { Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i, int j) { @@ -170,13 +170,15 @@ int main(int argc, char * argv[]) Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(int i, int j, double & update) { size_t const ij = i*order+j; double const reference = static_cast(ij)*(1.+iterations)+addit; - update += prk::abs(B(j,i) - reference); + using Kokkos::Experimental::fabs; + update += fabs(B(j,i) - reference); }, abserr); #ifdef VERBOSE std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif + double epsilon(1.0e-8); if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; @@ -192,7 +194,6 @@ int main(int argc, char * argv[]) } Kokkos::finalize(); - return 0; } From 65511720b6dedfcc00da3e3d62000aaf84114158 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 7 May 2021 15:18:59 -0700 Subject: [PATCH 115/325] add Kokkos_MathematicalFunctions.hpp --- Cxx11/prk_kokkos.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Cxx11/prk_kokkos.h b/Cxx11/prk_kokkos.h index b41c3107a..d58be300b 100644 --- a/Cxx11/prk_kokkos.h +++ b/Cxx11/prk_kokkos.h @@ -35,5 +35,6 @@ #include #include #include +#include #endif /* PRK_KOKKOS_H */ From e038bf6198812416fed9562ea4b9faa58815526b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 7 May 2021 15:22:15 -0700 Subject: [PATCH 116/325] update for thrust, kokkos, etc --- common/make.defs.nvhpc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index a0f47f74e..0eb9330d7 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -49,17 +49,19 @@ BOOSTFLAG= #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -KOKKOSDIR= -KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} +KOKKOSDIR=../deps/kokkos-cuda +PRK_KOKKOS_BACKEND=Cuda +KOKKOSCXX=${KOKKOSDIR}/bin/nvcc_wrapper 
+KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkoscore RAJADIR= RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} -THRUSTDIR=../deps/thrust +THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/compilers/include-stdpar THRUSTFLAG=-I${THRUSTDIR} # # CBLAS for C++ DGEMM # -BLASFLAG= -CBLASFLAG= +BLASFLAG=-L${NVHPC_PATH}/REDIST/compilers/lib -lblas +CBLASFLAG=${BLASFLAG} # # CUDA flags # @@ -67,9 +69,9 @@ CBLASFLAG= # Use appropriate arch or code is compiled to ancient features. #NVCC=${NVHPC_CBIN}nvc++ NVCC=${NVHPC_CBIN}nvcc -CUDAFLAGS=-g -O3 -std=c++14 +CUDAFLAGS=-g -O3 -std=c++17 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_70 +CUDAFLAGS+=--gpu-architecture=sm_75 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' From c236e1fb6bb6f7f809cb7631a7699720358ac4e7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 7 May 2021 16:43:27 -0700 Subject: [PATCH 117/325] fix stencil kokkos, again by removing initializer lists --- Cxx11/stencil-kokkos.cc | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc index 9c67e8798..fe4954422 100644 --- a/Cxx11/stencil-kokkos.cc +++ b/Cxx11/stencil-kokkos.cc @@ -81,7 +81,7 @@ int main(int argc, char* argv[]) std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl; - Kokkos::initialize (argc, argv); + Kokkos::initialize(argc, argv); { ////////////////////////////////////////////////////////////////////// // Process and test input parameters @@ -94,7 +94,6 @@ int main(int argc, char* argv[]) throw "Usage: <# iterations> [ ]"; } - // number of times to run the algorithm iterations = std::atoi(argv[1]); if (iterations < 1) { throw "ERROR: iterations must be >= 1"; @@ -175,10 +174,7 @@ int main(int argc, char* argv[]) matrix in("in", n, n); matrix out("out", n, n); - auto z2 = {0,0}; - auto n2 = {n,n}; - auto tile2 = {tile_size,tile_size}; - auto full = Kokkos::MDRangePolicy>(z2,n2,tile2); + auto full = Kokkos::MDRangePolicy>({0,0},{n,n},{tile_size,tile_size}); { Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { @@ -210,12 +206,11 @@ int main(int argc, char* argv[]) size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); - double norm(0); - auto r2 = {radius,radius}; - auto nr2 = {n-radius,n-radius}; - auto inside = Kokkos::MDRangePolicy>(r2,nr2,tile2); + double norm{0}; + auto inside = Kokkos::MDRangePolicy>({radius,radius},{n-radius,n-radius},{tile_size,tile_size}); Kokkos::parallel_reduce(inside, KOKKOS_LAMBDA(int i, int j, double & norm) { - norm += prk::abs(out(i,j)); + using Kokkos::Experimental::fabs; + norm += fabs(out(i,j)); }, norm); Kokkos::fence(); norm /= active_points; From 3919131701e46fe3958c6d42d445d2a39061cac9 Mon Sep 17 00:00:00 2001 From: Mattson Date: Thu, 20 May 2021 16:12:25 -0700 Subject: [PATCH 118/325] fixed python issues plus added the new PyOMP interface --- PYTHON/dgemm-numpy.py | 37 +++++++---- PYTHON/dgemm.py | 53 ++++++++------- PYTHON/dgemm_list.py | 145 +++++++++++++++++++++++++++++++++++++++++ PYTHON/dgemm_omp.py | 147 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 346 insertions(+), 36 deletions(-) create mode 100755 PYTHON/dgemm_list.py create mode 100755 PYTHON/dgemm_omp.py diff --git a/PYTHON/dgemm-numpy.py 
b/PYTHON/dgemm-numpy.py index 49a9ac280..b569a52f0 100755 --- a/PYTHON/dgemm-numpy.py +++ b/PYTHON/dgemm-numpy.py @@ -48,6 +48,7 @@ # # HISTORY: Written by Rob Van der Wijngaart, February 2009. # Converted to Python by Jeff Hammond, February 2016. +# Fixed timing err, ave+std_dev, numpy.dot by Tim Mattson, May 2021 # ******************************************************************* import sys @@ -72,15 +73,15 @@ def main(): print('argument count = ', len(sys.argv)) sys.exit("Usage: ./dgemm <# iterations> ") - iterations = int(sys.argv[1]) - if iterations < 1: + iters = int(sys.argv[1]) + if iters < 1: sys.exit("ERROR: iterations must be >= 1") order = int(sys.argv[2]) if order < 1: sys.exit("ERROR: order must be >= 1") - print('Number of iterations = ', iterations) + print('Number of iterations = ', iters) print('Matrix order = ', order) # ******************************************************************** @@ -91,15 +92,24 @@ def main(): B = numpy.fromfunction(lambda i,j: j, (order,order), dtype=float) C = numpy.zeros((order,order)) - for k in range(0,iterations+1): + for kiter in range(0,iters+1): + if kiter==1: + t0 = timer() + tSum=0.0 + tsqSum=0.0 - if k<1: t0 = timer() + C += numpy.matmul(A,B) # requires Numpy 1.10 or later + #C += numpy.dot(A,B) - #C += numpy.matmul(A,B) # requires Numpy 1.10 or later - C += numpy.dot(A,B) + if kiter>0: + tkiter = timer() + t = tkiter - t0 + tSum = tSum + t + tsqSum = tsqSum+t*t + t0 = tkiter - t1 = timer() - dgemm_time = t1 - t0 + dgemmAve = tSum/iters + dgemmStdDev = ((tsqSum-iters*dgemmAve*dgemmAve)/(iters-1))**0.5 # ******************************************************************** # ** Analyze and output results. @@ -108,14 +118,16 @@ def main(): checksum = numpy.linalg.norm(numpy.reshape(C,order*order),ord=1) ref_checksum = 0.25*order*order*order*(order-1.0)*(order-1.0) - ref_checksum *= (iterations+1) + ref_checksum *= (iters+1) epsilon=1.e-8 if abs((checksum - ref_checksum)/ref_checksum) < epsilon: print('Solution validates') - avgtime = dgemm_time/iterations nflops = 2.0*order*order*order - print('Rate (MF/s): ',1.e-6*nflops/avgtime, ' Avg time (s): ', avgtime) + recipDiff = (1.0/(dgemmAve-dgemmStdDev) - 1.0/(dgemmAve+dgemmStdDev)) + GfStdDev = 1.e-6*nflops*recipDiff/2.0 + print('nflops: ',nflops) + print('Rate: ',1.e-6*nflops/dgemmAve,' +/- (MF/s): ',GfStdDev) else: print('ERROR: Checksum = ', checksum,', Reference checksum = ', ref_checksum,'\n') sys.exit("ERROR: solution did not validate") @@ -123,4 +135,3 @@ def main(): if __name__ == '__main__': main() - diff --git a/PYTHON/dgemm.py b/PYTHON/dgemm.py index 9830eeb51..909723356 100755 --- a/PYTHON/dgemm.py +++ b/PYTHON/dgemm.py @@ -48,8 +48,10 @@ # # HISTORY: Written by Rob Van der Wijngaart, February 2009. # Converted to Python by Jeff Hammond, February 2016. 
+# Fixed timing err, Ave+std_dev, more pythonic, Tim Mattson May 2021 # ******************************************************************* +import numpy as np import sys print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) if sys.version_info >= (3, 3): @@ -70,43 +72,47 @@ def main(): print('argument count = ', len(sys.argv)) sys.exit("Usage: ./dgemm <# iterations> ") - iterations = int(sys.argv[1]) - if iterations < 1: + iters = int(sys.argv[1]) + if iters < 1: sys.exit("ERROR: iterations must be >= 1") order = int(sys.argv[2]) if order < 1: sys.exit("ERROR: order must be >= 1") - print('Number of iterations = ', iterations) + print('Number of iterations = ', iters) print('Matrix order = ', order) # ******************************************************************** # ** Allocate space for the input and transpose matrix # ******************************************************************** - # 0.0 is a float, which is 64b (53b of precision) - A = [[0.0 for x in range(order)] for x in range(order)] - B = [[0.0 for x in range(order)] for x in range(order)] - C = [[0.0 for x in range(order)] for x in range(order)] - - # this is surely not the Pythonic way of doing this + A = np.zeros((order,order)) + B = np.zeros((order,order)) + C = np.zeros((order,order)) for i in range(order): - for j in range(order): - A[i][j] = float(j) - B[i][j] = float(j) + A[:,i] = float(i) + B[:,i] = float(i) - for k in range(0,iterations+1): - - if k<1: t0 = timer() + for kiter in range(0,iters+1): + if kiter==1: + t0 = timer() + tSum=0.0 + tsqSum=0.0 for i in range(order): - for j in range(order): - for k in range(order): + for k in range(order): + for j in range(order): C[i][j] += A[i][k] * B[k][j] + if kiter>0: + tkiter = timer() + t = tkiter - t0 + tSum = tSum + t + tsqSum = tsqSum+t*t + t0 = tkiter - t1 = timer() - dgemm_time = t1 - t0 + dgemmAve = tSum/iters + dgemmStdDev = ((tsqSum-iters*dgemmAve*dgemmAve)/(iters-1))**0.5 # ******************************************************************** # ** Analyze and output results. @@ -118,14 +124,16 @@ def main(): checksum += C[i][j]; ref_checksum = 0.25*order*order*order*(order-1.0)*(order-1.0) - ref_checksum *= (iterations+1) + ref_checksum *= (iters+1) epsilon=1.e-8 if abs((checksum - ref_checksum)/ref_checksum) < epsilon: print('Solution validates') - avgtime = dgemm_time/iterations nflops = 2.0*order*order*order - print('Rate (MF/s): ',1.e-6*nflops/avgtime, ' Avg time (s): ', avgtime) + recipDiff = (1.0/(dgemmAve-dgemmStdDev) - 1.0/(dgemmAve+dgemmStdDev)) + GfStdDev = 1.e-6*nflops*recipDiff/2.0 + print('nflops: ',nflops) + print('Rate: ',1.e-6*nflops/dgemmAve,' +/- (MF/s): ',GfStdDev) else: print('ERROR: Checksum = ', checksum,', Reference checksum = ', ref_checksum,'\n') sys.exit("ERROR: solution did not validate") @@ -133,4 +141,3 @@ def main(): if __name__ == '__main__': main() - diff --git a/PYTHON/dgemm_list.py b/PYTHON/dgemm_list.py new file mode 100755 index 000000000..9a6cd7590 --- /dev/null +++ b/PYTHON/dgemm_list.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2015, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: dgemm +# +# PURPOSE: This program tests the efficiency with which a dense matrix +# dense multiplication is carried out +# +# USAGE: The program takes as input the matrix order, +# the number of times the matrix-matrix multiplication +# is carried out. +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# Fixed timing error, Added Ave+std_dev by Tim Mattson, May 2021. 
+# ******************************************************************* + +import sys +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer + +def main(): + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python Dense matrix-matrix multiplication: C = A x B') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./dgemm <# iterations> ") + + iters = int(sys.argv[1]) + if iters < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + print('Number of iterations = ', iters) + print('Matrix order = ', order) + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + # 0.0 is a float, which is 64b (53b of precision) + A = [[0.0 for x in range(order)] for x in range(order)] + B = [[0.0 for x in range(order)] for x in range(order)] + C = [[0.0 for x in range(order)] for x in range(order)] + + for i in range(order): + for j in range(order): + A[i][j] = float(j) + B[i][j] = float(j) + + for kiter in range(0,iters+1): + if kiter==1: + t0 = timer() + tSum=0.0 + tsqSum=0.0 + for i in range(order): + for k in range(order): + for j in range(order): + C[i][j] += A[i][k] * B[k][j] + if kiter>0: + tkiter = timer() + t = tkiter - t0 + tSum = tSum + t + tsqSum = tsqSum+t*t + t0 = tkiter + + dgemmAve = tSum/iters + dgemmStdDev = ((tsqSum-iters*dgemmAve*dgemmAve)/(iters-1))**0.5 + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + checksum = 0.0; + for i in range(order): + for j in range(order): + checksum += C[i][j]; + + ref_checksum = 0.25*order*order*order*(order-1.0)*(order-1.0) + ref_checksum *= (iters+1) + + epsilon=1.e-8 + if abs((checksum - ref_checksum)/ref_checksum) < epsilon: + print('Solution validates') + nflops = 2.0*order*order*order + recipDiff = (1.0/(dgemmAve-dgemmStdDev) - 1.0/(dgemmAve+dgemmStdDev)) + GfStdDev = 1.e-6*nflops*recipDiff/2.0 + print('nflops: ',nflops) + print('Rate: ',1.e-6*nflops/dgemmAve,' +/- (MF/s): ',GfStdDev) + else: + print('ERROR: Checksum = ', checksum,', Reference checksum = ', ref_checksum,'\n') + sys.exit("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() + diff --git a/PYTHON/dgemm_omp.py b/PYTHON/dgemm_omp.py new file mode 100755 index 000000000..ccff8fb01 --- /dev/null +++ b/PYTHON/dgemm_omp.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2015, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: dgemm +# +# PURPOSE: This program tests the efficiency with which a dense matrix +# dense multiplication is carried out +# +# USAGE: The program takes as input the matrix order, +# the number of times the matrix-matrix multiplication +# is carried out. +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# PyOMP support, ave+std_dev by Tim Mattson, May 2021 +# ******************************************************************* + +import sys +from numba import njit +from numba.openmp import openmp_context as openmp +from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +import numpy as np +#from time import process_time as timer + +#@njit(enable_ssa=False, cache=True) What does "enable_ssa" mean? +@njit(fastmath=True) +def dgemm(iters,order): + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + print('inside dgemm') + A = np.zeros((order,order)) + B = np.zeros((order,order)) + C = np.zeros((order,order)) + + for i in range(order): + A[:,i] = float(i) + B[:,i] = float(i) + +# print(omp_get_num_threads()) + for kiter in range(0,iters+1): + if kiter==1: + t0 = omp_get_wtime() + tSum=0.0 + tsqSum=0.0 + with openmp("parallel for schedule(static) private(j,k)"): + for i in range(order): + for k in range(order): + for j in range(order): + C[i][j] += A[i][k] * B[k][j] + if kiter>0: + tkiter = omp_get_wtime() + t = tkiter - t0 + tSum = tSum + t + tsqSum = tsqSum+t*t + t0 = tkiter + + dgemmAve = tSum/iters + dgemmStdDev = ((tsqSum-iters*dgemmAve*dgemmAve)/(iters-1))**0.5 + print('finished with computations') + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + checksum = 0.0; + for i in range(order): + for j in range(order): + checksum += C[i][j]; + + ref_checksum = order*order*order + ref_checksum *= 0.25*(order-1.0)*(order-1.0) + ref_checksum *= (iters+1) + epsilon=1.e-8 + if abs((checksum - ref_checksum)/ref_checksum) < epsilon: + print('Solution validates') + nflops = 2.0*order*order*order + recipDiff = (1.0/(dgemmAve-dgemmStdDev) - 1.0/(dgemmAve+dgemmStdDev)) + GfStdDev = 1.e-6*nflops*recipDiff/2.0 + print('nflops: ',nflops) + print('Rate: ',1.e-6*nflops/dgemmAve,' +/- (MF/s): ',GfStdDev) + else: + print('ERROR: Checksum = ', checksum,', Reference checksum = ', ref_checksum,'\n') +# sys.exit("ERROR: solution did not validate") + + +# ******************************************************************** +# read and test input parameters +# ******************************************************************** + +print('Parallel Research Kernels version ') #, PRKVERSION +print('Python Dense matrix-matrix multiplication: C = A x B') + +if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./dgemm <# iterations> ") + +itersIn = int(sys.argv[1]) +if itersIn < 1: + sys.exit("ERROR: iterations must be >= 1") + +orderIn = int(sys.argv[2]) +if orderIn < 1: + sys.exit("ERROR: order must be >= 1") + +print('Number of iterations = ', itersIn) +print('Matrix order = ', orderIn) + +dgemm(itersIn, orderIn) + From 8ab591d133a052a7f0f9e1d33b17c86f53662cf5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 21 May 2021 17:05:23 -0700 Subject: [PATCH 119/325] add CuPy nstream --- PYTHON/nstream-cupy.py | 159 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100755 PYTHON/nstream-cupy.py diff --git a/PYTHON/nstream-cupy.py b/PYTHON/nstream-cupy.py new file mode 100755 index 000000000..cca8f6f1d --- /dev/null +++ b/PYTHON/nstream-cupy.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2017, Intel Corporation +# Copyright (c) 2021, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: nstream +# +# PURPOSE: To compute memory bandwidth when adding a vector of a given +# number of double precision values to the scalar multiple of +# another vector of the same length, and storing the result in +# a third vector. +# +# USAGE: The program takes as input the number +# of iterations to loop over the triad vectors, the length of the +# vectors, and the offset between vectors +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# NOTES: Bandwidth is determined as the number of words read, plus the +# number of words written, times the size of the words, divided +# by the execution time. For a vector length of N, the total +# number of words read and written is 4*N*sizeof(double). +# +# +# HISTORY: This code is loosely based on the Stream benchmark by John +# McCalpin, but does not follow all the Stream rules. Hence, +# reported results should not be associated with Stream in +# external publications +# +# Converted to Python by Jeff Hammond, October 2017. +# +# ******************************************************************* + +import sys +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +#import numpy +#print('Numpy version = ', numpy.version.version) + +import cupy + +def main(): + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python CuPy STREAM triad: A = B + scalar * C') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: python nstream.py <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + length = int(sys.argv[2]) + if length < 1: + sys.exit("ERROR: length must be positive") + + #offset = int(sys.argv[3]) + #if offset < 0: + # sys.exit("ERROR: offset must be nonnegative") + + print('Number of iterations = ', iterations) + print('Vector length = ', length) + #print('Offset = ', offset) + + # ******************************************************************** + # ** Allocate space for the input and execute STREAM triad + # ******************************************************************** + + # 0.0 is a float, which is 64b (53b of precision) + A = cupy.zeros(length,dtype='float64') + B = cupy.full(length,2.0,dtype='float64') + C = cupy.full(length,2.0,dtype='float64') + + scalar = 3.0 + + for k in range(0,iterations+1): + + if k<1: t0 = timer() + + A += B + scalar * C + + cupy.cuda.runtime.deviceSynchronize() + + t1 = timer() + nstream_time = t1 - t0 + + # 
******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + ar = 0.0 + br = 2.0 + cr = 2.0 + ref = 0.0 + for k in range(0,iterations+1): + ar += br + scalar * cr + + ar *= length + + asum = cupy.linalg.norm(A, ord=1) + + epsilon=1.e-8 + if abs(ar-asum)/asum > epsilon: + print('Failed Validation on output array'); + print(' Expected checksum: ',ar); + print(' Observed checksum: ',asum); + sys.exit("ERROR: solution did not validate") + else: + print('Solution validates') + avgtime = nstream_time/iterations + nbytes = 4.0 * length * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + + +if __name__ == '__main__': + main() + From d880151cff53679ae03f97b1b59fdc7a03ca99f6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 21 May 2021 17:11:00 -0700 Subject: [PATCH 120/325] add PRK transpose --- PYTHON/transpose-cupy.py | 130 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100755 PYTHON/transpose-cupy.py diff --git a/PYTHON/transpose-cupy.py b/PYTHON/transpose-cupy.py new file mode 100755 index 000000000..07c48a1d0 --- /dev/null +++ b/PYTHON/transpose-cupy.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2015, Intel Corporation +# Copyright (c) 2021, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. 
+# ******************************************************************* + +import sys +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +import numpy +print('Numpy version = ', numpy.version.version) + +import cupy + +def main(): + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python CuPy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./transpose <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + T = numpy.fromfunction(lambda i,j: i*order+j, (order,order), dtype=float) + A = cupy.array(T) + B = cupy.zeros((order,order)) + + for k in range(0,iterations+1): + + if k<1: t0 = timer() + + # this actually forms the transpose of A + # B += numpy.transpose(A) + # this only uses the transpose _view_ of A + B += A.T + A += 1.0 + + t1 = timer() + trans_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + T = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + A = cupy.array(T) + abserr = cupy.linalg.norm(cupy.reshape(B-A,order*order),ord=1) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + if abserr < epsilon: + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + print('error ',abserr, ' exceeds threshold ',epsilon) + sys.exit("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() + From ace38b7cbe7f3742393a47318a38a4352fe9d460 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 21 May 2021 17:13:06 -0700 Subject: [PATCH 121/325] add CuPy stencil --- PYTHON/stencil-cupy.py | 199 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100755 PYTHON/stencil-cupy.py diff --git a/PYTHON/stencil-cupy.py b/PYTHON/stencil-cupy.py new file mode 100755 index 000000000..e36bc0ee3 --- /dev/null +++ b/PYTHON/stencil-cupy.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2015, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Stencil +# +# PURPOSE: This program tests the efficiency with which a space-invariant, +# linear, symmetric filter (stencil) can be applied to a square +# grid or image. +# +# USAGE: The program takes as input the linear +# dimension of the grid, and the number of iterations on the grid +# +# +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Rob Van der Wijngaart, February 2009. +# - RvdW: Removed unrolling pragmas for clarity; +# added constant to array "in" at end of each iteration to force +# refreshing of neighbor data in parallel versions; August 2013 +# - Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +import sys +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +import numpy +print('Numpy version = ', numpy.version.version) + +import cupy + +def main(): + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + print('Parallel Research Kernels') + print('Python CuPy stencil execution on 2D grid') + + if len(sys.argv) < 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./stencil <# iterations> [ ]") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + n = int(sys.argv[2]) + if n < 1: + sys.exit("ERROR: array dimension must be >= 1") + + if len(sys.argv) > 3: + pattern = sys.argv[3] + else: + pattern = 'star' + + if len(sys.argv) > 4: + r = int(sys.argv[4]) + if r < 1: + sys.exit("ERROR: Stencil radius should be positive") + if (2*r+1) > n: + sys.exit("ERROR: Stencil radius exceeds grid size") + else: + r = 2 + + print('Number of iterations = ', iterations) + print('Grid size = ', n) + print('Radius of stencil = ', r) + if pattern == 'star': + print('Type of stencil = ','star') + else: + print('Type of stencil = ','stencil') + print('Data type = double precision') + print('Compact representation of stencil loop body') + + # there is certainly a more Pythonic way to initialize W, + # but it will have no impact on performance. 
+ W = cupy.zeros(((2*r+1),(2*r+1))) + if pattern == 'star': + stencil_size = 4*r+1 + for i in range(1,r+1): + W[r,r+i] = +1./(2*i*r) + W[r+i,r] = +1./(2*i*r) + W[r,r-i] = -1./(2*i*r) + W[r-i,r] = -1./(2*i*r) + + else: + stencil_size = (2*r+1)**2 + for j in range(1,r+1): + for i in range(-j+1,j): + W[r+i,r+j] = +1./(4*j*(2*j-1)*r) + W[r+i,r-j] = -1./(4*j*(2*j-1)*r) + W[r+j,r+i] = +1./(4*j*(2*j-1)*r) + W[r-j,r+i] = -1./(4*j*(2*j-1)*r) + + W[r+j,r+j] = +1./(4*j*r) + W[r-j,r-j] = -1./(4*j*r) + + T = numpy.fromfunction(lambda i,j: i+j, (n,n), dtype=float) + A = cupy.array(T) + B = cupy.zeros((n,n)) + + for k in range(iterations+1): + # start timer after a warmup iteration + if k<1: t0 = timer() + + if pattern == 'star': + if r==2: + B[2:n-2,2:n-2] += W[2,2] * A[2:n-2,2:n-2] \ + + W[2,0] * A[2:n-2,0:n-4] \ + + W[2,1] * A[2:n-2,1:n-3] \ + + W[2,3] * A[2:n-2,3:n-1] \ + + W[2,4] * A[2:n-2,4:n-0] \ + + W[0,2] * A[0:n-4,2:n-2] \ + + W[1,2] * A[1:n-3,2:n-2] \ + + W[3,2] * A[3:n-1,2:n-2] \ + + W[4,2] * A[4:n-0,2:n-2] + else: + b = n-r + B[r:b,r:b] += W[r,r] * A[r:b,r:b] + for s in range(1,r+1): + B[r:b,r:b] += W[r,r-s] * A[r:b,r-s:b-s] \ + + W[r,r+s] * A[r:b,r+s:b+s] \ + + W[r-s,r] * A[r-s:b-s,r:b] \ + + W[r+s,r] * A[r+s:b+s,r:b] + else: # stencil + if r>0: + b = n-r + for s in range(-r, r+1): + for t in range(-r, r+1): + B[r:b,r:b] += W[r+t,r+s] * A[r+t:b+t,r+s:b+s] + + A += 1.0 + + t1 = timer() + stencil_time = t1 - t0 + + #****************************************************************************** + #* Analyze and output results. + #****************************************************************************** + + norm = cupy.linalg.norm(cupy.reshape(B,n*n),ord=1) + active_points = (n-2*r)**2 + norm /= active_points + + epsilon=1.e-8 + + # verify correctness + reference_norm = 2*(iterations+1) + if abs(norm-reference_norm) < epsilon: + print('Solution validates') + flops = (2*stencil_size+1) * active_points + avgtime = stencil_time/iterations + print('Rate (MFlops/s): ',1.e-6*flops/avgtime, ' Avg time (s): ',avgtime) + else: + print('ERROR: L1 norm = ', norm,' Reference L1 norm = ', reference_norm) + sys.exit() + + +if __name__ == '__main__': + main() From de57160da24876698714bbd14116b83182a9eaf3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 21 May 2021 17:14:59 -0700 Subject: [PATCH 122/325] add necesary device sync --- PYTHON/stencil-cupy.py | 2 ++ PYTHON/transpose-cupy.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/PYTHON/stencil-cupy.py b/PYTHON/stencil-cupy.py index e36bc0ee3..73821be06 100755 --- a/PYTHON/stencil-cupy.py +++ b/PYTHON/stencil-cupy.py @@ -170,6 +170,8 @@ def main(): A += 1.0 + cupy.cuda.runtime.deviceSynchronize() + t1 = timer() stencil_time = t1 - t0 diff --git a/PYTHON/transpose-cupy.py b/PYTHON/transpose-cupy.py index 07c48a1d0..01476ca9f 100755 --- a/PYTHON/transpose-cupy.py +++ b/PYTHON/transpose-cupy.py @@ -103,6 +103,8 @@ def main(): B += A.T A += 1.0 + cupy.cuda.runtime.deviceSynchronize() + t1 = timer() trans_time = t1 - t0 From e33cedc78b3aaf985de6b607ee51d1172d0034f4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 24 May 2021 18:26:17 -0700 Subject: [PATCH 123/325] avoid fromfunction since cupy does not support --- PYTHON/transpose-cupy.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/PYTHON/transpose-cupy.py b/PYTHON/transpose-cupy.py index 01476ca9f..1d354e540 100755 --- a/PYTHON/transpose-cupy.py +++ b/PYTHON/transpose-cupy.py @@ -89,8 +89,7 @@ def main(): # ** Allocate space for the input and transpose matrix # 
******************************************************************** - T = numpy.fromfunction(lambda i,j: i*order+j, (order,order), dtype=float) - A = cupy.array(T) + A = cupy.arange(order*order,dtype=float).reshape(order,order) B = cupy.zeros((order,order)) for k in range(0,iterations+1): @@ -112,8 +111,7 @@ def main(): # ** Analyze and output results. # ******************************************************************** - T = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) - A = cupy.array(T) + A = (iterations+1.)*(cupy.arange(order*order).reshape(order,order).T+iterations/2.0) abserr = cupy.linalg.norm(cupy.reshape(B-A,order*order),ord=1) epsilon=1.e-8 From 80a1d0abb39a7fe6efa44a999bd9103e4bb24558 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 24 May 2021 18:27:18 -0700 Subject: [PATCH 124/325] remove numpy references that are not needed --- PYTHON/transpose-cupy.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/PYTHON/transpose-cupy.py b/PYTHON/transpose-cupy.py index 1d354e540..bacb38ccb 100755 --- a/PYTHON/transpose-cupy.py +++ b/PYTHON/transpose-cupy.py @@ -56,8 +56,6 @@ from time import process_time as timer else: from timeit import default_timer as timer -import numpy -print('Numpy version = ', numpy.version.version) import cupy @@ -96,9 +94,6 @@ def main(): if k<1: t0 = timer() - # this actually forms the transpose of A - # B += numpy.transpose(A) - # this only uses the transpose _view_ of A B += A.T A += 1.0 From b113c336dc74af842c050c2c920d807a8f2f653e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 28 Jun 2021 09:05:48 -0700 Subject: [PATCH 125/325] add GCC ranges --- Cxx11/prk_ranges.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h index d5fe848b5..eab958366 100644 --- a/Cxx11/prk_ranges.h +++ b/Cxx11/prk_ranges.h @@ -1,5 +1,6 @@ /// /// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -32,32 +33,37 @@ #ifndef PRK_RANGES_H #define PRK_RANGES_H -#if defined(USE_RANGES) -# if defined(USE_BOOST_IRANGE) -# include "boost/range/irange.hpp" -# elif defined(USE_RANGES_TS) -# include "range/v3/view/iota.hpp" -# include "range/v3/view/slice.hpp" -# include "range/v3/view/stride.hpp" -# else -# error You have not provided a version of ranges to use. -# endif +#if defined(USE_GCC_RANGES) +# include +#elif defined(USE_BOOST_IRANGE) +# include "boost/range/irange.hpp" +#elif defined(USE_RANGES_TS) +# include "range/v3/view/iota.hpp" +# include "range/v3/view/slice.hpp" +# include "range/v3/view/stride.hpp" +#else +# error You have not provided a version of ranges to use. 
#endif namespace prk { template auto range(S start, E end) { -#if defined(USE_BOOST_IRANGE) +#if defined(USE_GCC_RANGES) + return std::ranges::views::iota(static_cast(start), end); +#elif defined(USE_BOOST_IRANGE) return boost::irange(static_cast(start), end); #elif defined(USE_RANGES_TS) return ranges::views::iota(static_cast(start), end); #endif } +#if UNUSED template auto range(S start, E end, B blocking) { -#if defined(USE_BOOST_IRANGE) +#if defined(USE_GCC_RANGES) +#error FIXME +#elif defined(USE_BOOST_IRANGE) return boost::irange(static_cast(start), end, static_cast(blocking) ); #elif defined(USE_RANGES_TS) // NOTE: @@ -68,6 +74,7 @@ namespace prk { ranges::views::stride(static_cast(blocking)); #endif } +#endif } // namespace prk From 53ef729b2c894985449f8c77f396c1c56132bee6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 10 Aug 2021 03:01:57 -0700 Subject: [PATCH 126/325] fix MPI --- C1z/Makefile | 2 +- Cxx11/Makefile | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index 5e7b0ac81..f8927c191 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -106,7 +106,7 @@ p2p-2d: p2p-2d.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ %-mpi: %-mpi.c prk_util.h - $(CC) $(CFLAGS) $(MPIINC) $< $(MPILIB) $(EXTRA_CLIBS) -o $@ + $(MPICC) $(CFLAGS) $(MPIINC) $< $(MPILIB) $(EXTRA_CLIBS) -o $@ %-petsc: %-petsc.c prk_util.h $(CC) $(CFLAGS) $(MPIINC) $< $(PETSCFLAG) $(MPILIB) $(EXTRA_CLIBS) -o $@ diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 0663ee992..6f652c6f9 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -155,10 +155,7 @@ p2p-hyperplane-vector: p2p-hyperplane-openmp.cc prk_util.h # $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ %-mpi: %-mpi.cc prk_util.h prk_mpi.h - $(CXX) $(CXXFLAGS) $(MPIINC) $< $(MPILIB) -o $@ - -%-mpi: %-mpi.cc prk_util.h prk_mpi.h - $(CXX) $(CXXFLAGS) $(MPIINC) $< $(MPILIB) -o $@ + $(MPICXX) $(CXXFLAGS) $(MPIINC) $< $(MPILIB) -o $@ %-opencl: %-opencl.cc prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ From bc7d5a250f5c6516e326e7a0e83fe9248cd8d1dd Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Tue, 31 Aug 2021 12:33:39 -0500 Subject: [PATCH 127/325] Add option for sgemm-cublas to use TF32 --- Cxx11/sgemm-cublas.cu | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Cxx11/sgemm-cublas.cu b/Cxx11/sgemm-cublas.cu index becb2ab91..f96c3953d 100644 --- a/Cxx11/sgemm-cublas.cu +++ b/Cxx11/sgemm-cublas.cu @@ -186,9 +186,10 @@ int main(int argc, char * argv[]) int order; int batches = 0; int input_copy = 0; + int tf32 = 0; try { if (argc < 2) { - throw "Usage: <# iterations> [] []"; + throw "Usage: <# iterations> [] [] []"; } iterations = std::atoi(argv[1]); @@ -213,6 +214,13 @@ int main(int argc, char * argv[]) throw "ERROR: input_copy was not 0 or 1"; } } + + if (argc > 5) { + tf32 = std::atoi(argv[5]); + if (tf32 != 0 && tf32 != 1) { + throw "ERROR: tf32 was not 0 or 1"; + } + } } catch (const char * e) { std::cout << e << std::endl; @@ -229,10 +237,15 @@ int main(int argc, char * argv[]) std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; } std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl; + std::cout << "TF32 = " << (tf32 ? 
"yes" : "no") << std::endl; cublasHandle_t h; prk::CUDA::check( cublasCreate(&h) ); + if (tf32) { + cublasSetMathMode(h, CUBLAS_TF32_TENSOR_OP_MATH); + } + const int tile_size = 32; dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); dim3 dimBlock(tile_size, tile_size, 1); @@ -336,7 +349,12 @@ int main(int argc, char * argv[]) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - const auto epsilon = 1.0e-8; + double epsilon; + if(tf32) { + epsilon = 1.0e-4; + } else { + epsilon = 1.0e-8; + } const auto forder = static_cast(order); const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); double residuum(0); From 4d7c9b1ce669aa9c7d72a247645392f2aae194bc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 1 Sep 2021 16:18:11 +0300 Subject: [PATCH 128/325] add DGEMM CBLAS --- Cxx11/dgemm-mpi-cblas.cc | 372 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 Cxx11/dgemm-mpi-cblas.cc diff --git a/Cxx11/dgemm-mpi-cblas.cc b/Cxx11/dgemm-mpi-cblas.cc new file mode 100644 index 000000000..11afc9de9 --- /dev/null +++ b/Cxx11/dgemm-mpi-cblas.cc @@ -0,0 +1,372 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: dgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblas_dgemm() +/// cblas_dgemm_batch() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_mpi.h" + +#if defined(MKL) +#include +#ifdef MKL_ILP64 +#error Use the MKL library for 32-bit integers! +#endif +#elif defined(ACCELERATE) +// The location of cblas.h is not in the system include path when -framework Accelerate is provided. +#include +#else +#include +#endif + +#ifdef _OPENMP +#include +#endif + +#ifdef PRK_DEBUG +#include +void prk_dgemm_loops(const int order, + const std::vector & A, + const std::vector & B, + std::vector & C) +{ + for (int i=0; i & A, + const std::vector & B, + std::vector & C) +{ + const int n = order; + const double alpha = 1.0; + const double beta = 1.0; + + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, alpha, A.data(), n, B.data(), n, beta, C.data(), n); +} + +void prk_dgemm(const int order, const int batches, + const std::vector> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const double alpha = 1.0; + const double beta = 1.0; + + for (int b=0; b> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const double alpha = 1.0; + const double beta = 1.0; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic) num_threads(nt) +#endif + for (int b=0; b [ ]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + batches = std::atoi(argv[3]); + } + + if (argc>4) { + batch_threads = std::atoi(argv[4]); + } else { +#ifdef _OPENMP + batch_threads = omp_get_max_threads(); +#endif + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + if (me == 0) { + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches > 0) { +#ifdef MKL + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; +#else + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; +#endif + } else if (batches < 0) { + if (batch_threads > 1) { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " << batch_threads << " threads)" << std::endl; + } else { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; + } + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double dgemm_time{0}; + + const int matrices = (batches==0 ? 
1 : abs(batches)); + + std::vector const M(order*order,0); + std::vector> A(matrices,M); + std::vector> B(matrices,M); + std::vector> C(matrices,M); + for (int b=0; b 0) { + prk_dgemm(order, matrices, pA, pB, pC); + } + } + prk::MPI::barrier(); + dgemm_time = prk::wtime() - dgemm_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double residuum{0}; + for (int b=0; b= epsilon) +#endif + { + for (int r=0; r Date: Wed, 1 Sep 2021 16:18:14 +0300 Subject: [PATCH 129/325] add DGEMM CBLAS --- Cxx11/Makefile | 5 ++++- Cxx11/dgemm-mpi-cublas.cu | 6 +++--- common/make.defs.gcc | 16 ++++++++-------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 6f652c6f9..bc3ccec5f 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -88,7 +88,7 @@ target: stencil-openmp-target transpose-openmp-target nstream-openmp-target taskloop: stencil-taskloop transpose-taskloop nstream-taskloop -mpi: nstream-mpi stencil-mpi +mpi: nstream-mpi stencil-mpi dgemm-mpi-cblas opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl @@ -157,6 +157,9 @@ p2p-hyperplane-vector: p2p-hyperplane-openmp.cc prk_util.h %-mpi: %-mpi.cc prk_util.h prk_mpi.h $(MPICXX) $(CXXFLAGS) $(MPIINC) $< $(MPILIB) -o $@ +%-mpi-cblas: %-mpi-cblas.cc prk_util.h prk_mpi.h + $(CXX) $(CXXFLAGS) $(MPIINC) $< $(MPILIB) $(CBLASFLAGS) -o $@ + %-opencl: %-opencl.cc prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ diff --git a/Cxx11/dgemm-mpi-cublas.cu b/Cxx11/dgemm-mpi-cublas.cu index 93cb7456a..9eb3aaa82 100644 --- a/Cxx11/dgemm-mpi-cublas.cu +++ b/Cxx11/dgemm-mpi-cublas.cu @@ -52,7 +52,7 @@ /// Other than OpenMP or standard C functions, the following /// functions are used in this program: /// -/// cblasDgemm() +/// cublasDgemm() /// /// HISTORY: Written by Rob Van der Wijngaart, February 2009. /// Converted to C++11 by Jeff Hammond, December, 2017. @@ -157,7 +157,7 @@ int main(int argc, char * argv[]) // Allocate space for matrices ////////////////////////////////////////////////////////////////////// - double dgemm_time(0); + double dgemm_time{0}; const size_t nelems = (size_t)order * (size_t)order; const size_t bytes = nelems * sizeof(double); @@ -219,7 +219,7 @@ int main(int argc, char * argv[]) const double epsilon = 1.0e-8; const double forder = static_cast(order); const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); - double residuum(0); + double residuum{0}; const auto checksum = prk::reduce( &(h_c[0]), &(h_c[nelems]), 0.0); residuum += std::abs(checksum-reference)/reference; diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 36de132e8..9e088b0c2 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-10 +VERSION=-11 # C99 is required in some implementations. 
CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -30,7 +30,6 @@ DEFAULT_OPT_FLAGS+=-g3 # #DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed DEFAULT_OPT_FLAGS+=-Wall #-Werror -DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations # silence warnings in third-party headers #DEFAULT_OPT_FLAGS+=-mavx -mfma # these should be used on Haswell and later # # OpenMP flags @@ -162,10 +161,10 @@ UPCXXFLAG+=-mtune=native -ffast-math # #BLASFLAG=-L${HOME}/BLIS/lib -lblis #-fopenmp -lpthread #CBLASFLAG=-I${HOME}/BLIS/include -#BLASFLAG=-DACCELERATE -framework Accelerate -#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions -BLASFLAG=-lblas -CBLASFLAG=-lblas +BLASFLAG=-DACCELERATE -framework Accelerate +CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +#BLASFLAG=-lblas +#`CBLASFLAG=-lblas # # CUDA flags # @@ -195,12 +194,13 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.0 +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.1_2 MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include -MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi +MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi +#MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi #MPIINC=-I/usr/include/mpich-3.2-x86_64 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi From 3ae6ce72cbefdc51ade50b25c962b5269eb28118 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 1 Sep 2021 16:37:30 +0300 Subject: [PATCH 130/325] make suggested changes --- Cxx11/sgemm-cublas.cu | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Cxx11/sgemm-cublas.cu b/Cxx11/sgemm-cublas.cu index f96c3953d..8263b44f1 100644 --- a/Cxx11/sgemm-cublas.cu +++ b/Cxx11/sgemm-cublas.cu @@ -185,8 +185,8 @@ int main(int argc, char * argv[]) int iterations; int order; int batches = 0; - int input_copy = 0; - int tf32 = 0; + bool input_copy{false}; + bool tf32{false}; try { if (argc < 2) { throw "Usage: <# iterations> [] [] []"; @@ -209,17 +209,11 @@ int main(int argc, char * argv[]) } if (argc > 4) { - input_copy = std::atoi(argv[4]); - if (input_copy != 0 && input_copy != 1) { - throw "ERROR: input_copy was not 0 or 1"; - } + input_copy = prk::parse_boolean(std::string(argv[4])); } if (argc > 5) { - tf32 = std::atoi(argv[5]); - if (tf32 != 0 && tf32 != 1) { - throw "ERROR: tf32 was not 0 or 1"; - } + tf32 = prk::parse_boolean(std::string(argv[5])); } } catch (const char * e) { From b660e916a68ebb3028cd769f1054e0e36dea076a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 2 Sep 2021 13:10:54 +0300 Subject: [PATCH 131/325] ignore FORTRAN/transpose-cufortran --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0716e97b6..74164a40e 100644 --- a/.gitignore +++ b/.gitignore @@ -302,6 +302,7 @@ FORTRAN/stencil-stdpar FORTRAN/stencil-taskloop-openmp FORTRAN/transpose FORTRAN/transpose-coarray +FORTRAN/transpose-cufortran FORTRAN/transpose-ga FORTRAN/transpose-openmp FORTRAN/transpose-openmp-target From 582f243b7165fb05274dbb6fb88a278224ec7920 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 10 Sep 2021 15:46:50 -0500 Subject: [PATCH 132/325] Fix namespace and use structural binding --- Cxx11/nstream-onedpl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cxx11/nstream-onedpl.cc b/Cxx11/nstream-onedpl.cc index 8c07b7937..963683945 100644 --- a/Cxx11/nstream-onedpl.cc +++ b/Cxx11/nstream-onedpl.cc @@ -136,10 +136,10 @@ 
int main(int argc, char *argv[]) { nstream_time = prk::wtime(); auto begin = dpl::make_zip_iterator(d_A, d_B, d_C); - std::transform(dpl::execution::make_device_policy(q), begin, + dpl::transform(dpl::execution::make_device_policy(q), begin, begin + length, d_A, [=](const auto &t) { - using std::get; - return get<0>(t) + get<1>(t) + scalar * get<2>(t); + auto [a, b, c] = t; + return a + b + scalar * c; }); } nstream_time = prk::wtime() - nstream_time; From c3da7f66e7c768e4c5d40d1cecec5e03e3ca33b6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 5 Nov 2021 04:10:57 -0700 Subject: [PATCH 133/325] add managed check --- Cxx11/nstream-managed-hip.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Cxx11/nstream-managed-hip.cc b/Cxx11/nstream-managed-hip.cc index 3cde5e765..725b94708 100644 --- a/Cxx11/nstream-managed-hip.cc +++ b/Cxx11/nstream-managed-hip.cc @@ -153,6 +153,11 @@ int main(int argc, char * argv[]) B = new double[length]; C = new double[length]; } else { + + int managed_memory = 0; + prk::HIP::check( hipDeviceGetAttribute(&managed_memory, hipDeviceAttributeManagedMemory, 0) ); + std::cout << "hipDeviceGetAttribute(..hipDeviceAttributeManagedMemory..) => " << managed_memory << std::endl; + prk::HIP::check( hipMallocManaged((void**)&A, bytes) ); prk::HIP::check( hipMallocManaged((void**)&B, bytes) ); prk::HIP::check( hipMallocManaged((void**)&C, bytes) ); From a54842d74596fdd6d56d996487502606b23d37a0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 5 Nov 2021 04:14:56 -0700 Subject: [PATCH 134/325] small --- common/make.defs.hip | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/make.defs.hip b/common/make.defs.hip index 754361604..46b9977b5 100644 --- a/common/make.defs.hip +++ b/common/make.defs.hip @@ -10,7 +10,7 @@ VERSION= CC=${ROCM_PATH}/llvm/bin/clang -std=gnu11 -pthread #EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. -FC=${ROCM_PATH}/llvm/bin/flang #-std=f2008 -cpp +FC=${ROCM_PATH}/llvm/bin/flang -DAOMP #-std=f2008 -cpp # C++11 may not be required but does no harm here. 
CXX=${ROCM_PATH}/llvm/bin/clang++ -std=gnu++17 -pthread # @@ -38,14 +38,14 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL # # hipSYCL # -SYCLDIR=/opt/hipSYCL +SYCLDIR=${HOME}/AMD/hipSYCL SYCLCXX=${SYCLDIR}/bin/syclcc-clang SYCLFLAG=-std=c++17 -O3 SYCLFLAG+=-DHIPSYCL # CPU platform SYCLFLAG+=--hipsycl-platform=rocm -SYCLFLAG+=--hipsycl-gpu-arch=gfx900 -#SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib +SYCLFLAG+=--hipsycl-gpu-arch=gfx908 +SYCLFLAG+=-Wl,-rpath=/opt/rocm/llvm/lib # #CELERITYDIR=${SYCLDIR} #CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor From 0fe498328507dd8bf1ca4758c9103b173f28080a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 15 Dec 2021 12:55:14 +0200 Subject: [PATCH 135/325] F18 isn't PGI Flang --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 3cd82fe25..2624ee225 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -43,7 +43,7 @@ endif # PGI and LLVM Flang ifeq ($(findstring flang,$(FC)),flang) EXTRA = target openacc - FCFLAGS += -DPGI + #FCFLAGS += -DPGI endif ifeq ($(findstring pgf,$(FC)),pgf) EXTRA = target openacc cufortran From 445517fb759681ab4fbeda3d7d797ca7d117f156 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 15 Dec 2021 12:55:44 +0200 Subject: [PATCH 136/325] oneMKL nstream comments only --- Cxx11/nstream-onemkl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cxx11/nstream-onemkl.cc b/Cxx11/nstream-onemkl.cc index 1afeea839..0c69f9808 100644 --- a/Cxx11/nstream-onemkl.cc +++ b/Cxx11/nstream-onemkl.cc @@ -140,10 +140,12 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); double one(1); + // A += B mkl::blas::axpy(q, length, one, // alpha d_B, 1, // x, incx d_A, 1); // y, incy + // A += scalar * C mkl::blas::axpy(q, length, scalar, // alpha d_C, 1, // x, incx From dfbb2300272a4fc463e2046f05df803ff14ec9c8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 17:17:36 +0200 Subject: [PATCH 137/325] add GitHub Actions, mostly remove Travis CI (#593) * disable C++ ranges from CI for now * Fortran OpenMP CI ok * C OMP target disable CI * fix numpy CI * no more Travis --- .github/workflows/makefile.yml | 31 +++ .travis.yml | 16 +- CODEOWNERS | 2 +- GettingStarted.md | 4 +- {travis => ci}/build-run-prk.sh | 244 +++++++++++---------- {travis => ci}/install-armci-mpi.sh | 4 +- {travis => ci}/install-autotools.sh | 38 ++-- {travis => ci}/install-berkeley-upc.sh | 10 +- {travis => ci}/install-boost.sh | 6 +- {travis => ci}/install-chapel.sh | 10 +- {travis => ci}/install-charm++.sh | 20 +- {travis => ci}/install-clang.sh | 2 +- {travis => ci}/install-cmake.sh | 16 +- ci/install-deps.sh | 167 ++++++++++++++ {travis => ci}/install-executors.sh | 6 +- {travis => ci}/install-fgmpi.sh | 18 +- ci/install-ga.sh | 22 ++ {travis => ci}/install-gasnet.sh | 10 +- {travis => ci}/install-gcc.sh | 2 +- {travis => ci}/install-grappa.sh | 16 +- {travis => ci}/install-hpx3.sh | 17 +- {travis => ci}/install-hpx5.sh | 10 +- {travis => ci}/install-hydra.sh | 6 +- {travis => ci}/install-intrepid-upc.sh | 26 +-- {travis => ci}/install-julia.sh | 12 +- {travis => ci}/install-kokkos.sh | 15 +- {travis => ci}/install-legion.sh | 8 +- {travis => ci}/install-libfabric.sh | 10 +- {travis => ci}/install-mpi.sh | 24 +- {travis => ci}/install-musl.sh | 9 +- {travis => ci}/install-occa.sh | 10 +- {travis => ci}/install-octave.sh | 2 +- {travis => ci}/install-opencoarrays.sh | 20 +- {travis => 
ci}/install-ornl-openshmem.sh | 2 +- {travis => ci}/install-oshmpi.sh | 8 +- ci/install-petsc.sh | 25 +++ {travis => ci}/install-pstl.sh | 12 +- {travis => ci}/install-python.sh | 2 +- {travis => ci}/install-raja.sh | 8 +- ci/install-ranges.sh | 13 ++ {travis => ci}/install-rust.sh | 2 +- {travis => ci}/install-sandia-openshmem.sh | 8 +- ci/install-sycl.sh | 8 + {travis => ci}/install-tbb.sh | 13 +- travis/install-deps.sh | 166 -------------- travis/install-ga.sh | 22 -- travis/install-petsc.sh | 25 --- travis/install-ranges.sh | 12 - travis/install-sycl.sh | 8 - 49 files changed, 594 insertions(+), 553 deletions(-) create mode 100644 .github/workflows/makefile.yml rename {travis => ci}/build-run-prk.sh (86%) rename {travis => ci}/install-armci-mpi.sh (98%) rename {travis => ci}/install-autotools.sh (76%) rename {travis => ci}/install-berkeley-upc.sh (92%) rename {travis => ci}/install-boost.sh (86%) rename {travis => ci}/install-chapel.sh (76%) rename {travis => ci}/install-charm++.sh (81%) rename {travis => ci}/install-clang.sh (97%) rename {travis => ci}/install-cmake.sh (66%) create mode 100644 ci/install-deps.sh rename {travis => ci}/install-executors.sh (71%) rename {travis => ci}/install-fgmpi.sh (62%) create mode 100644 ci/install-ga.sh rename {travis => ci}/install-gasnet.sh (93%) rename {travis => ci}/install-gcc.sh (97%) rename {travis => ci}/install-grappa.sh (82%) rename {travis => ci}/install-hpx3.sh (80%) rename {travis => ci}/install-hpx5.sh (81%) rename {travis => ci}/install-hydra.sh (83%) rename {travis => ci}/install-intrepid-upc.sh (73%) rename {travis => ci}/install-julia.sh (61%) rename {travis => ci}/install-kokkos.sh (88%) rename {travis => ci}/install-legion.sh (65%) rename {travis => ci}/install-libfabric.sh (66%) rename {travis => ci}/install-mpi.sh (81%) rename {travis => ci}/install-musl.sh (68%) rename {travis => ci}/install-occa.sh (87%) rename {travis => ci}/install-octave.sh (93%) rename {travis => ci}/install-opencoarrays.sh (80%) rename {travis => ci}/install-ornl-openshmem.sh (69%) rename {travis => ci}/install-oshmpi.sh (55%) create mode 100644 ci/install-petsc.sh rename {travis => ci}/install-pstl.sh (55%) rename {travis => ci}/install-python.sh (96%) rename {travis => ci}/install-raja.sh (89%) create mode 100644 ci/install-ranges.sh rename {travis => ci}/install-rust.sh (93%) rename {travis => ci}/install-sandia-openshmem.sh (82%) create mode 100644 ci/install-sycl.sh rename {travis => ci}/install-tbb.sh (77%) delete mode 100644 travis/install-deps.sh delete mode 100644 travis/install-ga.sh delete mode 100644 travis/install-petsc.sh delete mode 100644 travis/install-ranges.sh delete mode 100644 travis/install-sycl.sh diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml new file mode 100644 index 000000000..1fc02e80e --- /dev/null +++ b/.github/workflows/makefile.yml @@ -0,0 +1,31 @@ +name: Makefile CI + +on: + push: + branches: [ default ] + pull_request: + branches: [ default ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Test Python + run: | + python -m pip install --upgrade pip + pip install numpy + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + bash ./ci/build-run-prk.sh /tmp allpython + + - name: Test C++ + run: CXX=g++ bash ./ci/build-run-prk.sh /tmp allcxx + + - name: Test Fortran + run: FC=gfortran bash ./ci/build-run-prk.sh /tmp allfortran + + - name: Test C11 + run: CC=gcc bash ./ci/build-run-prk.sh /tmp allc1z diff --git a/.travis.yml 
b/.travis.yml index 4ddc34984..ee7c29ed3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,18 +149,18 @@ matrix: env: PRK_TARGET=allcharm++ before_install: - pwd - - export TRAVIS_HOME=$PWD - - export TRAVIS_ROOT=$TRAVIS_HOME/PRK-deps - - mkdir -p $TRAVIS_ROOT + - export CI_HOME=$PWD + - export CI_ROOT=$CI_HOME/PRK-deps + - mkdir -p $CI_ROOT install: - - export PATH=$TRAVIS_ROOT/bin:$PATH - - export PATH=$TRAVIS_ROOT/gcc/bin:$PATH - - export PATH=$TRAVIS_ROOT/cmake/bin:$PATH - - sh ./travis/install-deps.sh $TRAVIS_ROOT $PRK_TARGET + - export PATH=$CI_ROOT/bin:$PATH + - export PATH=$CI_ROOT/gcc/bin:$PATH + - export PATH=$CI_ROOT/cmake/bin:$PATH + - sh ./ci/install-deps.sh $CI_ROOT $PRK_TARGET before_script: - pwd script: - - sh ./travis/build-run-prk.sh $TRAVIS_ROOT $PRK_TARGET + - sh ./ci/build-run-prk.sh $CI_ROOT $PRK_TARGET after_failure: - echo "Sad panda" - find . -name config.log -exec grep -L "configure: exit 0" {} ";" | xargs cat diff --git a/CODEOWNERS b/CODEOWNERS index 6f426a040..bb2d0a9cd 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -30,5 +30,5 @@ RUST/* @jeffhammond SERIAL/* @rfvander SHMEM/* @jdinan @rfvander UPC/* @apokayi -travis/* @jeffhammond +ci/* @jeffhammond diff --git a/GettingStarted.md b/GettingStarted.md index c4ba7886e..766b796f1 100644 --- a/GettingStarted.md +++ b/GettingStarted.md @@ -65,8 +65,8 @@ online for whatever platform you are using. All of the libraries and frameworks supported by the PRK project can be installed using the Travis CI infrastructure. -See `travis/install-${dependency}.sh` for details and look -at how the script is invoked by `travis/install-deps.sh` to +See `ci/install-${dependency}.sh` for details and look +at how the script is invoked by `ci/install-deps.sh` to undestand the options. In many cases, the only required argument is the path to the target directory. We often use `${PRK}/deps/` for this. 
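The renamed scripts keep the calling convention that GettingStarted.md describes: each ci/install-${dependency}.sh takes the installation prefix as its first argument, and ci/build-run-prk.sh takes that prefix plus a PRK target. A minimal sketch of local usage from the top of a checkout, mirroring the .travis.yml stanza above; the PRK-deps prefix name and the install-cmake.sh/allcxx choices are only examples taken from the scripts in this patch, not requirements:

    # pick a writable prefix for dependencies, as .travis.yml does
    export CI_ROOT=$PWD/PRK-deps
    mkdir -p $CI_ROOT
    # install a single dependency into the prefix...
    sh ./ci/install-cmake.sh $CI_ROOT
    # ...or everything a given target needs
    sh ./ci/install-deps.sh $CI_ROOT allcxx
    # then build and run the kernels for that target
    sh ./ci/build-run-prk.sh $CI_ROOT allcxx
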
diff --git a/travis/build-run-prk.sh b/ci/build-run-prk.sh similarity index 86% rename from travis/build-run-prk.sh rename to ci/build-run-prk.sh index 31a9aedf9..e9cc6877d 100644 --- a/travis/build-run-prk.sh +++ b/ci/build-run-prk.sh @@ -3,7 +3,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" PRK_TARGET="$2" case "$os" in @@ -21,7 +21,7 @@ case "$os" in export MPI_ROOT=/usr/local ;; Linux) - export MPI_ROOT=${TRAVIS_ROOT} + export MPI_ROOT=${CI_ROOT} ;; esac @@ -31,7 +31,7 @@ case "$PRK_TARGET" in allpython) echo "Python" # workaround for trusty since cannot find numpy when using /opt/python/2.7.13/bin/python - if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + if [ "$os" = "Linux" ] ; then export PATH=/usr/bin:$PATH fi which python3 || which python || true @@ -71,7 +71,7 @@ case "$PRK_TARGET" in export JULIA_PATH=/usr/local/bin/ ;; Linux) - export JULIA_PATH=${TRAVIS_ROOT}/julia/bin/ + export JULIA_PATH=${CI_ROOT}/julia/bin/ ;; esac ${JULIA_PATH}julia --version @@ -104,9 +104,9 @@ case "$PRK_TARGET" in which rustc rustc --version export PRK_TARGET_PATH=RUST - cd $TRAVIS_HOME/$PRK_TARGET_PATH/p2p && cargo run 10 100 100 - cd $TRAVIS_HOME/$PRK_TARGET_PATH/stencil && cargo run 10 100 - cd $TRAVIS_HOME/$PRK_TARGET_PATH/transpose && cargo run 10 100 + cd $CI_HOME/$PRK_TARGET_PATH/p2p && cargo run 10 100 100 + cd $CI_HOME/$PRK_TARGET_PATH/stencil && cargo run 10 100 + cd $CI_HOME/$PRK_TARGET_PATH/transpose && cargo run 10 100 ;; allc1z) echo "C1z" @@ -180,7 +180,7 @@ case "$PRK_TARGET" in # C11 with OpenMP # Host OpenMP - if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "${CC}" = "clang" ] ; then + if [ "$os" = "Darwin" ] && [ "${CC}" = "clang" ] ; then LLVMPATH="$(brew --cellar llvm)/$(brew list --versions llvm | tr ' ' '\n' | tail -1)" echo "LLVMPATH=${LLVMPATH}" echo "CC=${LLVMPATH}/bin/clang -std=c99" >> common/make.defs @@ -191,7 +191,7 @@ case "$PRK_TARGET" in export LD_RUN_PATH=${LLVMPATH}/lib:$LD_RUN_PATH export LD_LIBRARY_PATH=${LLVMPATH}/lib:$LD_LIBRARY_PATH export DYLD_LIBRARY_PATH=${LLVMPATH}/lib:$DYLD_LIBRARY_PATH - elif [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${CC}" = "clang" ] ; then + elif [ "$os" = "Linux" ] && [ "${CC}" = "clang" ] ; then LLVMPATH=/usr/lib/llvm-8 # dirty hack FIXME echo "LLVMPATH=${LLVMPATH}" echo "CC=${LLVMPATH}/bin/clang -std=c99" >> common/make.defs @@ -234,7 +234,7 @@ case "$PRK_TARGET" in done done # Target Offload - if [ "${CC}" = "gcc" ] ; then + if [ "${CC}" = "gcc" ] && [ ! 
true ] ; then echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs ${MAKE} -C $PRK_TARGET_PATH target $PRK_TARGET_PATH/nstream-target 10 16777216 @@ -249,11 +249,11 @@ case "$PRK_TARGET" in fi # Use MUSL for GCC+Linux only - if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$CC" = "gcc" ] ; then + if [ "$os" = "Linux" ] && [ "$CC" = "gcc" ] ; then ${MAKE} -C $PRK_TARGET_PATH clean - sh ./travis/install-musl.sh ${TRAVIS_ROOT} ${PRK_CC} + sh ./ci/install-musl.sh ${CI_ROOT} ${PRK_CC} echo "PRKVERSION=\"'2.16'\"" > common/make.defs - echo "CC=${TRAVIS_ROOT}/musl/bin/musl-gcc -static -std=c11 -DUSE_C11_THREADS" >> common/make.defs + echo "CC=${CI_ROOT}/musl/bin/musl-gcc -static -std=c11 -DUSE_C11_THREADS" >> common/make.defs echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs ${MAKE} -C $PRK_TARGET_PATH transpose-thread $PRK_TARGET_PATH/transpose-thread 10 1024 512 @@ -265,7 +265,7 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=Cxx11 case $CXX in g++) - if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "x$PRK_CXX" = "x" ] ; then + if [ "$os" = "Darwin" ] && [ "x$PRK_CXX" = "x" ] ; then brew list brew search llvm for version in "9" "8" "7" "6" "5" ; do @@ -291,7 +291,7 @@ case "$PRK_TARGET" in ;; clang++) # Homebrew does not always place the best/latest Clang/LLVM in the default path - if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "x$PRK_CXX" = "x" ] ; then + if [ "$os" = "Darwin" ] && [ "x$PRK_CXX" = "x" ] ; then for version in "" ; do if [ -f "`which /usr/local/opt/llvm${version}/bin/clang++`" ]; then export PRK_CXX="`which /usr/local/opt/llvm${version}/bin/clang++`" @@ -345,7 +345,7 @@ case "$PRK_TARGET" in done # C++11 with CBLAS - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then echo "CBLASFLAG=-DACCELERATE -framework Accelerate -flax-conversions" >> common/make.defs ${MAKE} -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas $PRK_TARGET_PATH/transpose-cblas 10 1024 @@ -396,7 +396,7 @@ case "$PRK_TARGET" in #$PRK_TARGET_PATH/p2p-hyperplane-openacc 10 1024 64 ;; clang) - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then # Host LLVMPATH="$(brew --cellar llvm)/$(brew list --versions llvm | tr ' ' '\n' | tail -1)" echo "LLVMPATH=${LLVMPATH}" @@ -408,7 +408,7 @@ case "$PRK_TARGET" in export LD_RUN_PATH=${LLVMPATH}/lib:$LD_RUN_PATH export LD_LIBRARY_PATH=${LLVMPATH}/lib:$LD_LIBRARY_PATH export DYLD_LIBRARY_PATH=${LLVMPATH}/lib:$DYLD_LIBRARY_PATH - elif [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${CC}" = "clang" ] ; then + elif [ "$os" = "Linux" ] && [ "${CC}" = "clang" ] ; then LLVMPATH=/usr/lib/llvm-8 # dirty hack FIXME echo "LLVMPATH=${LLVMPATH}" echo "CC=${LLVMPATH}/bin/clang -std=c99" >> common/make.defs @@ -439,34 +439,36 @@ case "$PRK_TARGET" in ;; esac + exit 0 + # Boost.Compute runs after OpenCL, and only available in Travis with MacOS. - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then echo "BOOSTFLAG=-I/usr/include -I/usr/local/include" >> common/make.defs - echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs + echo "RANGEFLAG=-DUSE_RANGES_TS -I${CI_ROOT}/range-v3/include" >> common/make.defs else echo "BOOSTFLAG=-I/usr/include -I/usr/local/include" >> common/make.defs echo "RANGEFLAG=-DUSE_BOOST_IRANGE -I/usr/local/include" >> common/make.defs fi - # C++11 with rangefor and Boost.Ranges - #if [ ! "${CC}" = "gcc" ] && [ ! 
"${TRAVIS_OS_NAME}" = "linux" ] ; then - ${MAKE} -C $PRK_TARGET_PATH rangefor - $PRK_TARGET_PATH/stencil-rangefor 10 1000 - $PRK_TARGET_PATH/transpose-rangefor 10 1024 32 - $PRK_TARGET_PATH/nstream-rangefor 10 16777216 32 - #echo "Test stencil code generator" - for s in star grid ; do - for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-rangefor 10 200 20 $s $r + # C++11 with ranges and Boost.Ranges + #if [ ! "${CC}" = "gcc" ] && [ ! "$os" = "Linux" ] ; then + if [ ! true ] ; then + ${MAKE} -C $PRK_TARGET_PATH ranges + $PRK_TARGET_PATH/stencil-ranges 10 1000 + $PRK_TARGET_PATH/transpose-ranges 10 1024 32 + $PRK_TARGET_PATH/nstream-ranges 10 16777216 32 + #echo "Test stencil code generator" + for s in star grid ; do + for r in 1 2 3 4 5 ; do + $PRK_TARGET_PATH/stencil-ranges 10 200 20 $s $r + done done - done - #fi + fi # C++11 with TBB - TBBROOT=${TRAVIS_ROOT}/tbb + TBBROOT=${CI_ROOT}/tbb case "$os" in Linux) - ${CC} --version export TBBFLAG="-I${TBBROOT}/include -L${TBBROOT}/lib/intel64/gcc4.7 -ltbb" echo "TBBFLAG=${TBBFLAG}" >> common/make.defs export LD_LIBRARY_PATH=${TBBROOT}/lib/intel64/gcc4.7:${LD_LIBRARY_PATH} @@ -481,7 +483,7 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/p2p-innerloop-tbb 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-tbb 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-tbb 10 1024 32 - $PRK_TARGET_PATH/p2p-tasks-tbb 10 1024 1024 32 32 + $PRK_TARGET_PATH/p2p-tasks-tbb 10 1024 1024 32 32 $PRK_TARGET_PATH/stencil-tbb 10 1000 $PRK_TARGET_PATH/transpose-tbb 10 1024 32 $PRK_TARGET_PATH/nstream-tbb 10 16777216 32 @@ -508,9 +510,9 @@ case "$PRK_TARGET" in # C++17 Parallel STL # disable Linux w/ GCC because GCC-5 is too old FIXME - if [ ! "${CC}" = "gcc" ] && [ ! "${TRAVIS_OS_NAME}" = "linux" ] ; then - echo "PSTLFLAG=${TBBFLAG} -DUSE_LLVM_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs - if [ "${CC}" = "gcc" ] ; then + if [ ! "${CXX}" = "g++" ] && [ ! "$os" = "Linux" ] ; then + echo "PSTLFLAG=${TBBFLAG} -DUSE_LLVM_PSTL -I${CI_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs + if [ "${CXX}" = "g++" ] ; then # omp.h not found with clang-3.9 - just work around instead of fixing. echo "PSTLFLAG+=-fopenmp" >> common/make.defs fi @@ -529,7 +531,7 @@ case "$PRK_TARGET" in fi # C++11 with OpenCL - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then echo "OPENCLFLAG=-framework OpenCL" >> common/make.defs ${MAKE} -C $PRK_TARGET_PATH opencl # must run programs in same directory as OpenCL source files... @@ -552,57 +554,57 @@ case "$PRK_TARGET" in # (1) We only test OpenCL on MacOS in Travis. # (2) Boost.Compute is not available from APT. # If we ever address 1, we need to enable the Boost.Compute install for Linux. 
- #if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + #if [ "$os" = "Darwin" ] ; then # ${MAKE} -C $PRK_TARGET_PATH nstream-boost-compute # $PRK_TARGET_PATH/nstream-boost-compute 10 16777216 32 #fi # C++11 with Kokkos, RAJA - case "$CC" in - gcc) + case "$CXX" in + g++) # Kokkos and Raja are built with OpenMP support with GCC - echo "RAJAFLAG=-I${TRAVIS_ROOT}/raja/include -L${TRAVIS_ROOT}/raja/lib -lRAJA ${TBBFLAG} -fopenmp" >> common/make.defs - echo "KOKKOSFLAG=-I${TRAVIS_ROOT}/kokkos/include -L${TRAVIS_ROOT}/kokkos/lib -lkokkoscore -DPRK_KOKKOS_BACKEND=OpenMP -fopenmp -ldl" >> common/make.defs + echo "RAJAFLAG=-I${CI_ROOT}/raja/include -L${CI_ROOT}/raja/lib -lRAJA ${TBBFLAG} -fopenmp" >> common/make.defs + echo "KOKKOSFLAG=-I${CI_ROOT}/kokkos/include -L${CI_ROOT}/kokkos/lib -lkokkoscore -DPRK_KOKKOS_BACKEND=OpenMP -fopenmp -ldl" >> common/make.defs ;; - clang) + clang++) # RAJA can use TBB with Clang - echo "RAJAFLAG=-I${TRAVIS_ROOT}/raja/include -L${TRAVIS_ROOT}/raja/lib -lRAJA ${TBBFLAG}" >> common/make.defs + echo "RAJAFLAG=-I${CI_ROOT}/raja/include -L${CI_ROOT}/raja/lib -lRAJA ${TBBFLAG}" >> common/make.defs # Kokkos is built with Pthread support with Clang - echo "KOKKOSFLAG=-I${TRAVIS_ROOT}/kokkos/include -L${TRAVIS_ROOT}/kokkos/lib -lkokkoscore -DPRK_KOKKOS_BACKEND=Threads -lpthread -ldl" >> common/make.defs + echo "KOKKOSFLAG=-I${CI_ROOT}/kokkos/include -L${CI_ROOT}/kokkos/lib -lkokkoscore -DPRK_KOKKOS_BACKEND=Threads -lpthread -ldl" >> common/make.defs ;; esac # RAJA if [ 0 = 1 ] ; then - ${MAKE} -C $PRK_TARGET_PATH p2p-raja stencil-raja transpose-raja nstream-raja \ - p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja - # New (Views) - $PRK_TARGET_PATH/p2p-raja 10 1024 1024 - $PRK_TARGET_PATH/stencil-raja 10 1000 - $PRK_TARGET_PATH/transpose-raja 10 1024 - $PRK_TARGET_PATH/nstream-raja 10 16777216 32 - # Old (STL) - $PRK_TARGET_PATH/p2p-vector-raja 10 1024 1024 - $PRK_TARGET_PATH/stencil-vector-raja 10 1000 - $PRK_TARGET_PATH/transpose-vector-raja 10 1024 - for f in seq omp tbb ; do - for s in y n ; do - for t in y n ; do - for n in y n ; do - for p in no ij ji ; do - $PRK_TARGET_PATH/transpose-raja 4 200 nested=$n for=$f simd=$s tiled=$t permute=$p - done - done - done - done - done - $PRK_TARGET_PATH/nstream-vector-raja 10 16777216 32 - for s in star grid ; do - for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-raja 10 200 20 $s $r - $PRK_TARGET_PATH/stencil-vector-raja 10 200 20 $s $r - done - done + ${MAKE} -C $PRK_TARGET_PATH p2p-raja stencil-raja transpose-raja nstream-raja \ + p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja + # New (Views) + $PRK_TARGET_PATH/p2p-raja 10 1024 1024 + $PRK_TARGET_PATH/stencil-raja 10 1000 + $PRK_TARGET_PATH/transpose-raja 10 1024 + $PRK_TARGET_PATH/nstream-raja 10 16777216 32 + # Old (STL) + $PRK_TARGET_PATH/p2p-vector-raja 10 1024 1024 + $PRK_TARGET_PATH/stencil-vector-raja 10 1000 + $PRK_TARGET_PATH/transpose-vector-raja 10 1024 + for f in seq omp tbb ; do + for s in y n ; do + for t in y n ; do + for n in y n ; do + for p in no ij ji ; do + $PRK_TARGET_PATH/transpose-raja 4 200 nested=$n for=$f simd=$s tiled=$t permute=$p + done + done + done + done + done + $PRK_TARGET_PATH/nstream-vector-raja 10 16777216 32 + for s in star grid ; do + for r in 1 2 3 4 5 ; do + $PRK_TARGET_PATH/stencil-raja 10 200 20 $s $r + $PRK_TARGET_PATH/stencil-vector-raja 10 200 20 $s $r + done + done fi # Kokkos @@ -618,8 +620,8 @@ case "$PRK_TARGET" in # C++ w/ OCCA # OCCA sets -Wl,-rpath=${OCCA_LIB}, which 
chokes Mac's ld. - #if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - # echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs + #if [ "$os" = "Linux" ] ; then + # echo "OCCADIR=${CI_ROOT}/occa" >> common/make.defs # export OCCA_CXX=${PRK_CXX} # ${MAKE} -C $PRK_TARGET_PATH transpose-occa nstream-occa # $PRK_TARGET_PATH/transpose-occa 10 1024 32 @@ -628,8 +630,8 @@ case "$PRK_TARGET" in # C++ w/ SYCL # triSYCL requires Boost. We are having Boost issues with Travis Linux builds. - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then - SYCLDIR=${TRAVIS_ROOT}/triSYCL + if [ "$os" = "Darwin" ] ; then + SYCLDIR=${CI_ROOT}/triSYCL if [ "${CC}" = "clang" ] ; then # SYCL will compile without OpenMP echo "SYCLCXX=${PRK_CXX} -pthread -std=c++1z" >> common/make.defs @@ -654,9 +656,9 @@ case "$PRK_TARGET" in allfortran) echo "Fortran" export PRK_TARGET_PATH=FORTRAN - case "$CC" in - gcc) - for major in "-9" "-8" "-7" "-6" "-5" "-4" "-3" "-2" "-1" "" ; do + case "$FC" in + gfortran) + for major in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "-6" "-5" "-4" "-3" "-2" "-1" "" ; do if [ -f "`which gfortran$major`" ]; then export PRK_FC="gfortran$major" echo "Found GCC Fortran: $PRK_FC" @@ -671,16 +673,16 @@ case "$PRK_TARGET" in echo "FC=$PRK_FC" >> common/make.defs echo "OPENMPFLAG=-fopenmp" >> common/make.defs echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then # Homebrew installs a symlink in /usr/local/bin export PRK_CAFC=/usr/local/bin/caf - elif [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - export PRK_CAFC=${TRAVIS_ROOT}/opencoarrays/bin/caf + elif [ "$os" = "Linux" ] ; then + export PRK_CAFC=${CI_ROOT}/opencoarrays/bin/caf fi echo "CAFC=$PRK_CAFC -std=f2008 -cpp" >> common/make.defs echo "COARRAYFLAG=-fcoarray=single" >> common/make.defs ;; - clang) + flang) case "$os" in FreeBSD) echo "FC=flang -Mpreprocess -Mfreeform -I/usr/local/flang/include -lexecinfo" >> common/make.defs @@ -741,11 +743,11 @@ case "$PRK_TARGET" in # Fortran coarrays # Disable GCC Linux because installing OpenCoarrays is not working - if [ "${CC}" = "gcc" ] && [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "${CC}" = "gcc" ] && [ "$os" = "Darwin" ] ; then ${MAKE} -C ${PRK_TARGET_PATH} coarray export PRK_MPI_PROCS=4 if [ "${CC}" = "gcc" ] ; then - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then # Homebrew installs a symlink in /usr/local/bin ls -l /usr/local/bin/cafrun || true which cafrun || true @@ -754,8 +756,8 @@ case "$PRK_TARGET" in # see https://github.com/open-mpi/ompi/issues/2956 export PRK_OVERSUBSCRIBE="--oversubscribe" export TMPDIR=/tmp - elif [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - export PRK_LAUNCHER=${TRAVIS_ROOT}/opencoarrays/bin/cafrun + elif [ "$os" = "Linux" ] ; then + export PRK_LAUNCHER=${CI_ROOT}/opencoarrays/bin/cafrun fi $PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-} $PRK_TARGET_PATH/p2p-coarray 10 1024 1024 $PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-} $PRK_TARGET_PATH/stencil-coarray 10 1000 @@ -772,7 +774,7 @@ case "$PRK_TARGET" in ;; allopenmp) echo "OpenMP" - if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "${CC}" = "clang" ] ; then + if [ "$os" = "Darwin" ] && [ "${CC}" = "clang" ] ; then brew install llvm || brew upgrade llvm LLVMPATH="$(brew --cellar llvm)/$(brew list --versions llvm | tr ' ' '\n' | tail -1)" echo "LLVMPATH=${LLVMPATH}" @@ -784,7 +786,7 @@ case "$PRK_TARGET" in export LD_RUN_PATH=${LLVMPATH}/lib:$LD_RUN_PATH export LD_LIBRARY_PATH=${LLVMPATH}/lib:$LD_LIBRARY_PATH export 
DYLD_LIBRARY_PATH=${LLVMPATH}/lib:$DYLD_LIBRARY_PATH - elif [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${CC}" = "clang" ] ; then + elif [ "$os" = "Linux" ] && [ "${CC}" = "clang" ] ; then LLVMPATH=/usr/lib/llvm-8 # dirty hack FIXME echo "LLVMPATH=${LLVMPATH}" echo "CC=${LLVMPATH}/bin/clang -std=c99" >> common/make.defs @@ -827,7 +829,7 @@ case "$PRK_TARGET" in export PRK_LAUNCHER=$MPI_ROOT/bin/mpirun fi # We use Open-MPI on Mac now... - if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + if [ "$os" = "Darwin" ] ; then # see https://github.com/open-mpi/ompi/issues/2956 export PRK_OVERSUBSCRIBE="--oversubscribe" export TMPDIR=/tmp @@ -835,17 +837,17 @@ case "$PRK_TARGET" in # Inline the Homebrew OpenMP stuff here so versions do not diverge. # Note that -cc= likely only works with MPICH. - #if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "${CC}" = "gcc" ] ; then + #if [ "$os" = "Darwin" ] && [ "${CC}" = "gcc" ] ; then # GCC_VERSION=6 # brew upgrade gcc@$GCC_VERSION || brew install gcc@$GCC_VERSION # export PRK_MPICC="${PRK_MPICC} -cc=/usr/local/opt/gcc@${GCC_VERSION}/bin/gcc-${GCC_VERSION}" #fi - #if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "${CC}" = "clang" ] ; then + #if [ "$os" = "Darwin" ] && [ "${CC}" = "clang" ] ; then # CLANG_VERSION=3.9 # brew install llvm@$CLANG_VERSION || brew upgrade llvm@$CLANG_VERSION # export PRK_MPICC="${PRK_MPICC} -cc=/usr/local/opt/llvm@${CLANG_VERSION}/bin/clang-${CLANG_VERSION}" #fi - #if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${CC}" = "clang" ] ; then + #if [ "$os" = "Linux" ] && [ "${CC}" = "clang" ] ; then # # According to http://openmp.llvm.org/, we need version 3.8 or later to get OpenMP. # for version in "-5" "-4" "-3.9" "-3.8" "" ; do # if [ -f "`which ${CC}${version}`" ]; then @@ -921,13 +923,13 @@ case "$PRK_TARGET" in allshmem) echo "SHMEM" # This should be fixed by rpath (https://github.com/regrant/sandia-shmem/issues/83) - export LD_LIBRARY_PATH=${TRAVIS_ROOT}/sandia-openshmem/lib:${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH - export SHMEM_ROOT=${TRAVIS_ROOT}/sandia-openshmem + export LD_LIBRARY_PATH=${CI_ROOT}/sandia-openshmem/lib:${CI_ROOT}/libfabric/lib:$LD_LIBRARY_PATH + export SHMEM_ROOT=${CI_ROOT}/sandia-openshmem echo "SHMEMTOP=$SHMEM_ROOT\nSHMEMCC=$SHMEM_ROOT/bin/oshcc" >> common/make.defs ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=SHMEM export PRK_SHMEM_PROCS=4 - export OSHRUN_LAUNCHER=${TRAVIS_ROOT}/hydra/bin/mpirun + export OSHRUN_LAUNCHER=${CI_ROOT}/hydra/bin/mpirun export PRK_LAUNCHER=$SHMEM_ROOT/bin/oshrun $PRK_LAUNCHER -n $PRK_SHMEM_PROCS $PRK_TARGET_PATH/Synch_p2p/p2p 10 1024 1024 $PRK_LAUNCHER -n $PRK_SHMEM_PROCS $PRK_TARGET_PATH/Stencil/stencil 10 1000 @@ -941,14 +943,14 @@ case "$PRK_TARGET" in case "$CC" in gcc) # If building from source (impossible) - #export UPC_ROOT=${TRAVIS_ROOT}/gupc + #export UPC_ROOT=${CI_ROOT}/gupc # If installing deb file - export UPC_ROOT=${TRAVIS_ROOT}/gupc/usr/local/gupc + export UPC_ROOT=${CI_ROOT}/gupc/usr/local/gupc ;; clang) echo "Clang UPC is not supported." 
exit 9 - export UPC_ROOT=${TRAVIS_ROOT}/clupc + export UPC_ROOT=${CI_ROOT}/clupc ;; esac echo "UPCC=$UPC_ROOT/bin/upc" >> common/make.defs @@ -957,7 +959,7 @@ case "$PRK_TARGET" in ${MAKE} $PRK_TARGET ;; bupc) - export UPC_ROOT=${TRAVIS_ROOT}/bupc-$CC + export UPC_ROOT=${CI_ROOT}/bupc-$CC echo "UPCC=$UPC_ROOT/bin/upcc" >> common/make.defs # -N $nodes -n UPC threads -c $cores_per_node # -localhost is only for UDP @@ -967,7 +969,7 @@ case "$PRK_TARGET" in ;; ofi) export GASNET_SSH_SERVERS="localhost" - export LD_LIBRARY_PATH="${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH" + export LD_LIBRARY_PATH="${CI_ROOT}/libfabric/lib:$LD_LIBRARY_PATH" export PRK_LAUNCHER="$UPC_ROOT/bin/upcrun -v -N 1 -n $PRK_UPC_PROCS -c $PRK_UPC_PROCS" ;; mpi) @@ -998,12 +1000,12 @@ case "$PRK_TARGET" in os=`uname` case "$os" in Darwin) - export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-darwin-x86_64-smp + export CHARM_ROOT=${CI_ROOT}/charm/netlrts-darwin-x86_64-smp ;; Linux) - #export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64 - export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64-smp - #export CHARM_ROOT=${TRAVIS_ROOT}/charm/multicore-linux64 + #export CHARM_ROOT=${CI_ROOT}/charm/netlrts-linux-x86_64 + export CHARM_ROOT=${CI_ROOT}/charm/netlrts-linux-x86_64-smp + #export CHARM_ROOT=${CI_ROOT}/charm/multicore-linux64 ;; esac echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs @@ -1011,7 +1013,7 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=CHARM++ export PRK_CHARM_PROCS=4 export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun - if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + if [ "$os" = "Linux" ] ; then export PRK_LAUNCHER_ARGS="+autoProvision +isomalloc_sync" else export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS ++local" @@ -1026,12 +1028,12 @@ case "$PRK_TARGET" in os=`uname` case "$os" in Darwin) - export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-darwin-x86_64-smp + export CHARM_ROOT=${CI_ROOT}/charm/netlrts-darwin-x86_64-smp ;; Linux) - #export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64 - export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64-smp - #export CHARM_ROOT=${TRAVIS_ROOT}/charm/multicore-linux64 + #export CHARM_ROOT=${CI_ROOT}/charm/netlrts-linux-x86_64 + export CHARM_ROOT=${CI_ROOT}/charm/netlrts-linux-x86_64-smp + #export CHARM_ROOT=${CI_ROOT}/charm/multicore-linux64 ;; esac echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs @@ -1039,7 +1041,7 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=AMPI export PRK_CHARM_PROCS=4 export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun - if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + if [ "$os" = "Linux" ] ; then export PRK_LAUNCHER_ARGS="+autoProvision +isomalloc_sync" else export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS +vp$PRK_CHARM_PROCS +isomalloc_sync ++local" @@ -1064,7 +1066,7 @@ case "$PRK_TARGET" in ;; allfgmpi) echo "Fine-Grain MPI (FG-MPI)" - export FGMPI_ROOT=${TRAVIS_ROOT}/fgmpi + export FGMPI_ROOT=${CI_ROOT}/fgmpi echo "FGMPITOP=$FGMPI_ROOT\nFGMPICC=$FGMPI_ROOT/bin/mpicc -std=c99" >> common/make.defs ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=FG_MPI @@ -1088,11 +1090,11 @@ case "$PRK_TARGET" in allgrappa) echo "Grappa" ######################## - #. ${TRAVIS_ROOT}/grappa/bin/settings.sh - export GRAPPA_PREFIX=${TRAVIS_ROOT}/grappa - export SCRIPT_PATH=${TRAVIS_ROOT}/grappa/bin + #. 
${CI_ROOT}/grappa/bin/settings.sh + export GRAPPA_PREFIX=${CI_ROOT}/grappa + export SCRIPT_PATH=${CI_ROOT}/grappa/bin ######################## - echo "GRAPPATOP=${TRAVIS_ROOT}/grappa" >> common/make.defs + echo "GRAPPATOP=${CI_ROOT}/grappa" >> common/make.defs ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=GRAPPA export PRK_MPI_PROCS=2 @@ -1115,7 +1117,7 @@ case "$PRK_TARGET" in ;; alllegion) echo "Legion" - echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs + echo "LEGIONTOP=${CI_ROOT}/legion" > common/make.defs ${MAKE} $PRK_TARGET -k ;; esac diff --git a/travis/install-armci-mpi.sh b/ci/install-armci-mpi.sh similarity index 98% rename from travis/install-armci-mpi.sh rename to ci/install-armci-mpi.sh index 1bfe8d79b..7ffefd76d 100644 --- a/travis/install-armci-mpi.sh +++ b/ci/install-armci-mpi.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -TRAVIS_ROOT=$1 -ARMCI_MPI_DIR=$TRAVIS_ROOT/armci-mpi +CI_ROOT=$1 +ARMCI_MPI_DIR=$CI_ROOT/armci-mpi ARMCI_MPI_TARBALL=$2 if [ ! -z "${MPICC}" ] ; then diff --git a/travis/install-autotools.sh b/ci/install-autotools.sh similarity index 76% rename from travis/install-autotools.sh rename to ci/install-autotools.sh index 2f27c0254..9e4bf26c1 100644 --- a/travis/install-autotools.sh +++ b/ci/install-autotools.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) @@ -16,8 +16,8 @@ case "$os" in which glibtool which glibtoolize #glibtool --version - ln -s `which glibtool` ${TRAVIS_ROOT}/bin/libtool - ln -s `which glibtoolize` ${TRAVIS_ROOT}/bin/libtoolize + ln -s `which glibtool` ${CI_ROOT}/bin/libtool + ln -s `which glibtoolize` ${CI_ROOT}/bin/libtoolize ;; Linux) MAKE_JNUM=2 @@ -26,11 +26,11 @@ case "$os" in AUTOCONF_VERSION=2.69 AUTOMAKE_VERSION=1.15 - cd ${TRAVIS_ROOT} + cd ${CI_ROOT} TOOL=m4 TDIR=${TOOL}-${M4_VERSION} FILE=${TDIR}.tar.gz - BIN=${TRAVIS_ROOT}/bin/${TOOL} + BIN=${CI_ROOT}/bin/${TOOL} if [ -f ${FILE} ] ; then echo ${FILE} already exists! Using existing copy. else @@ -45,19 +45,19 @@ case "$os" in if [ -f ${BIN} ] ; then echo ${BIN} already exists! Skipping build. else - cd ${TRAVIS_ROOT}/${TDIR} - ./configure CC=cc --prefix=${TRAVIS_ROOT} && make -j ${MAKE_JNUM} && make install + cd ${CI_ROOT}/${TDIR} + ./configure CC=cc --prefix=${CI_ROOT} && make -j ${MAKE_JNUM} && make install if [ "x$?" != "x0" ] ; then echo FAILURE 1 exit fi fi - cd ${TRAVIS_ROOT} + cd ${CI_ROOT} TOOL=libtool TDIR=${TOOL}-${LIBTOOL_VERSION} FILE=${TDIR}.tar.gz - BIN=${TRAVIS_ROOT}/bin/${TOOL} + BIN=${CI_ROOT}/bin/${TOOL} if [ ! -f ${FILE} ] ; then wget http://ftp.gnu.org/gnu/${TOOL}/${FILE} else @@ -72,19 +72,19 @@ case "$os" in if [ -f ${BIN} ] ; then echo ${BIN} already exists! Skipping build. else - cd ${TRAVIS_ROOT}/${TDIR} - ./configure CC=cc --prefix=${TRAVIS_ROOT} M4=${TRAVIS_ROOT}/bin/m4 && make -j ${MAKE_JNUM} && make install + cd ${CI_ROOT}/${TDIR} + ./configure CC=cc --prefix=${CI_ROOT} M4=${CI_ROOT}/bin/m4 && make -j ${MAKE_JNUM} && make install if [ "x$?" != "x0" ] ; then echo FAILURE 2 exit fi fi - cd ${TRAVIS_ROOT} + cd ${CI_ROOT} TOOL=autoconf TDIR=${TOOL}-${AUTOCONF_VERSION} FILE=${TDIR}.tar.gz - BIN=${TRAVIS_ROOT}/bin/${TOOL} + BIN=${CI_ROOT}/bin/${TOOL} if [ ! -f ${FILE} ] ; then wget http://ftp.gnu.org/gnu/${TOOL}/${FILE} else @@ -99,19 +99,19 @@ case "$os" in if [ -f ${BIN} ] ; then echo ${BIN} already exists! Skipping build. 
else - cd ${TRAVIS_ROOT}/${TDIR} - ./configure CC=cc --prefix=${TRAVIS_ROOT} M4=${TRAVIS_ROOT}/bin/m4 && make -j ${MAKE_JNUM} && make install + cd ${CI_ROOT}/${TDIR} + ./configure CC=cc --prefix=${CI_ROOT} M4=${CI_ROOT}/bin/m4 && make -j ${MAKE_JNUM} && make install if [ "x$?" != "x0" ] ; then echo FAILURE 3 exit fi fi - cd ${TRAVIS_ROOT} + cd ${CI_ROOT} TOOL=automake TDIR=${TOOL}-${AUTOMAKE_VERSION} FILE=${TDIR}.tar.gz - BIN=${TRAVIS_ROOT}/bin/${TOOL} + BIN=${CI_ROOT}/bin/${TOOL} if [ ! -f ${FILE} ] ; then wget http://ftp.gnu.org/gnu/${TOOL}/${FILE} else @@ -126,8 +126,8 @@ case "$os" in if [ -f ${BIN} ] ; then echo ${BIN} already exists! Skipping build. else - cd ${TRAVIS_ROOT}/${TDIR} - ./configure CC=cc --prefix=${TRAVIS_ROOT} M4=${TRAVIS_ROOT}/bin/m4 && make -j ${MAKE_JNUM} && make install + cd ${CI_ROOT}/${TDIR} + ./configure CC=cc --prefix=${CI_ROOT} M4=${CI_ROOT}/bin/m4 && make -j ${MAKE_JNUM} && make install if [ "x$?" != "x0" ] ; then echo FAILURE 4 exit diff --git a/travis/install-berkeley-upc.sh b/ci/install-berkeley-upc.sh similarity index 92% rename from travis/install-berkeley-upc.sh rename to ci/install-berkeley-upc.sh index fe2dadd3a..a41e070ec 100644 --- a/travis/install-berkeley-upc.sh +++ b/ci/install-berkeley-upc.sh @@ -3,7 +3,7 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" # we can't handle this yet in build-run-prk.sh #if [ "x$GASNET_CONDUIT" -eq "x" ] ; then @@ -11,7 +11,7 @@ TRAVIS_ROOT="$1" #else # BUPC_PREFIX=bupc-$CC-$GASNET_CONDUIT #fi -BUPC_PREFIX=$TRAVIS_ROOT/bupc-$CC +BUPC_PREFIX=$CI_ROOT/bupc-$CC export BUPC_RELEASE=berkeley_upc-2.22.0 @@ -25,7 +25,7 @@ case $os in ;; Linux) BUPC_NO_PTHREADS="" - MPI_ROOT=$TRAVIS_ROOT + MPI_ROOT=$CI_ROOT ;; esac @@ -47,8 +47,8 @@ if [ ! -d "$BUPC_PREFIX" ]; then ofi) # TODO factor Hydra out of Sandia OpenSHMEM install so it can be used as spawner here ../configure --prefix=$BUPC_PREFIX --disable-aligned-segments $BUPC_NO_PTHREADS \ - --enable-$GASNET_CONDUIT --with-ofihome=$TRAVIS_ROOT/libfabric \ - --with-ofi-spawner=pmi --with-pmi=$TRAVIS_ROOT/hydra \ + --enable-$GASNET_CONDUIT --with-ofihome=$CI_ROOT/libfabric \ + --with-ofi-spawner=pmi --with-pmi=$CI_ROOT/hydra \ --disable-auto-conduit-detect ;; mpi) diff --git a/travis/install-boost.sh b/ci/install-boost.sh similarity index 86% rename from travis/install-boost.sh rename to ci/install-boost.sh index dfbb02a1f..209774af5 100644 --- a/travis/install-boost.sh +++ b/ci/install-boost.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) @@ -15,7 +15,7 @@ case "$os" in sudo apt-get install libboost-all-dev # We do not test Boost.Compute on Linux because of OpenCL issues... 
# Boost.Compute is a header-only library - #git clone --depth 1 https://github.com/kylelutz/compute.git ${TRAVIS_ROOT}/compute - #git clone --depth 1 https://github.com/boostorg/compute.git ${TRAVIS_ROOT}/compute + #git clone --depth 1 https://github.com/kylelutz/compute.git ${CI_ROOT}/compute + #git clone --depth 1 https://github.com/boostorg/compute.git ${CI_ROOT}/compute ;; esac diff --git a/travis/install-chapel.sh b/ci/install-chapel.sh similarity index 76% rename from travis/install-chapel.sh rename to ci/install-chapel.sh index 7583d9cda..240f4d7bf 100644 --- a/travis/install-chapel.sh +++ b/ci/install-chapel.sh @@ -3,9 +3,11 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -if [ "${TRAVIS_OS_NAME}" = "osx" ] || [ "${CHPL_COMM}" = "none" ] ; then +os=`uname` + +if [ "$os" = "Darwin" ] || [ "${CHPL_COMM}" = "none" ] ; then echo "Mac single-locale" brew install chapel || brew upgrade chapel brew test chapel @@ -15,11 +17,11 @@ else if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then CHPL_LLVM=llvm fi - cd $TRAVIS_ROOT + cd $CI_ROOT wget -q --no-check-certificate https://github.com/chapel-lang/chapel/releases/download/1.12.0/chapel-1.12.0.tar.gz tar -xzf chapel-1.12.0.tar.gz ln -s chapel-1.12.0 chapel cd chapel make - ln -s `find $PWD -type f -name chpl` $TRAVIS_HOME/bin/chpl + ln -s `find $PWD -type f -name chpl` $CI_HOME/bin/chpl fi diff --git a/travis/install-charm++.sh b/ci/install-charm++.sh similarity index 81% rename from travis/install-charm++.sh rename to ci/install-charm++.sh index 09bc966da..ca63a0c32 100644 --- a/travis/install-charm++.sh +++ b/ci/install-charm++.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" # charm++ or AMPI RUNTIME="$2" @@ -12,11 +12,11 @@ RUNTIME="$2" #CHARM_CONDUIT="$3" # unused for now -case "$TRAVIS_OS_NAME" in - linux) +case "$os" in + Linux) CHARM_OS=linux ;; - osx) + Darwin) CHARM_OS=darwin ;; esac @@ -34,8 +34,8 @@ case "$CHARM_CONDUIT" in ;; esac -if [ ! -d "$TRAVIS_ROOT/charm" ]; then - cd $TRAVIS_ROOT +if [ ! -d "$CI_ROOT/charm" ]; then + cd $CI_ROOT git clone --depth 1 -b v6.8.0 https://charm.cs.illinois.edu/gerrit/charm.git charm cd charm case "$os" in @@ -57,12 +57,12 @@ else echo "Charm++ or AMPI already installed..." case "$RUNTIME" in AMPI) - find $TRAVIS_ROOT/charm -name charmrun - find $TRAVIS_ROOT/charm -name ampicc + find $CI_ROOT/charm -name charmrun + find $CI_ROOT/charm -name ampicc ;; charm++) - find $TRAVIS_ROOT/charm -name charmrun - find $TRAVIS_ROOT/charm -name charmc + find $CI_ROOT/charm -name charmrun + find $CI_ROOT/charm -name charmc ;; esac fi diff --git a/travis/install-clang.sh b/ci/install-clang.sh similarity index 97% rename from travis/install-clang.sh rename to ci/install-clang.sh index 45f382fb7..1eea80d83 100644 --- a/travis/install-clang.sh +++ b/ci/install-clang.sh @@ -3,7 +3,7 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then os=`uname` diff --git a/travis/install-cmake.sh b/ci/install-cmake.sh similarity index 66% rename from travis/install-cmake.sh rename to ci/install-cmake.sh index b7e6595f3..8e48ed3cf 100644 --- a/travis/install-cmake.sh +++ b/ci/install-cmake.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) @@ -15,22 +15,22 @@ case "$os" in Linux) echo "Linux" - if [ ! -d "$TRAVIS_ROOT/cmake" ]; then - mkdir -p $TRAVIS_ROOT/cmake + if [ ! 
-d "$CI_ROOT/cmake" ]; then + mkdir -p $CI_ROOT/cmake # from source #wget --no-check-certificate -q https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz - #tar -C $TRAVIS_ROOT -xzf cmake-3.4.1.tar.gz + #tar -C $CI_ROOT -xzf cmake-3.4.1.tar.gz #cd ~/cmake-3.4.1 #mkdir build && cd build - #cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$TRAVIS_ROOT/cmake + #cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$CI_ROOT/cmake #make -j4 && make install # from binary - cd $TRAVIS_ROOT + cd $CI_ROOT wget --no-check-certificate -q https://github.com/Kitware/CMake/releases/download/v3.13.2/cmake-3.13.2-Linux-x86_64.sh - sh ./cmake-3.13.2-Linux-x86_64.sh --prefix=$TRAVIS_ROOT/cmake --skip-license --exclude-subdir + sh ./cmake-3.13.2-Linux-x86_64.sh --prefix=$CI_ROOT/cmake --skip-license --exclude-subdir else echo "CMake installed..." - find $TRAVIS_ROOT/cmake -name cmake + find $CI_ROOT/cmake -name cmake fi ;; esac diff --git a/ci/install-deps.sh b/ci/install-deps.sh new file mode 100644 index 000000000..9cc0d9efc --- /dev/null +++ b/ci/install-deps.sh @@ -0,0 +1,167 @@ +#!/bin/sh + +set -e +set -x + +CI_ROOT="$1" +PRK_TARGET="$2" +os=`uname` + +# update package managers once at the beginning +case $os in + Darwin) + brew update + ;; + Linux) + sudo apt-get update -y + #sudo apt-get upgrade -y + ;; +esac + +case $os in + Darwin) + MPI_IMPL=openmpi + ;; + Linux) + MPI_IMPL=mpich + ;; +esac + +echo "PWD=$PWD" + +case "$PRK_TARGET" in + allserial) + echo "Serial" + ;; + alloctave) + echo "Octave" + sh ./ci/install-octave.sh $CI_ROOT + ;; + alljulia) + echo "Julia" + sh ./ci/install-julia.sh $CI_ROOT + ;; + allpython) + echo "Python" + sh ./ci/install-python.sh $CI_ROOT + ;; + allrust) + echo "Rust" + sh ./ci/install-rust.sh $CI_ROOT + ;; + allc1z) + echo "C1z" + if [ "${CC}" = "gcc" ] ; then + sh ./ci/install-gcc.sh $CI_ROOT + fi + if [ "${CC}" = "clang" ] ; then + sh ./ci/install-clang.sh $CI_ROOT + fi + #if [ "$os" = "Linux" ] ; then + # sh ./ci/install-musl.sh $CI_ROOT + #fi + ;; + allcxx) + echo "C++11" + if [ "${CC}" = "gcc" ] ; then + sh ./ci/install-gcc.sh $CI_ROOT + fi + if [ "${CC}" = "clang" ] ; then + sh ./ci/install-clang.sh $CI_ROOT + fi + sh ./ci/install-tbb.sh $CI_ROOT + sh ./ci/install-pstl.sh $CI_ROOT + sh ./ci/install-ranges.sh $CI_ROOT + sh ./ci/install-boost.sh $CI_ROOT + # CMake 3.10 or higher is required. 
+ sh ./ci/install-cmake.sh $CI_ROOT + #sh ./ci/install-raja.sh $CI_ROOT + sh ./ci/install-kokkos.sh $CI_ROOT + #sh ./ci/install-occa.sh $CI_ROOT + sh ./ci/install-sycl.sh $CI_ROOT + ;; + allfortran) + echo "Fortran" + if [ "${CC}" = "gcc" ] ; then + sh ./ci/install-gcc.sh $CI_ROOT + sh ./ci/install-opencoarrays.sh $CI_ROOT + fi + ;; + allopenmp) + echo "OpenMP" + if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then + sh ./ci/install-clang.sh $CI_ROOT 3.9 + fi + ;; + allmpi) + echo "Traditional MPI" + # install except when Intel MPI used + sh ./ci/install-mpi.sh $CI_ROOT $MPI_IMPL 0 + ;; + allshmem) + echo "SHMEM" + sh ./ci/install-hydra.sh $CI_ROOT + sh ./ci/install-libfabric.sh $CI_ROOT + sh ./ci/install-sandia-openshmem.sh $CI_ROOT + ;; + allupc) + echo "UPC" + case "$UPC_IMPL" in + gupc) + # GUPC is working fine + sh ./ci/install-intrepid-upc.sh $CI_ROOT + ;; + bupc) + # BUPC is new + case $GASNET_CONDUIT in + ofi) + sh ./ci/install-hydra.sh $CI_ROOT + sh ./ci/install-libfabric.sh $CI_ROOT + ;; + mpi) + sh ./ci/install-mpi.sh $CI_ROOT $MPI_IMPL 0 + ;; + esac + sh ./ci/install-berkeley-upc.sh $CI_ROOT + ;; + esac + ;; + allcharm++) + echo "Charm++" + sh ./ci/install-charm++.sh $CI_ROOT charm++ + ;; + allampi) + echo "Adaptive MPI (AMPI)" + sh ./ci/install-charm++.sh $CI_ROOT AMPI + ;; + allfgmpi) + echo "Fine-Grain MPI (FG-MPI)" + sh ./ci/install-fgmpi.sh $CI_ROOT + ;; + allgrappa) + echo "Grappa" + sh ./ci/install-cmake.sh $CI_ROOT + sh ./ci/install-mpi.sh $CI_ROOT $MPI_IMPL 0 + sh ./ci/install-grappa.sh $CI_ROOT + ;; + allchapel) + echo "Chapel" + sh ./ci/install-chapel.sh $CI_ROOT + ;; + allhpx3) + echo "HPX-3" + sh ./ci/install-cmake.sh $CI_ROOT + sh ./ci/install-hpx3.sh $CI_ROOT + ;; + allhpx5) + echo "HPX-5" + sh ./ci/install-autotools.sh $CI_ROOT + sh ./ci/install-hpx5.sh $CI_ROOT + ;; + alllegion) + echo "Legion" + # GASNet is not needed, it seems + #sh ./ci/install-gasnet.sh $CI_ROOT + sh ./ci/install-legion.sh $CI_ROOT + ;; +esac diff --git a/travis/install-executors.sh b/ci/install-executors.sh similarity index 71% rename from travis/install-executors.sh rename to ci/install-executors.sh index 484e0bd29..39a3ae508 100644 --- a/travis/install-executors.sh +++ b/ci/install-executors.sh @@ -3,10 +3,10 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -git clone --depth 1 https://github.com/facebookexperimental/libunifex.git $TRAVIS_ROOT/libunifex -pushd $TRAVIS_ROOT/libunifex +git clone --depth 1 https://github.com/facebookexperimental/libunifex.git $CI_ROOT/libunifex +pushd $CI_ROOT/libunifex mkdir build cd build cmake .. -DCMAKE_CXX_COMPILER=${CXX} -DCMAKE_C_COMPILER=${CC} -DCMAKE_CXX_FLAGS="-std=c++20" diff --git a/travis/install-fgmpi.sh b/ci/install-fgmpi.sh similarity index 62% rename from travis/install-fgmpi.sh rename to ci/install-fgmpi.sh index f1858d253..8ebd5bbcf 100644 --- a/travis/install-fgmpi.sh +++ b/ci/install-fgmpi.sh @@ -3,17 +3,17 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -if [ ! -d "$TRAVIS_ROOT/fgmpi" ]; then +if [ ! -d "$CI_ROOT/fgmpi" ]; then # TAR build wget --no-check-certificate -q http://www.cs.ubc.ca/~humaira/code/fgmpi-2.0.tar.gz - tar -C $TRAVIS_ROOT -xzf fgmpi-2.0.tar.gz - cd $TRAVIS_ROOT/fgmpi-2.0 + tar -C $CI_ROOT -xzf fgmpi-2.0.tar.gz + cd $CI_ROOT/fgmpi-2.0 # GIT build - #cd $TRAVIS_ROOT + #cd $CI_ROOT #git clone --depth 1 https://github.com/humairakamal/fgmpi.git fgmpi-source #cd fgmpi-source ## this may fail on older autotools @@ -22,16 +22,16 @@ if [ ! 
-d "$TRAVIS_ROOT/fgmpi" ]; then # TAR or GIT mkdir build && cd build # Clang defaults to C99, which chokes on "Set_PROC_NULL" - ../configure --disable-fortran --disable-romio CFLAGS="-std=gnu89 -w" --prefix=$TRAVIS_ROOT/fgmpi + ../configure --disable-fortran --disable-romio CFLAGS="-std=gnu89 -w" --prefix=$CI_ROOT/fgmpi make -j2 make install # Package install - # TODO (restore from older version but unpack in $TRAVIS_ROOT without sudo) + # TODO (restore from older version but unpack in $CI_ROOT without sudo) else echo "FG-MPI installed..." - find $TRAVIS_ROOT/fgmpi -name mpiexec - find $TRAVIS_ROOT/fgmpi -name mpicc + find $CI_ROOT/fgmpi -name mpiexec + find $CI_ROOT/fgmpi -name mpicc mpicc -show fi diff --git a/ci/install-ga.sh b/ci/install-ga.sh new file mode 100644 index 000000000..3068c5f92 --- /dev/null +++ b/ci/install-ga.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e +set -x + +CI_ROOT="$1" + +if [ ! -d "$CI_ROOT/ga" ]; then + git clone -b develop https://github.com/GlobalArrays/ga.git $CI_ROOT/ga-src + cd $CI_ROOT/ga-src + ./autogen.sh + mkdir build + cd build + #../configure CC=mpicc --prefix=$CI_ROOT/ga + #../configure --with-mpi3 MPICC=mpiicc MPICXX=mpiicpc MPIFC=mpiifort MPIF77=mpiifort --prefix=$CI_ROOT/ga && make -j8 install + ../configure --with-armci=${CI_ROOT}/armci-mpi MPICC=mpiicc MPICXX=mpiicpc MPIFC=mpiifort MPIF77=mpiifort --prefix=$CI_ROOT/ga && make -j8 install + make + make install +else + echo "Global Arrays installed..." + find $CI_ROOT/ga -name ga.h +fi diff --git a/travis/install-gasnet.sh b/ci/install-gasnet.sh similarity index 93% rename from travis/install-gasnet.sh rename to ci/install-gasnet.sh index 1562bcc39..111cc1fe0 100644 --- a/travis/install-gasnet.sh +++ b/ci/install-gasnet.sh @@ -3,9 +3,9 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -GASNET_PREFIX=$TRAVIS_ROOT/gasnet-$GASNET_CONDUIT +GASNET_PREFIX=$CI_ROOT/gasnet-$GASNET_CONDUIT export GASNET_RELEASE=GASNet-1.26.3 @@ -19,7 +19,7 @@ case $os in ;; Linux) GASNET_NO_PTHREADS="" - MPI_ROOT=$TRAVIS_ROOT + MPI_ROOT=$CI_ROOT ;; esac @@ -41,8 +41,8 @@ if [ ! -d "$GASNET_PREFIX" ]; then ofi) # TODO factor Hydra out of Sandia OpenSHMEM install so it can be used as spawner here ../configure CC=cc --prefix=$GASNET_PREFIX --disable-aligned-segments $GASNET_NO_PTHREADS \ - --enable-$GASNET_CONDUIT --with-ofihome=$TRAVIS_ROOT/libfabric \ - --with-ofi-spawner=pmi --with-pmi=$TRAVIS_ROOT/hydra \ + --enable-$GASNET_CONDUIT --with-ofihome=$CI_ROOT/libfabric \ + --with-ofi-spawner=pmi --with-pmi=$CI_ROOT/hydra \ --disable-auto-conduit-detect ;; mpi) diff --git a/travis/install-gcc.sh b/ci/install-gcc.sh similarity index 97% rename from travis/install-gcc.sh rename to ci/install-gcc.sh index 5f2600349..1c3915b86 100644 --- a/travis/install-gcc.sh +++ b/ci/install-gcc.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" if [ "${CC}" = "gcc" ] || [ "${CXX}" = "g++" ] ; then case "$os" in diff --git a/travis/install-grappa.sh b/ci/install-grappa.sh similarity index 82% rename from travis/install-grappa.sh rename to ci/install-grappa.sh index c964ca8da..e3ef98509 100644 --- a/travis/install-grappa.sh +++ b/ci/install-grappa.sh @@ -4,11 +4,11 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" # TODO: Make compiler and MPI configurable... -if [ ! -d "$TRAVIS_ROOT/grappa" ]; then +if [ ! -d "$CI_ROOT/grappa" ]; then case "$os" in Darwin) echo "Mac" @@ -19,27 +19,27 @@ if [ ! 
-d "$TRAVIS_ROOT/grappa" ]; then Linux) echo "Linux" - export MPI_ROOT=$TRAVIS_ROOT + export MPI_ROOT=$CI_ROOT ;; esac - cd $TRAVIS_ROOT + cd $CI_ROOT git clone --depth 1 https://github.com/uwsampa/grappa.git grappa-source cd grappa-source # DEBUG #find /usr -name gcc\* -type f - #find $TRAVIS_ROOT + #find $CI_ROOT # END # Invoking CMake directly mkdir build && cd build if [ -f ~/use-intel-compilers ] ; then - cmake .. -DGRAPPA_INSTALL_PREFIX=$TRAVIS_ROOT/grappa \ + cmake .. -DGRAPPA_INSTALL_PREFIX=$CI_ROOT/grappa \ -DCMAKE_C_COMPILER="mpiicc" \ -DCMAKE_CXX_COMPILER="mpiicpc" \ -DMPI_C_COMPILER="mpiicc" \ -DMPI_CXX_COMPILER="mpiicpc" else - cmake .. -DGRAPPA_INSTALL_PREFIX=$TRAVIS_ROOT/grappa \ + cmake .. -DGRAPPA_INSTALL_PREFIX=$CI_ROOT/grappa \ -DCMAKE_C_COMPILER="$MPI_ROOT/bin/mpicc" \ -DCMAKE_CXX_COMPILER="$MPI_ROOT/bin/mpicxx" \ -DMPI_C_COMPILER="$MPI_ROOT/bin/mpicc" \ @@ -55,5 +55,5 @@ if [ ! -d "$TRAVIS_ROOT/grappa" ]; then make install else echo "Grappa installed..." - find $TRAVIS_ROOT -name grappa.mk + find $CI_ROOT -name grappa.mk fi diff --git a/travis/install-hpx3.sh b/ci/install-hpx3.sh similarity index 80% rename from travis/install-hpx3.sh rename to ci/install-hpx3.sh index 14724834a..508f236c1 100644 --- a/travis/install-hpx3.sh +++ b/ci/install-hpx3.sh @@ -3,12 +3,13 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" +os=`uname` -case "$TRAVIS_OS_NAME" in - linux) +case "$os" in + Linux) ;; - osx) + Darwin) set +e if [ "$USE_HPX_TARBALL" ] ; then export HPX_BOOST="homebrew/versions/boost155" @@ -22,8 +23,8 @@ case "$TRAVIS_OS_NAME" in ;; esac -if [ ! -d "$TRAVIS_ROOT/hpx3" ]; then - cd $TRAVIS_ROOT +if [ ! -d "$CI_ROOT/hpx3" ]; then + cd $CI_ROOT #if [ "$USE_HPX_TARBALL" ] ; then # wget -q --no-check-certificate http://stellar.cct.lsu.edu/files/hpx_0.9.11.tar.bz2 # if [ `which md5` ] ; then @@ -40,11 +41,11 @@ if [ ! -d "$TRAVIS_ROOT/hpx3" ]; then #fi mkdir build cd build - cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$TRAVIS_ROOT/hpx3 -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF + cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$CI_ROOT/hpx3 -DCMAKE_MACOSX_RPATH=YES -DHPX_WITH_HWLOC=OFF make -j2 # make check # target does not exist make install else echo "HPX-3 installed..." - find $TRAVIS_ROOT/hpx3 + find $CI_ROOT/hpx3 fi diff --git a/travis/install-hpx5.sh b/ci/install-hpx5.sh similarity index 81% rename from travis/install-hpx5.sh rename to ci/install-hpx5.sh index 0c14232f5..c676763e8 100644 --- a/travis/install-hpx5.sh +++ b/ci/install-hpx5.sh @@ -3,10 +3,10 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -if [ ! -d "$TRAVIS_ROOT/hpx5" ] ; then - cd $TRAVIS_ROOT +if [ ! -d "$CI_ROOT/hpx5" ] ; then + cd $CI_ROOT if [ "0" = "1" ] ; then wget -q --no-check-certificate http://hpx.crest.iu.edu/release/HPX_Release_v2.0.0.tar.gz if [ `which shasum` ] ; then @@ -23,11 +23,11 @@ if [ ! -d "$TRAVIS_ROOT/hpx5" ] ; then cd hpx5-source fi ./bootstrap - ./configure --prefix=$TRAVIS_ROOT/hpx5 + ./configure --prefix=$CI_ROOT/hpx5 make -j2 make check make install else echo "HPX-5 installed..." - find $TRAVIS_ROOT/hpx5 -name hpx-config + find $CI_ROOT/hpx5 -name hpx-config fi diff --git a/travis/install-hydra.sh b/ci/install-hydra.sh similarity index 83% rename from travis/install-hydra.sh rename to ci/install-hydra.sh index 6fac75218..6b30d3697 100644 --- a/travis/install-hydra.sh +++ b/ci/install-hydra.sh @@ -3,11 +3,11 @@ set -e set -x -TRAVIS_ROOT="$1" -HYDRA_ROOT=$TRAVIS_ROOT/hydra +CI_ROOT="$1" +HYDRA_ROOT=$CI_ROOT/hydra if [ ! 
-d "$HYDRA_ROOT" ]; then - cd $TRAVIS_ROOT + cd $CI_ROOT wget --no-check-certificate -q http://www.mpich.org/static/downloads/3.2/hydra-3.2.tar.gz tar -xzf hydra-3.2.tar.gz cd hydra-3.2 diff --git a/travis/install-intrepid-upc.sh b/ci/install-intrepid-upc.sh similarity index 73% rename from travis/install-intrepid-upc.sh rename to ci/install-intrepid-upc.sh index 0d1efd0a8..e82d71032 100644 --- a/travis/install-intrepid-upc.sh +++ b/ci/install-intrepid-upc.sh @@ -6,26 +6,26 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$CC" in gcc) - if [ ! -d "$TRAVIS_ROOT/gupc" ]; then + if [ ! -d "$CI_ROOT/gupc" ]; then case "$os" in Darwin) echo "Mac" # Travis uses Mac OSX 10.9, so this might not work... - mkdir $TRAVIS_ROOT/gupc + mkdir $CI_ROOT/gupc wget --no-check-certificate -q http://www.gccupc.org/gupc-5201-1/28-gupc-5201-x8664-mac-os-1010-yosemiti/file -O upc-5.2.0.1-x86_64-apple-macosx10.10.tar.gz - tar -C $TRAVIS_ROOT/gupc -xzvf upc-5.2.0.1-x86_64-apple-macosx10.10.tar.gz - find $TRAVIS_ROOT/gupc -name gupc -type f + tar -C $CI_ROOT/gupc -xzvf upc-5.2.0.1-x86_64-apple-macosx10.10.tar.gz + find $CI_ROOT/gupc -name gupc -type f ;; Linux) echo "Linux" - mkdir $TRAVIS_ROOT/gupc + mkdir $CI_ROOT/gupc wget --no-check-certificate -q http://www.gccupc.org/gupc-5201-1/30-gupc-5201-x8664-ubuntu-1204/file -O upc-5.2.0.1-x86_64-linux-ubuntu12.4.tar.gz - tar -C $TRAVIS_ROOT/gupc -xzvf upc-5.2.0.1-x86_64-linux-ubuntu12.4.tar.gz - find $TRAVIS_ROOT/gupc -name gupc -type f + tar -C $CI_ROOT/gupc -xzvf upc-5.2.0.1-x86_64-linux-ubuntu12.4.tar.gz + find $CI_ROOT/gupc -name gupc -type f ;; esac # Building from source overflows Travis CI 4 MB output... @@ -34,28 +34,28 @@ case "$CC" in #cd upc-5.2.0.1 #./contrib/download_prerequisites #mkdir build && cd build - #../configure --disable-multilib --enable-languages=c,c++ --prefix=$TRAVIS_ROOT/gupc + #../configure --disable-multilib --enable-languages=c,c++ --prefix=$CI_ROOT/gupc ## Travis has problems with how much output the GCC build creates #make -j4 &> /dev/null #make install else echo "GCC UPC installed..." - find $TRAVIS_ROOT/gupc -name gupc -type f + find $CI_ROOT/gupc -name gupc -type f fi ;; clang) echo "Clang UPC not supported yet..." exit 60 - if [ ! -d "$TRAVIS_ROOT/clupc" ]; then + if [ ! -d "$CI_ROOT/clupc" ]; then # get source files mkdir build && cd build - ../configure --disable-multilib --enable-languages=c,c++ --prefix=$TRAVIS_ROOT/clupc + ../configure --disable-multilib --enable-languages=c,c++ --prefix=$CI_ROOT/clupc make -j4 make install else echo "GCC UPC installed..." - find $TRAVIS_ROOT/clupc -name clang + find $CI_ROOT/clupc -name clang clang --version fi ;; diff --git a/travis/install-julia.sh b/ci/install-julia.sh similarity index 61% rename from travis/install-julia.sh rename to ci/install-julia.sh index 4534f50c1..c5511c21f 100644 --- a/travis/install-julia.sh +++ b/ci/install-julia.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) @@ -14,13 +14,13 @@ case "$os" in Linux) echo "Linux" JULIA_NAME=julia-1.3.1 - if [ ! -d "$TRAVIS_ROOT/$JULIA_NAME" ]; then - cd $TRAVIS_ROOT + if [ ! 
-d "$CI_ROOT/$JULIA_NAME" ]; then + cd $CI_ROOT wget --no-check-certificate -q https://julialang-s3.julialang.org/bin/linux/x64/1.3/julia-1.3.1-linux-x86_64.tar.gz - tar -C $TRAVIS_ROOT -xzvf julia-1.3.1-linux-x86_64.tar.gz + tar -C $CI_ROOT -xzvf julia-1.3.1-linux-x86_64.tar.gz # symbolic link was not working for reasons i cannot explain - ln -s $TRAVIS_ROOT/$JULIA_NAME $TRAVIS_ROOT/julia - find $TRAVIS_ROOT -type f -name julia + ln -s $CI_ROOT/$JULIA_NAME $CI_ROOT/julia + find $CI_ROOT -type f -name julia fi ;; esac diff --git a/travis/install-kokkos.sh b/ci/install-kokkos.sh similarity index 88% rename from travis/install-kokkos.sh rename to ci/install-kokkos.sh index 3f56bc534..ccd61140b 100644 --- a/travis/install-kokkos.sh +++ b/ci/install-kokkos.sh @@ -3,14 +3,15 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" +os=`uname` -case ${TRAVIS_OS_NAME} in - osx) +case $os in + Darwin) brew install gnu-sed || brew upgrade gnu-sed || true SED="gsed" ;; - linux) + Linux) SED="sed" ;; esac @@ -57,17 +58,17 @@ case $CXX in esac ${PRK_CXX} -v -if [ ! -d "$TRAVIS_ROOT/kokkos" ]; then +if [ ! -d "$CI_ROOT/kokkos" ]; then git clone -b develop --depth 1 https://github.com/kokkos/kokkos.git cd kokkos mkdir build cd build ${SED} -i "s/DKokkos_ENABLE_TESTS=ON/DKokkos_ENABLE_TESTS=OFF/g" ../generate_makefile.bash - ../generate_makefile.bash --prefix=${TRAVIS_ROOT}/kokkos \ + ../generate_makefile.bash --prefix=${CI_ROOT}/kokkos \ --compiler=${PRK_CXX} ${KOKKOS_BACKEND} make -j2 make -j2 install else echo "KOKKOS installed..." - find $TRAVIS_ROOT/kokkos -name Kokkos_Core.hpp + find $CI_ROOT/kokkos -name Kokkos_Core.hpp fi diff --git a/travis/install-legion.sh b/ci/install-legion.sh similarity index 65% rename from travis/install-legion.sh rename to ci/install-legion.sh index 183318282..1cdc01496 100644 --- a/travis/install-legion.sh +++ b/ci/install-legion.sh @@ -3,16 +3,16 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" echo "compiler versions:" $CC --version $CXX --version -if [ ! -d "$TRAVIS_ROOT/legion" ]; then - cd $TRAVIS_ROOT +if [ ! -d "$CI_ROOT/legion" ]; then + cd $CI_ROOT git clone -b master --depth 1 https://github.com/StanfordLegion/legion.git else echo "Legion present..." - find $TRAVIS_ROOT/legion + find $CI_ROOT/legion fi diff --git a/travis/install-libfabric.sh b/ci/install-libfabric.sh similarity index 66% rename from travis/install-libfabric.sh rename to ci/install-libfabric.sh index acf1922cc..a565799c9 100644 --- a/travis/install-libfabric.sh +++ b/ci/install-libfabric.sh @@ -3,19 +3,19 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -if [ ! -d "$TRAVIS_ROOT/libfabric" ]; then - cd $TRAVIS_ROOT +if [ ! -d "$CI_ROOT/libfabric" ]; then + cd $CI_ROOT git clone --depth 1 https://github.com/ofiwg/libfabric.git libfabric-source #git clone -b 'v1.5.2' --depth 1 https://github.com/ofiwg/libfabric.git libfabric-source cd libfabric-source ./autogen.sh - ./configure CC=cc --prefix=$TRAVIS_ROOT/libfabric + ./configure CC=cc --prefix=$CI_ROOT/libfabric make make install export FI_LOG_LEVEL=error else echo "OFI/libfabric installed..." - find $TRAVIS_ROOT -name "fi.h" + find $CI_ROOT -name "fi.h" fi diff --git a/travis/install-mpi.sh b/ci/install-mpi.sh similarity index 81% rename from travis/install-mpi.sh rename to ci/install-mpi.sh index b92ce45f1..acb3bc34e 100644 --- a/travis/install-mpi.sh +++ b/ci/install-mpi.sh @@ -6,7 +6,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" MPI_IMPL="$2" # 1=yes, else no @@ -64,7 +64,7 @@ case "$os" in esac case "$MPI_IMPL" in mpich) - if [ ! 
-f "$TRAVIS_ROOT/bin/mpichversion" ]; then + if [ ! -f "$CI_ROOT/bin/mpichversion" ]; then MPICH_V=3.3a2 wget --no-check-certificate -q \ http://www.mpich.org/static/downloads/${MPICH_V}/mpich-${MPICH_V}.tar.gz || \ @@ -73,39 +73,39 @@ case "$os" in tar -xzf mpich-${MPICH_V}.tar.gz || tar -xzf v${MPICH_V}.tar.gz cd mpich-${MPICH_V} # Autotools not required with release tarballs - #sh $TRAVIS_HOME/travis/install-autotools.sh $TRAVIS_ROOT + #sh $CI_HOME/ci/install-autotools.sh $CI_ROOT #./autogen.sh mkdir build ; cd build if [ "x$MPI_FORTRAN" != "x1" ] ; then - ../configure --prefix=$TRAVIS_ROOT CC=$PRK_CC CXX=$PRK_CXX --disable-fortran + ../configure --prefix=$CI_ROOT CC=$PRK_CC CXX=$PRK_CXX --disable-fortran else - ../configure --prefix=$TRAVIS_ROOT CC=$PRK_CC CXX=$PRK_CXX FC=$PRK_FC + ../configure --prefix=$CI_ROOT CC=$PRK_CC CXX=$PRK_CXX FC=$PRK_FC fi make -j2 make install else echo "MPICH installed..." - find $TRAVIS_ROOT -name mpiexec - find $TRAVIS_ROOT -name mpicc + find $CI_ROOT -name mpiexec + find $CI_ROOT -name mpicc fi ;; openmpi) - if [ ! -f "$TRAVIS_ROOT/bin/ompi_info" ]; then + if [ ! -f "$CI_ROOT/bin/ompi_info" ]; then wget --no-check-certificate -q https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.1.tar.bz2 tar -xjf openmpi-2.1.1.tar.bz2 cd openmpi-2.1.1 mkdir build && cd build if [ "x$MPI_FORTRAN" != "x1" ] ; then - ../configure --prefix=$TRAVIS_ROOT CC=$PRK_CC CXX=$PRK_CXX --enable-mpi-fortran=none + ../configure --prefix=$CI_ROOT CC=$PRK_CC CXX=$PRK_CXX --enable-mpi-fortran=none else - ../configure --prefix=$TRAVIS_ROOT CC=$PRK_CC CXX=$PRK_CXX FC=$PRK_FC + ../configure --prefix=$CI_ROOT CC=$PRK_CC CXX=$PRK_CXX FC=$PRK_FC fi make -j2 make install else echo "OpenMPI installed..." - find $TRAVIS_ROOT -name mpiexec - find $TRAVIS_ROOT -name mpicc + find $CI_ROOT -name mpiexec + find $CI_ROOT -name mpicc fi ;; esac diff --git a/travis/install-musl.sh b/ci/install-musl.sh similarity index 68% rename from travis/install-musl.sh rename to ci/install-musl.sh index 2cf6090cb..da0b92d40 100644 --- a/travis/install-musl.sh +++ b/ci/install-musl.sh @@ -3,8 +3,9 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" MUSL_CC="$2" +os=`uname` if [ "${MUSL_CC}" = "" ] ; then MUSL_CC=${CC} @@ -14,12 +15,12 @@ WEBSITE=https://www.musl-libc.org VERSION=1.1.16 DIRECTORY=releases -if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - cd ${TRAVIS_ROOT} +if [ "$os" = "Linux" ] ; then + cd ${CI_ROOT} wget --no-check-certificate -q ${WEBSITE}/${DIRECTORY}/musl-${VERSION}.tar.gz tar -xzf musl-${VERSION}.tar.gz cd musl-${VERSION} - ./configure --prefix=${TRAVIS_ROOT}/musl CC=${MUSL_CC} && make -j2 && make install + ./configure --prefix=${CI_ROOT}/musl CC=${MUSL_CC} && make -j2 && make install else echo "MUSL does not support Mac" exit 99 diff --git a/travis/install-occa.sh b/ci/install-occa.sh similarity index 87% rename from travis/install-occa.sh rename to ci/install-occa.sh index 810580cd0..82bb689d1 100644 --- a/travis/install-occa.sh +++ b/ci/install-occa.sh @@ -3,7 +3,7 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" case $CXX in g++) @@ -37,11 +37,11 @@ case $CXX in esac ${PRK_CXX} -v -if [ ! -d "$TRAVIS_ROOT/occa" ]; then +if [ ! 
-d "$CI_ROOT/occa" ]; then BRANCH="1.0" - git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa - CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -C $TRAVIS_ROOT/occa + git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $CI_ROOT/occa + CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -C $CI_ROOT/occa else echo "OCCA installed..." - find $TRAVIS_ROOT/occa -name occa.hpp + find $CI_ROOT/occa -name occa.hpp fi diff --git a/travis/install-octave.sh b/ci/install-octave.sh similarity index 93% rename from travis/install-octave.sh rename to ci/install-octave.sh index 90aed3a7d..ebf047cc0 100644 --- a/travis/install-octave.sh +++ b/ci/install-octave.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) diff --git a/travis/install-opencoarrays.sh b/ci/install-opencoarrays.sh similarity index 80% rename from travis/install-opencoarrays.sh rename to ci/install-opencoarrays.sh index 3ee454da3..928505312 100644 --- a/travis/install-opencoarrays.sh +++ b/ci/install-opencoarrays.sh @@ -4,9 +4,9 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" -if [ ! -d "$TRAVIS_ROOT/opencoarrays" ] ; then +if [ ! -d "$CI_ROOT/opencoarrays" ] ; then case "$os" in Darwin) echo "Mac" @@ -20,9 +20,9 @@ if [ ! -d "$TRAVIS_ROOT/opencoarrays" ] ; then ;; LinuxNoSudo) echo "Linux" - sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-mpi.sh $TRAVIS_ROOT mpich 1 - cd $TRAVIS_ROOT + sh ./ci/install-cmake.sh $CI_ROOT + sh ./ci/install-mpi.sh $CI_ROOT mpich 1 + cd $CI_ROOT git clone --depth 1 https://github.com/sourceryinstitute/opencoarrays.git opencoarrays-source cd opencoarrays-source mkdir build @@ -46,17 +46,17 @@ if [ ! -d "$TRAVIS_ROOT/opencoarrays" ] ; then export MPICH_FC=$PRK_FC mpicc -show mpif90 -show - CC=$PRK_CC FC=$PRK_FC cmake .. -DCMAKE_INSTALL_PREFIX=$TRAVIS_ROOT/opencoarrays \ + CC=$PRK_CC FC=$PRK_FC cmake .. -DCMAKE_INSTALL_PREFIX=$CI_ROOT/opencoarrays \ -DMPI_C_COMPILER=mpicc -DMPI_Fortran_COMPILER=mpif90 make -j2 ctest make install - find $TRAVIS_ROOT -name caf - find $TRAVIS_ROOT -name cafrun + find $CI_ROOT -name caf + find $CI_ROOT -name cafrun ;; esac else echo "OpenCoarrays installed..." - find $TRAVIS_ROOT -name caf - find $TRAVIS_ROOT -name cafrun + find $CI_ROOT -name caf + find $CI_ROOT -name cafrun fi diff --git a/travis/install-ornl-openshmem.sh b/ci/install-ornl-openshmem.sh similarity index 69% rename from travis/install-ornl-openshmem.sh rename to ci/install-ornl-openshmem.sh index 87ea60081..82627d02a 100644 --- a/travis/install-ornl-openshmem.sh +++ b/ci/install-ornl-openshmem.sh @@ -4,5 +4,5 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" diff --git a/travis/install-oshmpi.sh b/ci/install-oshmpi.sh similarity index 55% rename from travis/install-oshmpi.sh rename to ci/install-oshmpi.sh index 73d8cd5bd..0fc2151ea 100644 --- a/travis/install-oshmpi.sh +++ b/ci/install-oshmpi.sh @@ -3,16 +3,16 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" -if [ ! -d "$TRAVIS_ROOT/oshmpi" ]; then +if [ ! -d "$CI_ROOT/oshmpi" ]; then git clone --depth 1 https://github.com/jeffhammond/oshmpi.git cd oshmpi ./autogen.sh - ./configure CC=mpicc --prefix=$TRAVIS_ROOT/oshmpi + ./configure CC=mpicc --prefix=$CI_ROOT/oshmpi make make install else echo "OSHMPI installed..." 
- find $TRAVIS_ROOT/oshmpi -name shmem.h + find $CI_ROOT/oshmpi -name shmem.h fi diff --git a/ci/install-petsc.sh b/ci/install-petsc.sh new file mode 100644 index 000000000..da78b2913 --- /dev/null +++ b/ci/install-petsc.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +set -e +set -x + +CI_ROOT="$1" + +if [ ! -f "$CI_ROOT/petsc/include/petsc.h" ]; then + if [ -d "$CI_ROOT/petsc-src" ]; then + cd $CI_ROOT/petsc-src + git pull + else + git clone -b maint https://gitlab.com/petsc/petsc.git $CI_ROOT/petsc-src + fi + cd $CI_ROOT/petsc-src + ./configure --prefix=$CI_ROOT/petsc \ + --with-blaslapack-dir=$MKLROOT \ + --with-mpi-dir=$I_MPI_ROOT \ + --with-cxx=0 --with-fc=0 + make PETSC_DIR=$CI_ROOT/petsc-src PETSC_ARCH=arch-linux-c-debug all + make PETSC_DIR=$CI_ROOT/petsc-src PETSC_ARCH=arch-linux-c-debug install +else + echo "PETSc installed..." + cat $CI_ROOT/petsc/lib/petsc/conf/reconfigure*.py +fi diff --git a/travis/install-pstl.sh b/ci/install-pstl.sh similarity index 55% rename from travis/install-pstl.sh rename to ci/install-pstl.sh index 86f6e4f85..685b3700f 100644 --- a/travis/install-pstl.sh +++ b/ci/install-pstl.sh @@ -4,13 +4,13 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" -git clone --depth 1 https://github.com/llvm-mirror/pstl.git $TRAVIS_ROOT/llvm-pstl-git -cd $TRAVIS_ROOT/llvm-pstl-git +git clone --depth 1 https://github.com/llvm-mirror/pstl.git $CI_ROOT/llvm-pstl-git +cd $CI_ROOT/llvm-pstl-git mkdir build cd build -cmake .. -DCMAKE_INSTALL_PREFIX=$TRAVIS_ROOT/pstl +cmake .. -DCMAKE_INSTALL_PREFIX=$CI_ROOT/pstl make -j2 install #case "$os" in @@ -20,8 +20,8 @@ make -j2 install # ;; # Linux) # echo "Linux" -# if [ ! -d "$TRAVIS_ROOT/pstl" ]; then -# git clone --depth 1 https://github.com/intel/parallelstl.git $TRAVIS_ROOT/pstl +# if [ ! -d "$CI_ROOT/pstl" ]; then +# git clone --depth 1 https://github.com/intel/parallelstl.git $CI_ROOT/pstl # fi # ;; #esac diff --git a/travis/install-python.sh b/ci/install-python.sh similarity index 96% rename from travis/install-python.sh rename to ci/install-python.sh index 78eec441a..a288210df 100644 --- a/travis/install-python.sh +++ b/ci/install-python.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) diff --git a/travis/install-raja.sh b/ci/install-raja.sh similarity index 89% rename from travis/install-raja.sh rename to ci/install-raja.sh index 114b9f2a5..5e09695d4 100644 --- a/travis/install-raja.sh +++ b/ci/install-raja.sh @@ -3,7 +3,7 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" case $CXX in g++) @@ -39,18 +39,18 @@ case $CXX in esac ${PRK_CXX} -v -if [ ! -d "$TRAVIS_ROOT/raja" ]; then +if [ ! -d "$CI_ROOT/raja" ]; then BRANCH=develop git clone --recursive --depth 1 -b ${BRANCH} https://github.com/LLNL/RAJA.git cd RAJA mkdir build cd build cmake .. -DCMAKE_CXX_COMPILER=${PRK_CXX} -DCMAKE_C_COMPILER=${PRK_CC} \ - -DCMAKE_INSTALL_PREFIX=${TRAVIS_ROOT}/raja \ + -DCMAKE_INSTALL_PREFIX=${CI_ROOT}/raja \ -DENABLE_TBB=On -DENABLE_OPENMP=${USE_OPENMP} make -j2 make install -j2 else echo "RAJA installed..." 
- find $TRAVIS_ROOT/raja -name RAJA.hxx + find $CI_ROOT/raja -name RAJA.hxx fi diff --git a/ci/install-ranges.sh b/ci/install-ranges.sh new file mode 100644 index 000000000..84b0d2968 --- /dev/null +++ b/ci/install-ranges.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +set -e +set -x + +CI_ROOT="$1" +os=`uname` + +if [ "$os" = "Darwin" ] ; then + git clone --depth 1 https://github.com/ericniebler/range-v3.git $CI_ROOT/range-v3 +else + sh ./ci/install-boost.sh $CI_ROOT +fi diff --git a/travis/install-rust.sh b/ci/install-rust.sh similarity index 93% rename from travis/install-rust.sh rename to ci/install-rust.sh index 136b0b0f4..69563ae36 100644 --- a/travis/install-rust.sh +++ b/ci/install-rust.sh @@ -4,7 +4,7 @@ set -e set -x os=`uname` -TRAVIS_ROOT="$1" +CI_ROOT="$1" case "$os" in Darwin) diff --git a/travis/install-sandia-openshmem.sh b/ci/install-sandia-openshmem.sh similarity index 82% rename from travis/install-sandia-openshmem.sh rename to ci/install-sandia-openshmem.sh index 7384ea0ac..23985439a 100644 --- a/travis/install-sandia-openshmem.sh +++ b/ci/install-sandia-openshmem.sh @@ -3,8 +3,8 @@ set -e set -x -TRAVIS_ROOT="$1" -SHMEM_ROOT=$TRAVIS_ROOT/sandia-openshmem +CI_ROOT="$1" +SHMEM_ROOT=$CI_ROOT/sandia-openshmem if [ ! -d "$SHMEM_ROOT" ]; then # HEAD @@ -18,8 +18,8 @@ if [ ! -d "$SHMEM_ROOT" ]; then ./autogen.sh mkdir build cd build - # Removed # --with-pmi=$TRAVIS_ROOT/hydra per Jim - ../configure --with-libfabric=$TRAVIS_ROOT/libfabric \ + # Removed # --with-pmi=$CI_ROOT/hydra per Jim + ../configure --with-libfabric=$CI_ROOT/libfabric \ --disable-fortran \ --enable-error-checking \ --enable-pmi-simple \ diff --git a/ci/install-sycl.sh b/ci/install-sycl.sh new file mode 100644 index 000000000..805d9dde0 --- /dev/null +++ b/ci/install-sycl.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +set -e +set -x + +CI_ROOT="$1" + +git clone --depth 1 https://github.com/triSYCL/triSYCL.git $CI_ROOT/triSYCL diff --git a/travis/install-tbb.sh b/ci/install-tbb.sh similarity index 77% rename from travis/install-tbb.sh rename to ci/install-tbb.sh index 2718098e3..52cd293a0 100644 --- a/travis/install-tbb.sh +++ b/ci/install-tbb.sh @@ -3,25 +3,26 @@ set -e set -x -TRAVIS_ROOT="$1" +CI_ROOT="$1" +os=`uname` WEBSITE=https://github.com/01org/tbb/releases/download VERSION=2018_U1 DIRECTORY=tbb2018_20170919oss -case "${TRAVIS_OS_NAME}" in - osx) +case "$os" in + Darwin) echo "Mac" wget --no-check-certificate -q ${WEBSITE}/${VERSION}/${DIRECTORY}_mac.tgz tar -xzf ${DIRECTORY}_mac.tgz ;; - linux) + Linux) echo "Linux" wget --no-check-certificate -q ${WEBSITE}/${VERSION}/${DIRECTORY}_lin.tgz tar -xzf ${DIRECTORY}_lin.tgz ;; esac export TBBROOT=${PWD}/${DIRECTORY} -mv ${TBBROOT} ${TRAVIS_ROOT}/tbb -find ${TRAVIS_ROOT}/tbb -name "libtbb.so" +mv ${TBBROOT} ${CI_ROOT}/tbb +find ${CI_ROOT}/tbb -name "libtbb.so" diff --git a/travis/install-deps.sh b/travis/install-deps.sh deleted file mode 100644 index d661e293f..000000000 --- a/travis/install-deps.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/sh - -set -e -set -x - -TRAVIS_ROOT="$1" -PRK_TARGET="$2" - -# update package managers once at the beginning -case ${TRAVIS_OS_NAME} in - osx) - brew update - ;; - linux) - sudo apt-get update -y - #sudo apt-get upgrade -y - ;; -esac - -case ${TRAVIS_OS_NAME} in - osx) - MPI_IMPL=openmpi - ;; - linux) - MPI_IMPL=mpich - ;; -esac - -echo "PWD=$PWD" - -case "$PRK_TARGET" in - allserial) - echo "Serial" - ;; - alloctave) - echo "Octave" - sh ./travis/install-octave.sh $TRAVIS_ROOT - ;; - alljulia) - echo "Julia" - sh ./travis/install-julia.sh 
$TRAVIS_ROOT - ;; - allpython) - echo "Python" - sh ./travis/install-python.sh $TRAVIS_ROOT - ;; - allrust) - echo "Rust" - sh ./travis/install-rust.sh $TRAVIS_ROOT - ;; - allc1z) - echo "C1z" - if [ "${CC}" = "gcc" ] ; then - sh ./travis/install-gcc.sh $TRAVIS_ROOT - fi - if [ "${CC}" = "clang" ] ; then - sh ./travis/install-clang.sh $TRAVIS_ROOT - fi - #if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - # sh ./travis/install-musl.sh $TRAVIS_ROOT - #fi - ;; - allcxx) - echo "C++11" - if [ "${CC}" = "gcc" ] ; then - sh ./travis/install-gcc.sh $TRAVIS_ROOT - fi - if [ "${CC}" = "clang" ] ; then - sh ./travis/install-clang.sh $TRAVIS_ROOT - fi - sh ./travis/install-tbb.sh $TRAVIS_ROOT - sh ./travis/install-pstl.sh $TRAVIS_ROOT - sh ./travis/install-ranges.sh $TRAVIS_ROOT - sh ./travis/install-boost.sh $TRAVIS_ROOT - # CMake 3.10 or higher is required. - sh ./travis/install-cmake.sh $TRAVIS_ROOT - #sh ./travis/install-raja.sh $TRAVIS_ROOT - sh ./travis/install-kokkos.sh $TRAVIS_ROOT - #sh ./travis/install-occa.sh $TRAVIS_ROOT - sh ./travis/install-sycl.sh $TRAVIS_ROOT - ;; - allfortran) - echo "Fortran" - if [ "${CC}" = "gcc" ] ; then - sh ./travis/install-gcc.sh $TRAVIS_ROOT - sh ./travis/install-opencoarrays.sh $TRAVIS_ROOT - fi - ;; - allopenmp) - echo "OpenMP" - if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then - sh ./travis/install-clang.sh $TRAVIS_ROOT 3.9 - fi - ;; - allmpi) - echo "Traditional MPI" - # install except when Intel MPI used - sh ./travis/install-mpi.sh $TRAVIS_ROOT $MPI_IMPL 0 - ;; - allshmem) - echo "SHMEM" - sh ./travis/install-hydra.sh $TRAVIS_ROOT - sh ./travis/install-libfabric.sh $TRAVIS_ROOT - sh ./travis/install-sandia-openshmem.sh $TRAVIS_ROOT - ;; - allupc) - echo "UPC" - case "$UPC_IMPL" in - gupc) - # GUPC is working fine - sh ./travis/install-intrepid-upc.sh $TRAVIS_ROOT - ;; - bupc) - # BUPC is new - case $GASNET_CONDUIT in - ofi) - sh ./travis/install-hydra.sh $TRAVIS_ROOT - sh ./travis/install-libfabric.sh $TRAVIS_ROOT - ;; - mpi) - sh ./travis/install-mpi.sh $TRAVIS_ROOT $MPI_IMPL 0 - ;; - esac - sh ./travis/install-berkeley-upc.sh $TRAVIS_ROOT - ;; - esac - ;; - allcharm++) - echo "Charm++" - sh ./travis/install-charm++.sh $TRAVIS_ROOT charm++ - ;; - allampi) - echo "Adaptive MPI (AMPI)" - sh ./travis/install-charm++.sh $TRAVIS_ROOT AMPI - ;; - allfgmpi) - echo "Fine-Grain MPI (FG-MPI)" - sh ./travis/install-fgmpi.sh $TRAVIS_ROOT - ;; - allgrappa) - echo "Grappa" - sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-mpi.sh $TRAVIS_ROOT $MPI_IMPL 0 - sh ./travis/install-grappa.sh $TRAVIS_ROOT - ;; - allchapel) - echo "Chapel" - sh ./travis/install-chapel.sh $TRAVIS_ROOT - ;; - allhpx3) - echo "HPX-3" - sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-hpx3.sh $TRAVIS_ROOT - ;; - allhpx5) - echo "HPX-5" - sh ./travis/install-autotools.sh $TRAVIS_ROOT - sh ./travis/install-hpx5.sh $TRAVIS_ROOT - ;; - alllegion) - echo "Legion" - # GASNet is not needed, it seems - #sh ./travis/install-gasnet.sh $TRAVIS_ROOT - sh ./travis/install-legion.sh $TRAVIS_ROOT - ;; -esac diff --git a/travis/install-ga.sh b/travis/install-ga.sh deleted file mode 100644 index 8103ca89a..000000000 --- a/travis/install-ga.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh - -set -e -set -x - -TRAVIS_ROOT="$1" - -if [ ! 
-d "$TRAVIS_ROOT/ga" ]; then - git clone -b develop https://github.com/GlobalArrays/ga.git $TRAVIS_ROOT/ga-src - cd $TRAVIS_ROOT/ga-src - ./autogen.sh - mkdir build - cd build - #../configure CC=mpicc --prefix=$TRAVIS_ROOT/ga - #../configure --with-mpi3 MPICC=mpiicc MPICXX=mpiicpc MPIFC=mpiifort MPIF77=mpiifort --prefix=$TRAVIS_ROOT/ga && make -j8 install - ../configure --with-armci=${TRAVIS_ROOT}/armci-mpi MPICC=mpiicc MPICXX=mpiicpc MPIFC=mpiifort MPIF77=mpiifort --prefix=$TRAVIS_ROOT/ga && make -j8 install - make - make install -else - echo "Global Arrays installed..." - find $TRAVIS_ROOT/ga -name ga.h -fi diff --git a/travis/install-petsc.sh b/travis/install-petsc.sh deleted file mode 100644 index b305afd02..000000000 --- a/travis/install-petsc.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/sh - -set -e -set -x - -TRAVIS_ROOT="$1" - -if [ ! -f "$TRAVIS_ROOT/petsc/include/petsc.h" ]; then - if [ -d "$TRAVIS_ROOT/petsc-src" ]; then - cd $TRAVIS_ROOT/petsc-src - git pull - else - git clone -b maint https://gitlab.com/petsc/petsc.git $TRAVIS_ROOT/petsc-src - fi - cd $TRAVIS_ROOT/petsc-src - ./configure --prefix=$TRAVIS_ROOT/petsc \ - --with-blaslapack-dir=$MKLROOT \ - --with-mpi-dir=$I_MPI_ROOT \ - --with-cxx=0 --with-fc=0 - make PETSC_DIR=$TRAVIS_ROOT/petsc-src PETSC_ARCH=arch-linux-c-debug all - make PETSC_DIR=$TRAVIS_ROOT/petsc-src PETSC_ARCH=arch-linux-c-debug install -else - echo "PETSc installed..." - cat $TRAVIS_ROOT/petsc/lib/petsc/conf/reconfigure*.py -fi diff --git a/travis/install-ranges.sh b/travis/install-ranges.sh deleted file mode 100644 index f14532861..000000000 --- a/travis/install-ranges.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -set -e -set -x - -TRAVIS_ROOT="$1" - -if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then - git clone --depth 1 https://github.com/ericniebler/range-v3.git $TRAVIS_ROOT/range-v3 -else - sh ./travis/install-boost.sh $TRAVIS_ROOT -fi diff --git a/travis/install-sycl.sh b/travis/install-sycl.sh deleted file mode 100644 index 3ac157a3f..000000000 --- a/travis/install-sycl.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh - -set -e -set -x - -TRAVIS_ROOT="$1" - -git clone --depth 1 https://github.com/triSYCL/triSYCL.git $TRAVIS_ROOT/triSYCL From ed0cb3a895615d8f6c6b102d009db4605526f437 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 15 Dec 2021 12:57:56 +0200 Subject: [PATCH 138/325] F18 Flang cleanup --- FORTRAN/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 2624ee225..e2c61d18d 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -142,6 +142,7 @@ clean: -rm -f *.dbg -rm -f *__genmod.f90 # Intel Fortran -rm -f *__genmod.mod # Intel Fortran + -rm -f flang_unparsed_file*.f90 # F18 Flang -rm -f *.optrpt -rm -f *.dwarf -rm -rf *.dSYM # Mac From 8e76315d1918dbec9b0a58efa47053debba5e894 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 15:11:41 +0200 Subject: [PATCH 139/325] add warning for weird transpose OpenCL error --- Cxx11/transpose-opencl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cxx11/transpose-opencl.cc b/Cxx11/transpose-opencl.cc index 78b61d61d..4ebe642b9 100644 --- a/Cxx11/transpose-opencl.cc +++ b/Cxx11/transpose-opencl.cc @@ -168,6 +168,8 @@ int main(int argc, char* argv[]) throw "ERROR: Matrix Order must be greater than 0"; } else if (order > prk::get_max_matrix_size()) { throw "ERROR: matrix dimension too large - overflow risk"; + } else if (order > 1234) { + std::cerr << "WARNING: answer might be wrong, because order>1234 breaks for unknown reasons\n"; } } catch 
(const char * e) { From 344aac3a8df6d716ed2dcc7f50ed6860d8230b4d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 15:17:09 +0200 Subject: [PATCH 140/325] silence stupid warning --- Cxx11/prk_mpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/prk_mpi.h b/Cxx11/prk_mpi.h index 15750b51e..8a05e0325 100644 --- a/Cxx11/prk_mpi.h +++ b/Cxx11/prk_mpi.h @@ -201,7 +201,7 @@ namespace prk global_size_ = global_size; local_size_ = global_size_ / np_; const size_t remainder = global_size_ % np_; - if (me_ < remainder) local_size_++; + if ((size_t)me_ < remainder) local_size_++; { MPI_Datatype dt = (std::is_signed() ? MPI_INT64_T : MPI_UINT64_T); From b23941b6cce6596416bed97935282456e4bf8b98 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 15:34:47 +0200 Subject: [PATCH 141/325] need Python3 --- PYTHON/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PYTHON/README.md b/PYTHON/README.md index 33354905f..7f670436f 100644 --- a/PYTHON/README.md +++ b/PYTHON/README.md @@ -1,8 +1,8 @@ # How to run ``` - mpiexec -n 4 python -m mpi4py nstream-numpy-mpi.py 10 10000000 - mpiexec -n 4 python -m mpi4py transpose-numpy-mpi.py 10 1000 + mpiexec -n 4 python3 -m mpi4py nstream-numpy-mpi.py 10 10000000 + mpiexec -n 4 python3 -m mpi4py transpose-numpy-mpi.py 10 1000 ``` On Mac with Homebrew, this might work better: From 91c0e87251085e60e6254dd8a3a0e9a6baa40cfb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 15:50:58 +0200 Subject: [PATCH 142/325] use modules --- FORTRAN/Makefile | 66 +++++++++++++++++---------------- FORTRAN/prk_mod.F90 | 11 ++++++ FORTRAN/transpose-cufortran.cuf | 10 +---- FORTRAN/transpose.F90 | 11 +----- 4 files changed, 48 insertions(+), 50 deletions(-) create mode 100644 FORTRAN/prk_mod.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index e2c61d18d..3d7bdfaa0 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -90,53 +90,57 @@ cufortran: nstream-cufortran transpose-cufortran blas: dgemm-blas -%: %.F90 - $(FC) $(FCFLAGS) $< -o $@ +%: %.F90 prk.mod + $(FC) $(FCFLAGS) $< prk_mod.o -o $@ -stencil: stencil.F90 stencil_serial.F90 - #$(FC) $(FCFLAGS) -c stencil_serial.F90 -o stencil_serial.o - $(FC) $(FCFLAGS) $< -o $@ +prk.mod: prk_mod.F90 + $(FC) $(FCFLAGS) -c $< -o prk_mod.o -dgemm-pretty: dgemm-pretty.F90 - $(FC) $(FCFLAGS) $< $(BLASFLAGS) $(STDPARFLAG) -o $@ +stencil: stencil.F90 stencil_serial.F90 prk.mod + #$(FC) $(FCFLAGS) -c stencil_serial.F90 prk_mod.o -o stencil_serial.o + $(FC) $(FCFLAGS) $< prk_mod.o -o $@ -dgemm-blas: dgemm-blas.F90 - $(FC) $(FCFLAGS) $< $(BLASFLAGS) -o $@ +dgemm-pretty: dgemm-pretty.F90 prk.mod + $(FC) $(FCFLAGS) $< prk_mod.o $(BLASFLAGS) $(STDPARFLAG) -o $@ -%-pretty: %-pretty.F90 - $(FC) $(FCFLAGS) $(STDPARFLAG) $< -o $@ +dgemm-blas: dgemm-blas.F90 prk.mod + $(FC) $(FCFLAGS) $< prk_mod.o $(BLASFLAGS) -o $@ -#%-openmp: %.F90 -# $(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@ +%-pretty: %-pretty.F90 prk.mod + $(FC) $(FCFLAGS) $(STDPARFLAG) $< prk_mod.o -o $@ -%-openmp: %-openmp.F90 - $(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@ +#%-openmp: %.F90 prk.mod +# $(FC) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o -o $@ -%-ga: %-ga.F90 - $(MPIFORT) $(FCFLAGS) $< $(GAFLAG) -o $@ +%-openmp: %-openmp.F90 prk.mod + $(FC) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o -o $@ -%-mpi-openmp: %-mpi.F90 - $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< -o $@ +%-ga: %-ga.F90 prk.mod + $(MPIFORT) $(FCFLAGS) $< prk_mod.o $(GAFLAG) -o $@ -%-mpi: %-mpi.F90 - $(MPIFORT) $(FCFLAGS) $< -o $@ +%-mpi-openmp: 
%-mpi.F90 prk.mod + $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o -o $@ -%-coarray: %-coarray.F90 - $(CAFC) $(FCFLAGS) $< $(COARRAYFLAG) -o $@ +%-mpi: %-mpi.F90 prk.mod + $(MPIFORT) $(FCFLAGS) $< prk_mod.o -o $@ -%-target: %-target.F90 - $(FC) $(FCFLAGS) $(OPENMPFLAG) $(OFFLOADFLAG) $< -o $@ +%-coarray: %-coarray.F90 prk.mod + $(CAFC) $(FCFLAGS) $< prk_mod.o $(COARRAYFLAG) -o $@ -%-openacc: %-openacc.F90 - $(FC) $(FCFLAGS) $(OPENACCFLAG) $< -o $@ +%-target: %-target.F90 prk.mod + $(FC) $(FCFLAGS) $(OPENMPFLAG) $(OFFLOADFLAG) $< prk_mod.o -o $@ -%-cufortran: %-cufortran.cuf - $(FC) $(FCFLAGS) $(CUFORTFLAG) $< -o $@ +%-openacc: %-openacc.F90 prk.mod + $(FC) $(FCFLAGS) $(OPENACCFLAG) $< prk_mod.o -o $@ -%-stdpar: %-stdpar.F90 - $(FC) $(FCFLAGS) $(STDPARFLAG) $< -o $@ +%-cufortran: %-cufortran.cuf prk.mod + $(FC) $(FCFLAGS) $(CUFORTFLAG) $< prk_mod.o -o $@ + +%-stdpar: %-stdpar.F90 prk.mod + $(FC) $(FCFLAGS) $(STDPARFLAG) $< prk_mod.o -o $@ clean: + -rm -f prk.mod -rm -f *.o -rm -f *.i90 -rm -f *.dbg diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 new file mode 100644 index 000000000..5172381e5 --- /dev/null +++ b/FORTRAN/prk_mod.F90 @@ -0,0 +1,11 @@ +module prk + contains + function prk_get_wtime() result(t) + use iso_fortran_env + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) + end function prk_get_wtime +end module prk diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index 2495de2b9..c695516ab 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -52,15 +52,6 @@ ! Converted to Fortran by Jeff Hammond, January 2015 ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - module transpose use iso_fortran_env integer(kind=INT32), parameter :: tile_dim = 32 @@ -125,6 +116,7 @@ program main use iso_fortran_env use cudafor use transpose + use prk implicit none real(kind=REAL64) :: prk_get_wtime ! for argument parsing diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 929594e6c..0be71030f 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -53,19 +53,10 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! 
for argument parsing integer :: err integer :: arglen From 3f72a6bdab70a8b3a7f3a6be63279289db28d8cf Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 17:02:54 +0200 Subject: [PATCH 143/325] move reusable PRK stuff to module --- FORTRAN/dgemm-blas.F90 | 11 +------ FORTRAN/dgemm-openmp.F90 | 11 +------ FORTRAN/dgemm-pretty.F90 | 11 +------ FORTRAN/dgemm-stdpar.F90 | 11 +------ FORTRAN/dgemm-taskloop-openmp.F90 | 11 +------ FORTRAN/dgemm.F90 | 11 +------ FORTRAN/nstream-coarray.F90 | 11 +------ FORTRAN/nstream-cufortran.cuf | 11 +------ FORTRAN/nstream-mpi.F90 | 14 --------- FORTRAN/nstream-openacc.F90 | 11 +------ FORTRAN/nstream-pretty.F90 | 11 +------ FORTRAN/nstream-stdpar.F90 | 11 +------ FORTRAN/nstream.F90 | 11 +------ FORTRAN/p2p-async-openacc.F90 | 11 +------ FORTRAN/p2p-coarray.F90 | 11 +------ FORTRAN/p2p-innerloop-openacc.F90 | 11 +------ FORTRAN/p2p-innerloop.F90 | 11 +------ FORTRAN/p2p-openacc.F90 | 11 +------ FORTRAN/p2p.F90 | 11 +------ FORTRAN/pic-openmp.F90 | 15 +-------- FORTRAN/pic.F90 | 13 +------- FORTRAN/pic_soa-openmp.F90 | 13 +------- FORTRAN/pic_soa.F90 | 13 +------- FORTRAN/prk_mod.F90 | 31 +++++++++++++++++++ FORTRAN/stencil-coarray.F90 | 42 +------------------------ FORTRAN/stencil-openacc.F90 | 48 ++--------------------------- FORTRAN/stencil-openmp-target.F90 | 32 +------------------ FORTRAN/stencil-openmp.F90 | 33 +------------------- FORTRAN/stencil-pretty.F90 | 43 +------------------------- FORTRAN/stencil-stdpar.F90 | 42 +------------------------ FORTRAN/stencil-taskloop-openmp.F90 | 32 +------------------ FORTRAN/stencil.F90 | 47 ++-------------------------- FORTRAN/transpose-coarray.F90 | 11 +------ FORTRAN/transpose-cufortran.cuf | 1 - FORTRAN/transpose-openacc.F90 | 11 +------ FORTRAN/transpose-pretty.F90 | 11 +------ FORTRAN/transpose-stdpar.F90 | 11 +------ 37 files changed, 68 insertions(+), 593 deletions(-) diff --git a/FORTRAN/dgemm-blas.F90 b/FORTRAN/dgemm-blas.F90 index bde57248e..171ca5113 100644 --- a/FORTRAN/dgemm-blas.F90 +++ b/FORTRAN/dgemm-blas.F90 @@ -51,22 +51,13 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env #ifdef _OPENMP use omp_lib #endif + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/dgemm-openmp.F90 b/FORTRAN/dgemm-openmp.F90 index a2387a535..0405e6b1d 100644 --- a/FORTRAN/dgemm-openmp.F90 +++ b/FORTRAN/dgemm-openmp.F90 @@ -52,15 +52,6 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - subroutine prk_dgemm(order, tile_size, A, B, C) use iso_fortran_env implicit none @@ -106,8 +97,8 @@ end subroutine prk_dgemm program main use iso_fortran_env use omp_lib + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/dgemm-pretty.F90 b/FORTRAN/dgemm-pretty.F90 index ad0285fdc..592bfa6e3 100644 --- a/FORTRAN/dgemm-pretty.F90 +++ b/FORTRAN/dgemm-pretty.F90 @@ -52,23 +52,14 @@ ! ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env #ifdef NVHPC use cutensorex use cudafor #endif + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/dgemm-stdpar.F90 b/FORTRAN/dgemm-stdpar.F90 index f55ad5b35..84e078dc3 100644 --- a/FORTRAN/dgemm-stdpar.F90 +++ b/FORTRAN/dgemm-stdpar.F90 @@ -52,17 +52,9 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - subroutine prk_dgemm(order, tile_size, A, B, C) use iso_fortran_env + use prk implicit none integer(kind=INT32), intent(in) :: order, tile_size real(kind=REAL64), intent(in) :: A(order,order) @@ -98,7 +90,6 @@ end subroutine prk_dgemm program main use iso_fortran_env implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/dgemm-taskloop-openmp.F90 b/FORTRAN/dgemm-taskloop-openmp.F90 index 41ab1817e..db02c0402 100644 --- a/FORTRAN/dgemm-taskloop-openmp.F90 +++ b/FORTRAN/dgemm-taskloop-openmp.F90 @@ -52,15 +52,6 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - subroutine prk_dgemm(order, tile_size, A, B, C) use iso_fortran_env implicit none @@ -113,8 +104,8 @@ end subroutine prk_dgemm program main use iso_fortran_env use omp_lib + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/dgemm.F90 b/FORTRAN/dgemm.F90 index ff5afe2ab..1167f589d 100644 --- a/FORTRAN/dgemm.F90 +++ b/FORTRAN/dgemm.F90 @@ -52,15 +52,6 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - subroutine prk_dgemm(order, tile_size, A, B, C) use iso_fortran_env implicit none @@ -112,8 +103,8 @@ end subroutine prk_dgemm program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/nstream-coarray.F90 b/FORTRAN/nstream-coarray.F90 index 5e2fa9259..843f7d210 100644 --- a/FORTRAN/nstream-coarray.F90 +++ b/FORTRAN/nstream-coarray.F90 @@ -63,19 +63,10 @@ ! ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime integer :: me, np, p ! for argument parsing integer :: err diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf index 6962cb610..4a0586491 100644 --- a/FORTRAN/nstream-cufortran.cuf +++ b/FORTRAN/nstream-cufortran.cuf @@ -63,15 +63,6 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - module nstream use iso_fortran_env contains @@ -93,8 +84,8 @@ program main use iso_fortran_env use cudafor use nstream + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 57f1342c1..c9b92bd2b 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -63,17 +63,6 @@ ! ! ******************************************************************* -#ifndef _OPENMP -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime -#endif - program main use iso_fortran_env #ifdef _OPENMP @@ -81,9 +70,6 @@ program main #endif use mpi_f08 implicit none -#ifndef _OPENMP - real(kind=REAL64) :: prk_get_wtime -#endif ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/nstream-openacc.F90 b/FORTRAN/nstream-openacc.F90 index c04364a96..9f71b155a 100644 --- a/FORTRAN/nstream-openacc.F90 +++ b/FORTRAN/nstream-openacc.F90 @@ -63,19 +63,10 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/nstream-pretty.F90 b/FORTRAN/nstream-pretty.F90 index 59205a6e7..39240f632 100644 --- a/FORTRAN/nstream-pretty.F90 +++ b/FORTRAN/nstream-pretty.F90 @@ -63,19 +63,10 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/nstream-stdpar.F90 b/FORTRAN/nstream-stdpar.F90 index 43c0e442f..de4d98671 100644 --- a/FORTRAN/nstream-stdpar.F90 +++ b/FORTRAN/nstream-stdpar.F90 @@ -63,19 +63,10 @@ ! ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90 index 86bc57814..d8e4c184a 100644 --- a/FORTRAN/nstream.F90 +++ b/FORTRAN/nstream.F90 @@ -63,19 +63,10 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/p2p-async-openacc.F90 b/FORTRAN/p2p-async-openacc.F90 index e42cbb46d..8104f6b9e 100644 --- a/FORTRAN/p2p-async-openacc.F90 +++ b/FORTRAN/p2p-async-openacc.F90 @@ -54,17 +54,9 @@ ! Converted to Fortran by Jeff Hammond, January 2016. ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) use iso_fortran_env + use prk implicit none integer(kind=INT32), intent(in) :: m,n integer(kind=INT32), intent(in) :: startm,endm @@ -83,7 +75,6 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) program main use iso_fortran_env implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/p2p-coarray.F90 b/FORTRAN/p2p-coarray.F90 index d2846eda7..2766b6291 100644 --- a/FORTRAN/p2p-coarray.F90 +++ b/FORTRAN/p2p-coarray.F90 @@ -57,19 +57,10 @@ ! - Minor bug fixes by Izaak "Zaak" Beekman, March 2017 ! ******************************************************************** -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/p2p-innerloop-openacc.F90 b/FORTRAN/p2p-innerloop-openacc.F90 index ee35ca58a..4c670ca77 100644 --- a/FORTRAN/p2p-innerloop-openacc.F90 +++ b/FORTRAN/p2p-innerloop-openacc.F90 @@ -54,19 +54,10 @@ ! Converted to Fortran by Jeff Hammond, January 2016. ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! 
for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/p2p-innerloop.F90 b/FORTRAN/p2p-innerloop.F90 index db01f6b6b..b7d884e31 100644 --- a/FORTRAN/p2p-innerloop.F90 +++ b/FORTRAN/p2p-innerloop.F90 @@ -54,22 +54,13 @@ ! Converted to Fortran by Jeff Hammond, January 2016. ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env #ifdef _OPENMP use omp_lib #endif + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/p2p-openacc.F90 b/FORTRAN/p2p-openacc.F90 index 18ee965e2..eb57bfb09 100644 --- a/FORTRAN/p2p-openacc.F90 +++ b/FORTRAN/p2p-openacc.F90 @@ -54,19 +54,10 @@ ! Converted to Fortran by Jeff Hammond, January 2016. ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/p2p.F90 b/FORTRAN/p2p.F90 index 334120cd0..29d328a91 100644 --- a/FORTRAN/p2p.F90 +++ b/FORTRAN/p2p.F90 @@ -54,15 +54,6 @@ ! Converted to Fortran by Jeff Hammond, January 2016. ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) use iso_fortran_env implicit none @@ -80,8 +71,8 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! 
for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/pic-openmp.F90 b/FORTRAN/pic-openmp.F90 index 30b6bdf48..55dce51c4 100644 --- a/FORTRAN/pic-openmp.F90 +++ b/FORTRAN/pic-openmp.F90 @@ -42,26 +42,13 @@ #define FAILURE 0 #define epsilon 1.e-5 -#ifndef _OPENMP -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime -#endif - program pic use, intrinsic :: ISO_FORTRAN_ENV, only : REAL64, REAL32, INT64, INT32 #ifdef _OPENMP use omp_lib #endif + use prk implicit none -#ifndef _OPENMP - real(kind=REAL64) :: prk_get_wtime -#endif type particle_t real(kind=REAL64) :: x, y, v_x, v_y, q, x0, y0 diff --git a/FORTRAN/pic.F90 b/FORTRAN/pic.F90 index 30b6bdf48..2f43c7e1d 100644 --- a/FORTRAN/pic.F90 +++ b/FORTRAN/pic.F90 @@ -42,25 +42,14 @@ #define FAILURE 0 #define epsilon 1.e-5 -#ifndef _OPENMP -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime -#endif - program pic use, intrinsic :: ISO_FORTRAN_ENV, only : REAL64, REAL32, INT64, INT32 #ifdef _OPENMP use omp_lib #endif + use prk implicit none #ifndef _OPENMP - real(kind=REAL64) :: prk_get_wtime #endif type particle_t diff --git a/FORTRAN/pic_soa-openmp.F90 b/FORTRAN/pic_soa-openmp.F90 index 229638aeb..f59fffd69 100644 --- a/FORTRAN/pic_soa-openmp.F90 +++ b/FORTRAN/pic_soa-openmp.F90 @@ -42,25 +42,14 @@ #define FAILURE 0 #define epsilon 1.e-5 -#ifndef _OPENMP -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime -#endif - program pic use, intrinsic :: ISO_FORTRAN_ENV, only : REAL64, REAL32, INT64, INT32 #ifdef _OPENMP use omp_lib #endif + use prk implicit none #ifndef _OPENMP - real(kind=REAL64) :: prk_get_wtime #endif type particle_t diff --git a/FORTRAN/pic_soa.F90 b/FORTRAN/pic_soa.F90 index 229638aeb..f59fffd69 100644 --- a/FORTRAN/pic_soa.F90 +++ b/FORTRAN/pic_soa.F90 @@ -42,25 +42,14 @@ #define FAILURE 0 #define epsilon 1.e-5 -#ifndef _OPENMP -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime -#endif - program pic use, intrinsic :: ISO_FORTRAN_ENV, only : REAL64, REAL32, INT64, INT32 #ifdef _OPENMP use omp_lib #endif + use prk implicit none #ifndef _OPENMP - real(kind=REAL64) :: prk_get_wtime #endif type particle_t diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 5172381e5..72821cfd0 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -8,4 +8,35 @@ function prk_get_wtime() result(t) call system_clock(count = c, count_rate = r) t = real(c,REAL64) / real(r,REAL64) end function prk_get_wtime + + subroutine initialize_w(is_star,r,W) + use iso_fortran_env + implicit none + logical, intent(in) :: is_star + integer(kind=INT32), intent(in) :: r + real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) + integer(kind=INT32) :: ii, jj + ! 
fill the stencil weights to reflect a discrete divergence operator + W = 0.0d0 + if (is_star) then + do ii=1,r + W(0, ii) = 1.0d0/real(2*ii*r,REAL64) + W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) + W( ii,0) = 1.0d0/real(2*ii*r,REAL64) + W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) + enddo + else + do jj=1,r + do ii=-jj+1,jj-1 + W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) + W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) + W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) + W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) + enddo + W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) + W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) + enddo + endif + end subroutine initialize_w + end module prk diff --git a/FORTRAN/stencil-coarray.F90 b/FORTRAN/stencil-coarray.F90 index 5e5a6293e..3c7e8e3b1 100644 --- a/FORTRAN/stencil-coarray.F90 +++ b/FORTRAN/stencil-coarray.F90 @@ -62,46 +62,6 @@ ! Izaak "Zaak" Beekman, March 2017 ! ************************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -185,8 +145,8 @@ end subroutine apply_stencil program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index 02a1eecca..a6aec7726 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -61,46 +61,6 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! 
Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -183,8 +143,8 @@ end subroutine apply_stencil program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen @@ -320,11 +280,9 @@ program main if (k.eq.1) t0 = prk_get_wtime() - ! DEVICE ! Apply the stencil operator call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - ! DEVICE ! add constant to solution to force refresh of neighbor data, if any !$acc parallel loop collapse(2) do j=1,n @@ -339,14 +297,14 @@ program main !$acc end data - stencil_time = t1 - t0 - !$acc parallel loop collapse(2) reduction(+:norm) do j=r,n-r do i=r,n-r norm = norm + abs(B(i,j)) enddo enddo + + stencil_time = t1 - t0 norm = norm / real(active_points,REAL64) !****************************************************************************** diff --git a/FORTRAN/stencil-openmp-target.F90 b/FORTRAN/stencil-openmp-target.F90 index 96ed3462f..6dcfe97b6 100644 --- a/FORTRAN/stencil-openmp-target.F90 +++ b/FORTRAN/stencil-openmp-target.F90 @@ -61,37 +61,6 @@ ! ! ******************************************************************* -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -181,6 +150,7 @@ end subroutine apply_stencil program main use iso_fortran_env use omp_lib + use prk implicit none ! for argument parsing integer :: err diff --git a/FORTRAN/stencil-openmp.F90 b/FORTRAN/stencil-openmp.F90 index 2377c763a..83f191c65 100644 --- a/FORTRAN/stencil-openmp.F90 +++ b/FORTRAN/stencil-openmp.F90 @@ -61,37 +61,6 @@ ! ! ******************************************************************* -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! 
fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -176,6 +145,7 @@ end subroutine apply_stencil program main use iso_fortran_env use omp_lib + use prk implicit none ! for argument parsing integer :: err @@ -316,7 +286,6 @@ program main do k=0,iterations - ! start timer after a warmup iteration !$omp barrier !$omp master if (k.eq.1) then diff --git a/FORTRAN/stencil-pretty.F90 b/FORTRAN/stencil-pretty.F90 index 1119ff731..68386f5c5 100644 --- a/FORTRAN/stencil-pretty.F90 +++ b/FORTRAN/stencil-pretty.F90 @@ -60,46 +60,6 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -183,8 +143,8 @@ end subroutine apply_stencil program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen @@ -295,7 +255,6 @@ program main do k=0,iterations - ! start timer after a warmup iteration if (k.eq.1) t0 = prk_get_wtime() ! Apply the stencil operator diff --git a/FORTRAN/stencil-stdpar.F90 b/FORTRAN/stencil-stdpar.F90 index f0ea2ad3c..c8ce01f9e 100644 --- a/FORTRAN/stencil-stdpar.F90 +++ b/FORTRAN/stencil-stdpar.F90 @@ -61,46 +61,6 @@ ! ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -175,8 +135,8 @@ end subroutine apply_stencil program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/stencil-taskloop-openmp.F90 b/FORTRAN/stencil-taskloop-openmp.F90 index ff72aab62..495fa0626 100644 --- a/FORTRAN/stencil-taskloop-openmp.F90 +++ b/FORTRAN/stencil-taskloop-openmp.F90 @@ -60,37 +60,6 @@ ! ! ******************************************************************* -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -177,6 +146,7 @@ end subroutine apply_stencil program main use iso_fortran_env use omp_lib + use prk implicit none ! for argument parsing integer :: err diff --git a/FORTRAN/stencil.F90 b/FORTRAN/stencil.F90 index 70c5fa3d6..8ca6116be 100644 --- a/FORTRAN/stencil.F90 +++ b/FORTRAN/stencil.F90 @@ -61,46 +61,6 @@ ! ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - -subroutine initialize_w(is_star,r,W) - use iso_fortran_env - implicit none - logical, intent(in) :: is_star - integer(kind=INT32), intent(in) :: r - real(kind=REAL64), intent(inout) :: W(-r:r,-r:r) - integer(kind=INT32) :: ii, jj - ! fill the stencil weights to reflect a discrete divergence operator - W = 0.0d0 - if (is_star) then - do ii=1,r - W(0, ii) = 1.0d0/real(2*ii*r,REAL64) - W(0,-ii) = -1.0d0/real(2*ii*r,REAL64) - W( ii,0) = 1.0d0/real(2*ii*r,REAL64) - W(-ii,0) = -1.0d0/real(2*ii*r,REAL64) - enddo - else - ! Jeff: check that this is correct with the new W indexing - do jj=1,r - do ii=-jj+1,jj-1 - W( ii, jj) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( ii,-jj) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W( jj, ii) = 1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - W(-jj, ii) = -1.0d0/real(4*jj*(2*jj-1)*r,REAL64) - enddo - W( jj, jj) = 1.0d0/real(4*jj*r,REAL64) - W(-jj,-jj) = -1.0d0/real(4*jj*r,REAL64) - enddo - endif -end subroutine initialize_w - subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none @@ -176,8 +136,8 @@ end subroutine apply_stencil program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen @@ -297,7 +257,6 @@ program main call initialize_w(is_star,r,W) - ! intialize the input and output arrays do j=1,n do i=1,n A(i,j) = cx*i+cy*j @@ -309,9 +268,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + if (k.eq.1) t0 = prk_get_wtime() ! Apply the stencil operator call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index 0093a3493..13e2ba39d 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -54,19 +54,10 @@ ! Izaak "Zaak" Beekman ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime integer :: me, npes logical :: printer ! for argument parsing diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.cuf index c695516ab..fb203a4c9 100644 --- a/FORTRAN/transpose-cufortran.cuf +++ b/FORTRAN/transpose-cufortran.cuf @@ -118,7 +118,6 @@ program main use transpose use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index af6798745..b0da3c8e2 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -52,19 +52,10 @@ ! Converted to Fortran by Jeff Hammond, January 2015 ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/transpose-pretty.F90 b/FORTRAN/transpose-pretty.F90 index 6185431a4..b2ba96d05 100644 --- a/FORTRAN/transpose-pretty.F90 +++ b/FORTRAN/transpose-pretty.F90 @@ -49,19 +49,10 @@ ! ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 4f9a93aa3..c69dc9a9b 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -52,19 +52,10 @@ ! Converted to Fortran by Jeff Hammond, January 2015 ! ******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env + use prk implicit none - real(kind=REAL64) :: prk_get_wtime ! 
for argument parsing integer :: err integer :: arglen From 1d2212d24ef48a042b51a23c68240fadf5e3f45b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 17:14:24 +0200 Subject: [PATCH 144/325] remove pointless preprocessor --- FORTRAN/pic.F90 | 2 -- FORTRAN/pic_soa-openmp.F90 | 2 -- FORTRAN/pic_soa.F90 | 2 -- 3 files changed, 6 deletions(-) diff --git a/FORTRAN/pic.F90 b/FORTRAN/pic.F90 index 2f43c7e1d..55dce51c4 100644 --- a/FORTRAN/pic.F90 +++ b/FORTRAN/pic.F90 @@ -49,8 +49,6 @@ program pic #endif use prk implicit none -#ifndef _OPENMP -#endif type particle_t real(kind=REAL64) :: x, y, v_x, v_y, q, x0, y0 diff --git a/FORTRAN/pic_soa-openmp.F90 b/FORTRAN/pic_soa-openmp.F90 index f59fffd69..cda3cc646 100644 --- a/FORTRAN/pic_soa-openmp.F90 +++ b/FORTRAN/pic_soa-openmp.F90 @@ -49,8 +49,6 @@ program pic #endif use prk implicit none -#ifndef _OPENMP -#endif type particle_t real(kind=REAL64) :: x, y, v_x, v_y, q, x0, y0 diff --git a/FORTRAN/pic_soa.F90 b/FORTRAN/pic_soa.F90 index f59fffd69..cda3cc646 100644 --- a/FORTRAN/pic_soa.F90 +++ b/FORTRAN/pic_soa.F90 @@ -49,8 +49,6 @@ program pic #endif use prk implicit none -#ifndef _OPENMP -#endif type particle_t real(kind=REAL64) :: x, y, v_x, v_y, q, x0, y0 From 8bef4b12fdfad72710061493324380d5ed9073f3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 17:41:08 +0200 Subject: [PATCH 145/325] change npes to np --- FORTRAN/transpose-coarray.F90 | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index 13e2ba39d..3f99fdf5a 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -58,12 +58,12 @@ program main use iso_fortran_env use prk implicit none - integer :: me, npes - logical :: printer ! for argument parsing integer :: err integer :: arglen character(len=32) :: argtmp + integer :: me, np + logical :: printer ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -72,7 +72,7 @@ program main real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile integer(kind=INT64) :: bytes ! combined size of matrices ! distributed data helpers - integer(kind=INT32) :: col_per_pe ! columns per PE = order/npes + integer(kind=INT32) :: col_per_pe ! columns per PE = order/np integer(kind=INT32) :: col_start, row_start ! runtime variables integer(kind=INT32) :: i, j, k, p, q @@ -82,7 +82,7 @@ program main real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance me = this_image()-1 ! use 0-based indexing of PEs - npes = num_images() + np = num_images() printer = (me.eq.0) ! ******************************************************************** @@ -121,14 +121,14 @@ program main endif stop 1 endif - if (modulo(order,npes).gt.0) then + if (modulo(order,np).gt.0) then if (printer) then write(6,'(a20,i5,a35,i5)') 'ERROR: matrix order ',order,& - ' should be divisible by # images ',npes + ' should be divisible by # images ',np endif stop 1 endif - col_per_pe = order/npes + col_per_pe = order/np ! 
same default as the C implementation tile_size = 32 @@ -167,7 +167,7 @@ program main bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A)/8 if (printer) then - write(6,'(a23,i8)') 'Number of images = ', npes + write(6,'(a23,i8)') 'Number of images = ', np write(6,'(a23,i8)') 'Number of iterations = ', iterations write(6,'(a23,i8)') 'Matrix order = ', order write(6,'(a23,i8)') 'Tile size = ', tile_size @@ -204,12 +204,12 @@ program main t0 = prk_get_wtime() endif - ! we shift the loop range from [0,npes-1] to [me,me+npes-1] + ! we shift the loop range from [0,np-1] to [me,me+np-1] ! to balance communication. if everyone starts at 0, they will ! take turns blasting each image in the system with get operations. ! side note: this trick is used extensively in NWChem. - do q=me,me+npes-1 - p = modulo(q,npes) + do q=me,me+np-1 + p = modulo(q,np) ! Step 1: Gather A tile from remote image row_start = me*col_per_pe ! * fully explicit version @@ -289,7 +289,7 @@ program main deallocate( B ) - if (abserr .lt. (epsilon/npes)) then + if (abserr .lt. (epsilon/np)) then if (printer) then write(6,'(a)') 'Solution validates' avgtime = trans_time/iterations @@ -299,7 +299,7 @@ program main else if (printer) then write(6,'(a30,f13.6,a18,f13.6)') 'ERROR: Aggregate squared error ', & - abserr,' exceeds threshold ',(epsilon/npes) + abserr,' exceeds threshold ',(epsilon/np) endif stop 1 endif From 6be8f5eedd83135c9b57fb0a443dd695131d1cde Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Nov 2021 17:42:26 +0200 Subject: [PATCH 146/325] update MPI stuff, add transpose --- FORTRAN/Makefile | 2 +- FORTRAN/nstream-mpi.F90 | 37 +++----- FORTRAN/transpose-mpi.F90 | 184 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 26 deletions(-) create mode 100644 FORTRAN/transpose-mpi.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 3d7bdfaa0..2ab78cb65 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -72,7 +72,7 @@ taskloop: stencil-taskloop-openmp transpose-taskloop-openmp nstream-taskloop-ope coarray: nstream-coarray p2p-coarray stencil-coarray transpose-coarray -mpi: nstream-mpi +mpi: nstream-mpi transpose-mpi mpi-openmp: nstream-mpi-openmp diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index c9b92bd2b..e428a8efc 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -92,9 +92,11 @@ program main integer(kind=INT32) :: me, np, provided call MPI_Init_thread(MPI_THREAD_FUNNELED,provided) +#ifdef _OPENMP if (provided.eq.MPI_THREAD_SINGLE) then call MPI_Abort(MPI_COMM_WORLD,1) endif +#endif call MPI_Comm_rank(MPI_COMM_WORLD, me) call MPI_Comm_size(MPI_COMM_WORLD, np) @@ -111,10 +113,8 @@ program main #endif if (command_argument_count().lt.2) then - if (me.eq.0) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a49)') 'Usage: ./nstream <# iterations> ' - endif + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a49)') 'Usage: ./nstream <# iterations> ' call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) endif @@ -135,10 +135,11 @@ program main endif #ifdef _OPENMP - write(*,'(a,i12)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a23,i8)') 'Number of threads = ', omp_get_max_threads() #endif - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length + write(*,'(a23,i8)') 'Number of MPI procs = ', np + write(*,'(a23,i8)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length endif 
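! Aside on the loop staggering used in transpose-coarray.F90 above (comment
! only, not part of any patch; names follow that code): with
!
!   do q=me,me+np-1
!     p = modulo(q,np)   ! image whose block is fetched in this step
!     ...
!   enddo
!
! every image starts the sweep at itself, so at each step the np one-sided
! get operations target np different images. A plain do p=0,np-1 loop would
! instead have all images fetch from image 0 first, then image 1, and so on,
! concentrating the communication on one image at a time.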
call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(length, 1, MPI_INTEGER8, 0, MPI_COMM_WORLD) @@ -147,24 +148,12 @@ program main ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err call MPI_Abort(MPI_COMM_WORLD, 10) endif - allocate( B(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - call MPI_Abort(MPI_COMM_WORLD, 11) - endif - - allocate( C(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - call MPI_Abort(MPI_COMM_WORLD, 12) - endif - scalar = 3 t0 = 0 @@ -277,9 +266,7 @@ program main #endif call MPI_Allreduce(MPI_IN_PLACE, asum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD) - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then if (me.eq.0) then @@ -297,7 +284,7 @@ program main write(*,'(a12,f15.3,1x,a12,e15.6)') & 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & 'Avg time (s): ', avgtime - endif + endif endif call MPI_Finalize() diff --git a/FORTRAN/transpose-mpi.F90 b/FORTRAN/transpose-mpi.F90 new file mode 100644 index 000000000..d9920bbad --- /dev/null +++ b/FORTRAN/transpose-mpi.F90 @@ -0,0 +1,184 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! 
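! (Illustrative invocation, not part of the original header; it assumes the
!  binary built from the transpose-mpi Makefile target and an order that is
!  a multiple of the number of ranks:
!      mpiexec -n 4 ./transpose-mpi 10 1024
!  i.e. 10 iterations on a 1024x1024 matrix split over 4 ranks.)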
+! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! MPI by Jeff Hammond, November 2021 +! ******************************************************************* + +program main + use iso_fortran_env + use mpi_f08 + implicit none + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations + integer(kind=INT32) :: order + real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix + real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix + real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile + real(kind=REAL64), parameter :: one=1.0d0 + integer(kind=INT64) :: bytes + ! distributed data helpers + integer(kind=INT32) :: col_per_pe ! columns per PE = order/np + integer(kind=INT32) :: col_start, row_start + ! runtime variables + integer(kind=INT32) :: i, j, k, p, q + integer(kind=INT32) :: it, jt, tile_size + real(kind=REAL64) :: abserr, addit, temp + real(kind=REAL64) :: t0, t1, trans_time, avgtime + real(kind=REAL64), parameter :: epsilon=1.d-8 + ! MPI stuff + integer(kind=INT32) :: me, np, provided + + call MPI_Init_thread(MPI_THREAD_FUNNELED,provided) +#ifdef _OPENMP + if (provided.eq.MPI_THREAD_SINGLE) then + call MPI_Abort(MPI_COMM_WORLD,1) + endif +#endif + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + if (me.eq.0) then + write(*,'(a25)') 'Parallel Research Kernels' +#ifdef _OPENMP + write(*,'(a43)') 'Fortran MPI/OpenMP Matrix transpose: B = A^T' +#else + write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' +#endif + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + call MPI_Abort(MPI_COMM_WORLD, 2) + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + call MPI_Abort(MPI_COMM_WORLD, 3) + endif + +#ifdef _OPENMP + write(*,'(a23,i8)') 'Number of threads = ', omp_get_max_threads() +#endif + write(*,'(a23,i8)') 'Number of MPI procs = ', np + write(*,'(a23,i8)') 'Number of iterations = ', iterations + write(*,'(a23,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + + call MPI_Barrier(MPI_COMM_WORLD) + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! ******************************************************************** + + t0 = 0.0d0 + + do k=0,iterations + + if (k.eq.1) then + call MPI_Barrier(MPI_COMM_WORLD) + t0 = MPI_Wtime() + endif + + ! B += A^T + ! A += 1 + + enddo ! 
iterations + + call MPI_Barrier(MPI_COMM_WORLD) + t1 = MPI_Wtime() + + trans_time = t1 - t0 + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + if (me.eq.0) then + if (abserr .lt. epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(one)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + call MPI_Abort(MPI_COMM_WORLD,1) + endif + endif + + call MPI_Barrier(MPI_COMM_WORLD) + call mpi_finalize() + +end program main + From f28ea04daa02d6e02622335ad724ea351be345f6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 16 Nov 2021 16:49:05 +0200 Subject: [PATCH 147/325] transpose-mpi still WIP --- FORTRAN/prk_mod.F90 | 11 ++++++ FORTRAN/transpose-coarray.F90 | 22 ++--------- FORTRAN/transpose-mpi.F90 | 71 ++++++++++++++++++++++++++++++----- FORTRAN/transpose.F90 | 10 +---- 4 files changed, 78 insertions(+), 36 deletions(-) diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 72821cfd0..dc9251bbb 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -39,4 +39,15 @@ subroutine initialize_w(is_star,r,W) endif end subroutine initialize_w + subroutine print_matrix(row, col, mat) + use iso_fortran_env + implicit none + integer(kind=INT32), intent(in) :: row, col + real(kind=REAL64), intent(in) :: mat(row, col) + integer(kind=INT32) :: i + do i=1,row + write(*,*) mat(i,:) + end do + end subroutine print_matrix + end module prk diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index 3f99fdf5a..9021ddce7 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -146,26 +146,12 @@ program main ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,col_per_pe)[*], stat=err) + allocate( A(order,col_per_pe)[*], B(order,col_per_pe)[*], T(col_per_pe,col_per_pe), stat=err) if (err .ne. 0) then - write(6,'(a20,i3,a10,i5)') 'allocation of A returned ',err,' at image ',me + write(6,'(a20,i3,a10,i5)') 'allocation returned ',err,' at image ',me stop 1 endif - allocate( B(order,col_per_pe)[*], stat=err ) - if (err .ne. 0) then - write(6,'(a20,i3,a10,i5)') 'allocation of B returned ',err,' at image ',me - stop 1 - endif - - allocate( T(col_per_pe,col_per_pe), stat=err ) - if (err .ne. 0) then - write(6,'(a20,i3,a10,i5)') 'allocation of T returned ',err,' at image ',me - stop 1 - endif - - bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A)/8 - if (printer) then write(6,'(a23,i8)') 'Number of images = ', np write(6,'(a23,i8)') 'Number of iterations = ', iterations @@ -270,8 +256,7 @@ program main t1 = prk_get_wtime() trans_time = t1 - t0 - deallocate( T ) - deallocate( A ) + deallocate( A,T ) ! ******************************************************************** ! ** Analyze and output results. 
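! Expected-value derivation for these transpose checks (illustrative note;
! it follows the initialization used in transpose-mpi.F90 below, where
! A0 = order*row + col in 0-based global indices): every sweep k = 0..iterations
! adds (A0 + k)^T into B and then increments A by one, so after the loop
!
!   B = (iterations+1) * A0^T + (0 + 1 + ... + iterations)
!     = (iterations+1) * A0^T + 0.5*iterations*(iterations+1)
!
! which is exactly the temp/addit pair that the validation compares against
! B element by element.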
@@ -293,6 +278,7 @@ program main if (printer) then write(6,'(a)') 'Solution validates' avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A(1,1))/8 write(6,'(a12,f13.6,a17,f10.6)') 'Rate (MB/s): ',& (1.d-6*bytes/avgtime),' Avg time (s): ', avgtime endif diff --git a/FORTRAN/transpose-mpi.F90 b/FORTRAN/transpose-mpi.F90 index d9920bbad..9ddff956f 100644 --- a/FORTRAN/transpose-mpi.F90 +++ b/FORTRAN/transpose-mpi.F90 @@ -56,6 +56,7 @@ program main use iso_fortran_env use mpi_f08 + use prk implicit none ! for argument parsing integer :: err @@ -63,18 +64,15 @@ program main character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations - integer(kind=INT32) :: order + integer(kind=INT32) :: order, block_order real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile real(kind=REAL64), parameter :: one=1.0d0 - integer(kind=INT64) :: bytes - ! distributed data helpers - integer(kind=INT32) :: col_per_pe ! columns per PE = order/np - integer(kind=INT32) :: col_start, row_start ! runtime variables - integer(kind=INT32) :: i, j, k, p, q - integer(kind=INT32) :: it, jt, tile_size + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, r, lo, hi + !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -134,12 +132,27 @@ program main call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + block_order = int(order / np) + call MPI_Barrier(MPI_COMM_WORLD) ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** + allocate( A(order,block_order), B(order,block_order), T(order,block_order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 + endif + + ! Fill the original matrix + do concurrent (i=1:order, j=1:block_order) + A(i,j) = me * block_order + (i-1)*order + (j-1) + end do + !call print_matrix(order, block_order, A) + B = 0 + t0 = 0.0d0 do k=0,iterations @@ -150,7 +163,20 @@ program main endif ! B += A^T + call MPI_Alltoall(A, order*block_order, MPI_DOUBLE_PRECISION, & + T, order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) + + !for r in range(0,np): + do r=1,np + lo = block_order * (r-1) + 1 + hi = block_order * r + B(lo:hi,:) = B(lo:hi,:) + transpose(T(lo:hi,:)) + end do + !print*,'====================================' + !print*,'B=',B + !call print_matrix(order, block_order, B) ! A += 1 + A = A + one enddo ! iterations @@ -159,10 +185,36 @@ program main trans_time = t1 - t0 + deallocate( A,T ) + ! ******************************************************************** ! ** Analyze and output results. ! ******************************************************************** + abserr = 0.0; + addit = (0.5*iterations) * (iterations+1.0) + do r=0,np-1 + if (me.eq.r) then + print*,'====================================' + do j=1,block_order + do i=1,order + !temp = ((real(order,REAL64)*real(j-1,REAL64))+real(block_order*me+i-1,REAL64)) & + ! 
* real(iterations+1,REAL64) + addit + temp = ((iterations/2.0)+(order*j+i))*(iterations+1.0) + abserr = abserr + abs(B(i,j) - temp) + if (abs(B(i,j) - temp).gt.epsilon) then + print*,me,':',i,j+me*block_order,B(i,j),temp,'<<<<' + else + print*,me,':',i,j+me*block_order,B(i,j),temp + endif + enddo + enddo + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + + deallocate( B ) + if (me.eq.0) then if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' @@ -173,12 +225,11 @@ program main else write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & 'exceeds threshold ',epsilon - call MPI_Abort(MPI_COMM_WORLD,1) + !call MPI_Abort(MPI_COMM_WORLD,1) endif endif - call MPI_Barrier(MPI_COMM_WORLD) - call mpi_finalize() + call MPI_Finalize() end program main diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 0be71030f..e76fc7900 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -119,15 +119,9 @@ program main ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif From 7b2aa3c8449e247d80d226502db5de27aa82bc4d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 10:17:27 +0200 Subject: [PATCH 148/325] better matrix printer --- FORTRAN/prk_mod.F90 | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index dc9251bbb..6d06d5c16 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -39,15 +39,25 @@ subroutine initialize_w(is_star,r,W) endif end subroutine initialize_w - subroutine print_matrix(row, col, mat) + subroutine print_matrix(mat, label) use iso_fortran_env implicit none - integer(kind=INT32), intent(in) :: row, col - real(kind=REAL64), intent(in) :: mat(row, col) - integer(kind=INT32) :: i - do i=1,row - write(*,*) mat(i,:) + real(kind=REAL64), intent(in) :: mat(:,:) + integer(kind=INT32), intent(in), optional :: label + integer(kind=INT32) :: dims(2) + integer(kind=INT32) :: i,j + dims(1) = size(mat,1) + dims(2) = size(mat,2) + do i=1,dims(1) + write(6,'(i5,a1)', advance='no') label,':' + do j=1,dims(2) + if (present(label)) then + write(6,'(f10.1)', advance='no') mat(i,j) + end if + end do + write(6,'(a1)',advance='yes') '' end do + flush(6) end subroutine print_matrix end module prk From 09f60d51902a884d41e441ed6a6f511022d2e4cc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 10:17:44 +0200 Subject: [PATCH 149/325] finally works --- FORTRAN/transpose-mpi.F90 | 83 +++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/FORTRAN/transpose-mpi.F90 b/FORTRAN/transpose-mpi.F90 index 9ddff956f..412837071 100644 --- a/FORTRAN/transpose-mpi.F90 +++ b/FORTRAN/transpose-mpi.F90 @@ -53,10 +53,40 @@ ! MPI by Jeff Hammond, November 2021 ! 
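! Note on the communication pattern used in the changes below (illustrative
! sketch only; names follow transpose-mpi.F90): each of the np ranks owns
! block_order contiguous columns of the order x order matrix, held locally
! as a block_order x order slab. A single MPI_Alltoall exchanges square
! block_order x block_order blocks between all ranks, after which each
! received block only needs a local transpose:
!
!   call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, &
!                     T, block_order*block_order, MPI_DOUBLE_PRECISION, &
!                     MPI_COMM_WORLD)
!   do r=0,np-1
!     lo = block_order * r + 1
!     hi = block_order * (r+1)
!     B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi))
!   end do
!
! The matching send/recv counts rely on order being an exact multiple of np
! (block_order = order/np).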
******************************************************************* +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi + program main use iso_fortran_env use mpi_f08 use prk + use prk_mpi implicit none ! for argument parsing integer :: err @@ -140,7 +170,7 @@ program main ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,block_order), B(order,block_order), T(order,block_order), stat=err) + allocate( A(block_order,order), B(block_order,order), T(block_order,order), stat=err) if (err .ne. 0) then write(*,'(a,i3)') 'allocation returned ',err stop 1 @@ -148,9 +178,9 @@ program main ! Fill the original matrix do concurrent (i=1:order, j=1:block_order) - A(i,j) = me * block_order + (i-1)*order + (j-1) + A(j,i) = me * block_order + (i-1)*order + (j-1) end do - !call print_matrix(order, block_order, A) + !call mpi_print_matrix(A,'A=') B = 0 t0 = 0.0d0 @@ -163,18 +193,17 @@ program main endif ! B += A^T - call MPI_Alltoall(A, order*block_order, MPI_DOUBLE_PRECISION, & - T, order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) + call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, & + T, block_order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) + !call mpi_print_matrix(T,'T=') !for r in range(0,np): - do r=1,np - lo = block_order * (r-1) + 1 - hi = block_order * r - B(lo:hi,:) = B(lo:hi,:) + transpose(T(lo:hi,:)) + do r=0,np-1 + lo = block_order * r + 1 + hi = block_order * (r+1) + B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) end do - !print*,'====================================' - !print*,'B=',B - !call print_matrix(order, block_order, B) + !call mpi_print_matrix(B,'B=') ! A += 1 A = A + one @@ -191,27 +220,21 @@ program main ! ** Analyze and output results. ! ******************************************************************** + !T = 0 abserr = 0.0; addit = (0.5*iterations) * (iterations+1.0) - do r=0,np-1 - if (me.eq.r) then - print*,'====================================' - do j=1,block_order - do i=1,order - !temp = ((real(order,REAL64)*real(j-1,REAL64))+real(block_order*me+i-1,REAL64)) & - ! 
* real(iterations+1,REAL64) + addit - temp = ((iterations/2.0)+(order*j+i))*(iterations+1.0) - abserr = abserr + abs(B(i,j) - temp) - if (abs(B(i,j) - temp).gt.epsilon) then - print*,me,':',i,j+me*block_order,B(i,j),temp,'<<<<' - else - print*,me,':',i,j+me*block_order,B(i,j),temp - endif - enddo - enddo - endif - call MPI_Barrier(MPI_COMM_WORLD) + do j=1,block_order + do i=1,order + temp = (order*(me*block_order+j-1)+(i-1)) * (iterations+1)+addit + !T(j,i) = temp + abserr = abserr + abs(B(j,i) - temp) + enddo enddo + call MPI_Allreduce(MPI_IN_PLACE,abserr,1,MPI_DOUBLE_PRECISION, & + MPI_SUM,MPI_COMM_WORLD) + + !call mpi_print_matrix(T,'R=') + !call mpi_print_matrix(B,'B=') deallocate( B ) From 66a2ccd97e774e8ee8937db171b2f22440216b81 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 10:22:39 +0200 Subject: [PATCH 150/325] cleanup --- FORTRAN/transpose-mpi.F90 | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/FORTRAN/transpose-mpi.F90 b/FORTRAN/transpose-mpi.F90 index 412837071..b2f22e7c2 100644 --- a/FORTRAN/transpose-mpi.F90 +++ b/FORTRAN/transpose-mpi.F90 @@ -109,12 +109,7 @@ program main ! MPI stuff integer(kind=INT32) :: me, np, provided - call MPI_Init_thread(MPI_THREAD_FUNNELED,provided) -#ifdef _OPENMP - if (provided.eq.MPI_THREAD_SINGLE) then - call MPI_Abort(MPI_COMM_WORLD,1) - endif -#endif + call MPI_Init_thread(MPI_THREAD_SINGLE,provided) call MPI_Comm_rank(MPI_COMM_WORLD, me) call MPI_Comm_size(MPI_COMM_WORLD, np) @@ -124,11 +119,7 @@ program main if (me.eq.0) then write(*,'(a25)') 'Parallel Research Kernels' -#ifdef _OPENMP - write(*,'(a43)') 'Fortran MPI/OpenMP Matrix transpose: B = A^T' -#else write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' -#endif if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() @@ -152,9 +143,6 @@ program main call MPI_Abort(MPI_COMM_WORLD, 3) endif -#ifdef _OPENMP - write(*,'(a23,i8)') 'Number of threads = ', omp_get_max_threads() -#endif write(*,'(a23,i8)') 'Number of MPI procs = ', np write(*,'(a23,i8)') 'Number of iterations = ', iterations write(*,'(a23,i8)') 'Matrix order = ', order @@ -180,7 +168,6 @@ program main do concurrent (i=1:order, j=1:block_order) A(j,i) = me * block_order + (i-1)*order + (j-1) end do - !call mpi_print_matrix(A,'A=') B = 0 t0 = 0.0d0 @@ -195,15 +182,11 @@ program main ! B += A^T call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, & T, block_order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) - !call mpi_print_matrix(T,'T=') - - !for r in range(0,np): do r=0,np-1 lo = block_order * r + 1 hi = block_order * (r+1) B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) end do - !call mpi_print_matrix(B,'B=') ! A += 1 A = A + one @@ -220,22 +203,17 @@ program main ! ** Analyze and output results. ! 
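!
! Validation note: every element of B accumulates one copy of the matching
! element of A^T per iteration (iterations+1 copies in total), and A grows
! by one between iterations, so the expected value is
!
!   initial_value * (iterations+1) + (0 + 1 + ... + iterations)
!     = initial_value * (iterations+1) + 0.5*iterations*(iterations+1)
!
! which is the temp expression below, with addit holding the series term.
!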
******************************************************************** - !T = 0 abserr = 0.0; addit = (0.5*iterations) * (iterations+1.0) do j=1,block_order do i=1,order temp = (order*(me*block_order+j-1)+(i-1)) * (iterations+1)+addit - !T(j,i) = temp abserr = abserr + abs(B(j,i) - temp) enddo enddo call MPI_Allreduce(MPI_IN_PLACE,abserr,1,MPI_DOUBLE_PRECISION, & MPI_SUM,MPI_COMM_WORLD) - !call mpi_print_matrix(T,'R=') - !call mpi_print_matrix(B,'B=') - deallocate( B ) if (me.eq.0) then From 87d4588bd4ae94fa84331cd6f4eea2d1fd8c1c5e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 14:34:37 +0200 Subject: [PATCH 151/325] p2p version --- FORTRAN/transpose-p2p-mpi.F90 | 236 ++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 FORTRAN/transpose-p2p-mpi.F90 diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 new file mode 100644 index 000000000..b2f22e7c2 --- /dev/null +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -0,0 +1,236 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! +! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! MPI by Jeff Hammond, November 2021 +! 
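!
! The p2p variants in this series replace the single MPI_Alltoall with np
! phases of pairwise exchanges.  A sketch of one phase, mirroring the
! MPI_Sendrecv loop introduced later in the series (same variable names):
!
!   do phase=0,np-1
!     recv_from = mod( (me + phase     ), np)
!     send_to   = mod( (me - phase + np), np)
!     lo = block_order * send_to + 1
!     hi = block_order * (send_to+1)
!     call MPI_Sendrecv(A(:,lo:hi), block_order*block_order, MPI_DOUBLE_PRECISION, &
!                       send_to, phase, &
!                       T, block_order*block_order, MPI_DOUBLE_PRECISION, &
!                       recv_from, phase, MPI_COMM_WORLD, MPI_STATUS_IGNORE)
!     lo = block_order * recv_from + 1
!     hi = block_order * (recv_from+1)
!     B(:,lo:hi) = B(:,lo:hi) + transpose(T)
!   end do
!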
******************************************************************* + +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi + +program main + use iso_fortran_env + use mpi_f08 + use prk + use prk_mpi + implicit none + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations + integer(kind=INT32) :: order, block_order + real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix + real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix + real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile + real(kind=REAL64), parameter :: one=1.0d0 + ! runtime variables + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, r, lo, hi + !integer(kind=INT32) :: it, jt, tile_size + real(kind=REAL64) :: abserr, addit, temp + real(kind=REAL64) :: t0, t1, trans_time, avgtime + real(kind=REAL64), parameter :: epsilon=1.d-8 + ! MPI stuff + integer(kind=INT32) :: me, np, provided + + call MPI_Init_thread(MPI_THREAD_SINGLE,provided) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + if (me.eq.0) then + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + call MPI_Abort(MPI_COMM_WORLD, 2) + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + call MPI_Abort(MPI_COMM_WORLD, 3) + endif + + write(*,'(a23,i8)') 'Number of MPI procs = ', np + write(*,'(a23,i8)') 'Number of iterations = ', iterations + write(*,'(a23,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + + block_order = int(order / np) + + call MPI_Barrier(MPI_COMM_WORLD) + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! ******************************************************************** + + allocate( A(block_order,order), B(block_order,order), T(block_order,order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 + endif + + ! 
Fill the original matrix + do concurrent (i=1:order, j=1:block_order) + A(j,i) = me * block_order + (i-1)*order + (j-1) + end do + B = 0 + + t0 = 0.0d0 + + do k=0,iterations + + if (k.eq.1) then + call MPI_Barrier(MPI_COMM_WORLD) + t0 = MPI_Wtime() + endif + + ! B += A^T + call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, & + T, block_order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) + do r=0,np-1 + lo = block_order * r + 1 + hi = block_order * (r+1) + B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) + end do + ! A += 1 + A = A + one + + enddo ! iterations + + call MPI_Barrier(MPI_COMM_WORLD) + t1 = MPI_Wtime() + + trans_time = t1 - t0 + + deallocate( A,T ) + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + abserr = 0.0; + addit = (0.5*iterations) * (iterations+1.0) + do j=1,block_order + do i=1,order + temp = (order*(me*block_order+j-1)+(i-1)) * (iterations+1)+addit + abserr = abserr + abs(B(j,i) - temp) + enddo + enddo + call MPI_Allreduce(MPI_IN_PLACE,abserr,1,MPI_DOUBLE_PRECISION, & + MPI_SUM,MPI_COMM_WORLD) + + deallocate( B ) + + if (me.eq.0) then + if (abserr .lt. epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(one)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + !call MPI_Abort(MPI_COMM_WORLD,1) + endif + endif + + call MPI_Finalize() + +end program main + From 2f808455b45aab3e2b4d386acab2ce286a6e0514 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 15:01:08 +0200 Subject: [PATCH 152/325] remove unnecessary temp --- PYTHON/transpose-numpy-mpi.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py index e5a2f9196..56934ce0c 100755 --- a/PYTHON/transpose-numpy-mpi.py +++ b/PYTHON/transpose-numpy-mpi.py @@ -139,8 +139,7 @@ def main(): # ** Allocate space for the input and transpose matrix # ******************************************************************** - offset = me * block_order - A = numpy.fromfunction(lambda i,j: offset+i*order+j, (order,block_order), dtype=float) + A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) B = numpy.zeros((order,block_order)) T = numpy.zeros((order,block_order)) From afca09f5d2e1153e05b842d08313a99494e3ff29 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 15:55:28 +0200 Subject: [PATCH 153/325] use arange not fromfunction to match cupy --- PYTHON/transpose-numpy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PYTHON/transpose-numpy.py b/PYTHON/transpose-numpy.py index 657a09218..58a02b2d0 100755 --- a/PYTHON/transpose-numpy.py +++ b/PYTHON/transpose-numpy.py @@ -86,7 +86,8 @@ def main(): # ** Allocate space for the input and transpose matrix # ******************************************************************** - A = numpy.fromfunction(lambda i,j: i*order+j, (order,order), dtype=float) + #A = numpy.fromfunction(lambda i,j: i*order+j, (order,order), dtype=float) + A = numpy.arange(order*order,dtype=float).reshape(order,order) B = numpy.zeros((order,order)) for k in range(0,iterations+1): From 3024c691afe044a5c2dbd321c559aec5b937a073 Mon Sep 
17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 16:54:13 +0200 Subject: [PATCH 154/325] transpose w/ p2p --- PYTHON/transpose-numpy-mpi-p2p.py | 200 ++++++++++++++++++++++++++++++ PYTHON/transpose-numpy-mpi.py | 7 +- 2 files changed, 206 insertions(+), 1 deletion(-) create mode 100755 PYTHON/transpose-numpy-mpi-p2p.py diff --git a/PYTHON/transpose-numpy-mpi-p2p.py b/PYTHON/transpose-numpy-mpi-p2p.py new file mode 100755 index 000000000..c8e8ab017 --- /dev/null +++ b/PYTHON/transpose-numpy-mpi-p2p.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2021, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. 
+# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. +# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +from mpi4py import MPI +import numpy + +def main(): + + comm = MPI.COMM_WORLD + me = comm.Get_rank() + np = comm.Get_size() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python MPI/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./transpose <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + if order % np != 0: + sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + B = numpy.zeros((order,block_order)) + T = numpy.zeros((block_order,block_order)) + + for k in range(0,iterations+1): + + if k<1: + comm.Barrier() + t0 = MPI.Wtime() + + for phase in range(0,np): + recv_from = (me + phase ) % np; + send_to = (me - phase + np) % np; + #if k==0: + # print('i am ',me,' receiving from ',recv_from,' sending to ',send_to) + + lo = block_order * send_to + hi = block_order * (send_to+1) + comm.Sendrecv(sendbuf=A[lo:hi,:],dest=send_to,sendtag=phase,recvbuf=T,source=recv_from,recvtag=phase) + lo = block_order * recv_from + hi = block_order * (recv_from+1) + B[lo:hi,:] += T.T + + A += 1.0 + + comm.Barrier() + t1 = MPI.Wtime() + trans_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = comm.allgather(B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. 
+ if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + comm.Abort() + #sys.exit("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py index 56934ce0c..5dacbd5ea 100755 --- a/PYTHON/transpose-numpy-mpi.py +++ b/PYTHON/transpose-numpy-mpi.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # # Copyright (c) 2020, Intel Corporation +# Copyright (c) 2021, NVIDIA # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -174,6 +175,8 @@ def main(): # allgather is non-scalable but was easier to debug F = comm.allgather(B) G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) @@ -187,7 +190,9 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - sys.exit("ERROR: solution did not validate") + print("ERROR: solution did not validate") + comm.Abort() + #sys.exit("ERROR: solution did not validate") if __name__ == '__main__': From 8b6cf584b808705ee3f9f63a996fa6f1994667f8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 17:21:18 +0200 Subject: [PATCH 155/325] remove ; --- PYTHON/transpose-numpy-mpi-p2p.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PYTHON/transpose-numpy-mpi-p2p.py b/PYTHON/transpose-numpy-mpi-p2p.py index c8e8ab017..82163b406 100755 --- a/PYTHON/transpose-numpy-mpi-p2p.py +++ b/PYTHON/transpose-numpy-mpi-p2p.py @@ -151,8 +151,8 @@ def main(): t0 = MPI.Wtime() for phase in range(0,np): - recv_from = (me + phase ) % np; - send_to = (me - phase + np) % np; + recv_from = (me + phase ) % np + send_to = (me - phase + np) % np #if k==0: # print('i am ',me,' receiving from ',recv_from,' sending to ',send_to) From 8dbdcf7bba8772de095d86f015228be9394d86b7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Nov 2021 17:21:40 +0200 Subject: [PATCH 156/325] add transpose mpi w/ p2p --- FORTRAN/Makefile | 2 +- FORTRAN/transpose-p2p-mpi.F90 | 31 +++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 2ab78cb65..e31bfdc10 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -72,7 +72,7 @@ taskloop: stencil-taskloop-openmp transpose-taskloop-openmp nstream-taskloop-ope coarray: nstream-coarray p2p-coarray stencil-coarray transpose-coarray -mpi: nstream-mpi transpose-mpi +mpi: nstream-mpi transpose-mpi transpose-p2p-mpi mpi-openmp: nstream-mpi-openmp diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index b2f22e7c2..ef4d96511 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -100,14 +100,14 @@ program main real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile real(kind=REAL64), parameter :: one=1.0d0 ! 
runtime variables - integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, r, lo, hi - !integer(kind=INT32) :: it, jt, tile_size + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, lo, hi, phase real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 ! MPI stuff integer(kind=INT32) :: me, np, provided + integer(kind=INT32) :: send_to, recv_from call MPI_Init_thread(MPI_THREAD_SINGLE,provided) call MPI_Comm_rank(MPI_COMM_WORLD, me) @@ -158,7 +158,7 @@ program main ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(block_order,order), B(block_order,order), T(block_order,order), stat=err) + allocate( A(block_order,order), B(block_order,order), T(block_order,block_order), stat=err) if (err .ne. 0) then write(*,'(a,i3)') 'allocation returned ',err stop 1 @@ -180,12 +180,23 @@ program main endif ! B += A^T - call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, & - T, block_order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) - do r=0,np-1 - lo = block_order * r + 1 - hi = block_order * (r+1) - B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) + !call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, & + ! T, block_order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) + do phase=0,np-1 + recv_from = mod( (me + phase ), np) + send_to = mod( (me - phase + np), np) + + lo = block_order * send_to + 1 + hi = block_order * (send_to+1) + call MPI_Sendrecv(A(:,lo:hi), block_order*block_order, MPI_DOUBLE_PRECISION, & + send_to,phase, & + T,block_order*block_order, MPI_DOUBLE_PRECISION, & + recv_from, phase, MPI_COMM_WORLD, MPI_STATUS_IGNORE) + lo = block_order * recv_from + 1 + hi = block_order * (recv_from+1) + B(:,lo:hi) = B(:,lo:hi) + transpose(T) + + end do ! A += 1 A = A + one From 544f41d5adee7110610fa757840b9c2f86b4cb3f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Nov 2021 12:49:54 +0200 Subject: [PATCH 157/325] WIP RMA mpi4py transpose --- PYTHON/transpose-numpy-mpi-rma.py | 208 ++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100755 PYTHON/transpose-numpy-mpi-rma.py diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py new file mode 100755 index 000000000..0aad83413 --- /dev/null +++ b/PYTHON/transpose-numpy-mpi-rma.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2021, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. +# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. 
+# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +from mpi4py import MPI +import numpy + +def main(): + + comm = MPI.COMM_WORLD + me = comm.Get_rank() + np = comm.Get_size() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python MPI/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./transpose <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + if order % np != 0: + sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=numpy.double) + #B = numpy.zeros((order,block_order)) + T = numpy.zeros((block_order,block_order)) + + n = order*block_order + #print('double size=',MPI.DOUBLE.Get_size()) + W = MPI.Win.Allocate(n * MPI.DOUBLE.Get_size(), 1, MPI.INFO_NULL, comm) + #print('win size=',W.Get_attr(MPI.WIN_SIZE)) + memory = W.tomemory() + #print('len memory=',len(memory)) + B = numpy.ndarray([order,block_order],dtype=numpy.double,buffer=memory) + + for k in range(0,iterations+1): + + if k<1: + comm.Barrier() + t0 = MPI.Wtime() + + for phase in range(0,np): + recv_from = (me + phase ) % np + send_to = (me - phase + np) % np + #if k==0: + # print('i am ',me,' receiving from ',recv_from,' sending to ',send_to) + + lo = block_order * send_to + hi = block_order * (send_to+1) + comm.Sendrecv(sendbuf=A[lo:hi,:],dest=send_to,sendtag=phase,recvbuf=T,source=recv_from,recvtag=phase) + lo = block_order * recv_from + hi = block_order * (recv_from+1) + B[lo:hi,:] += T.T + + A += 1.0 + + comm.Barrier() + t1 = MPI.Wtime() + trans_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. 
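#
# Note on the window-backed array used above: B is an ndarray view of
# memory owned by an MPI window, so the same buffer can later be the
# target of one-sided operations.  A minimal sketch of the pattern, with
# the names used in this file:
#
#   W = MPI.Win.Allocate(order * block_order * MPI.DOUBLE.Get_size(), 1, MPI.INFO_NULL, comm)
#   B = numpy.ndarray([order, block_order], dtype=numpy.double, buffer=W.tomemory())
#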
+ # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = comm.allgather(B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + comm.Abort() + #sys.exit("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() From 60fca9a47bad8db8abd31d76c11cc5a38d164ebf Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Nov 2021 15:00:33 +0200 Subject: [PATCH 158/325] rename col_per_pe to block_order --- FORTRAN/transpose-coarray.F90 | 56 +++++++++++++++++------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index 9021ddce7..4b5ace825 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -72,7 +72,7 @@ program main real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile integer(kind=INT64) :: bytes ! combined size of matrices ! distributed data helpers - integer(kind=INT32) :: col_per_pe ! columns per PE = order/np + integer(kind=INT32) :: block_order ! columns per PE = order/np integer(kind=INT32) :: col_start, row_start ! runtime variables integer(kind=INT32) :: i, j, k, p, q @@ -128,7 +128,7 @@ program main endif stop 1 endif - col_per_pe = order/np + block_order = order/np ! same default as the C implementation tile_size = 32 @@ -146,7 +146,7 @@ program main ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,col_per_pe)[*], B(order,col_per_pe)[*], T(col_per_pe,col_per_pe), stat=err) + allocate( A(order,block_order)[*], B(order,block_order)[*], T(block_order,block_order), stat=err) if (err .ne. 0) then write(6,'(a20,i3,a10,i5)') 'allocation returned ',err,' at image ',me stop 1 @@ -160,21 +160,21 @@ program main endif ! initialization - ! local column index j corresponds to global column index col_per_pe*me+j + ! local column index j corresponds to global column index block_order*me+j if ((tile_size.gt.1).and.(tile_size.lt.order)) then - do concurrent (jt=1:col_per_pe:tile_size, & + do concurrent (jt=1:block_order:tile_size, & it=1:order:tile_size) - do j=jt,min(col_per_pe,jt+tile_size-1) + do j=jt,min(block_order,jt+tile_size-1) do i=it,min(order,it+tile_size-1) - A(i,j) = real(order,REAL64) * real(col_per_pe*me+j-1,REAL64) + real(i-1,REAL64) + A(i,j) = real(order,REAL64) * real(block_order*me+j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo enddo enddo else - do concurrent (j=1:col_per_pe) + do concurrent (j=1:block_order) do i=1,order - A(i,j) = real(order,REAL64) * real(col_per_pe*me+j-1,REAL64) + real(i-1,REAL64) + A(i,j) = real(order,REAL64) * real(block_order*me+j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo enddo @@ -197,54 +197,54 @@ program main do q=me,me+np-1 p = modulo(q,np) ! 
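! (The gather below is a one-sided read: the coindexed reference
!  A(row_start+1:row_start+block_order,:)[p+1] pulls this image's row
!  block directly out of image p+1's copy of A, with no explicit
!  message passing.)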
Step 1: Gather A tile from remote image - row_start = me*col_per_pe + row_start = me*block_order ! * fully explicit version - !do i=1,col_per_pe - ! do j=1,col_per_pe + !do i=1,block_order + ! do j=1,block_order ! T(j,i) = A(row_start+j,i)[p+1] ! enddo !enddo ! * half explicit, half colon - !do i=1,col_per_pe - ! T(:,i) = A(row_start+1:row_start+col_per_pe,i)[p+1] + !do i=1,block_order + ! T(:,i) = A(row_start+1:row_start+block_order,i)[p+1] !enddo ! * full colon - T(:,:) = A(row_start+1:row_start+col_per_pe,:)[p+1] + T(:,:) = A(row_start+1:row_start+block_order,:)[p+1] ! Step 2: Transpose tile into B matrix - col_start = p*col_per_pe + col_start = p*block_order ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if ((tile_size.gt.1).and.(tile_size.lt.order)) then - do concurrent (jt=1:col_per_pe:tile_size, & - it=1:col_per_pe:tile_size) - do j=jt,min(col_per_pe,jt+tile_size-1) - do i=it,min(col_per_pe,it+tile_size-1) + do concurrent (jt=1:block_order:tile_size, & + it=1:block_order:tile_size) + do j=jt,min(block_order,jt+tile_size-1) + do i=it,min(block_order,it+tile_size-1) B(col_start+i,j) = B(col_start+i,j) + T(j,i) enddo enddo enddo else ! untiled ! * fully explicit version - !do j=1,col_per_pe - ! do i=1,col_per_pe + !do j=1,block_order + ! do i=1,block_order ! B(col_start+i,j) = B(col_start+i,j) + T(j,i) ! enddo !enddo ! * half explicit, half colon - do concurrent (j=1:col_per_pe) - B(col_start+1:col_start+col_per_pe,j) = B(col_start+1:col_start+col_per_pe,j) + T(j,:) + do concurrent (j=1:block_order) + B(col_start+1:col_start+block_order,j) = B(col_start+1:col_start+block_order,j) + T(j,:) enddo endif enddo sync all ! Step 3: Update A matrix ! * fully explicit version - !do j=1,col_per_pe + !do j=1,block_order ! do i=1,order ! A(i,j) = A(i,j) + 1.0 ! enddo !enddo ! * half explicit, half colon - do concurrent (j=1:col_per_pe) + do concurrent (j=1:block_order) A(:,j) = A(:,j) + 1.0 enddo ! 
* fully implicit version @@ -264,9 +264,9 @@ program main abserr = 0.0; addit = (0.5*iterations) * (iterations+1.0) - do j=1,col_per_pe + do j=1,block_order do i=1,order - temp = ((real(order,REAL64)*real(i-1,REAL64))+real(col_per_pe*me+j-1,REAL64)) & + temp = ((real(order,REAL64)*real(i-1,REAL64))+real(block_order*me+j-1,REAL64)) & * real(iterations+1,REAL64) + addit abserr = abserr + abs(B(i,j) - temp) enddo From fc63ffda0f65e2ac920f72154b410bedc374285c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Nov 2021 15:03:34 +0200 Subject: [PATCH 159/325] wrong for np>1 but shows how to win allocate into numpy --- PYTHON/transpose-numpy-mpi-rma.py | 35 ++++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py index 0aad83413..efa3ca359 100755 --- a/PYTHON/transpose-numpy-mpi-rma.py +++ b/PYTHON/transpose-numpy-mpi-rma.py @@ -140,18 +140,16 @@ def main(): # ** Allocate space for the input and transpose matrix # ******************************************************************** - A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=numpy.double) - #B = numpy.zeros((order,block_order)) + dtsize = MPI.DOUBLE.Get_size() + WA = MPI.Win.Allocate(order * block_order * dtsize, dtsize, MPI.INFO_NULL, comm) + A = numpy.ndarray([order,block_order],dtype=numpy.double,buffer=WA.tomemory()) + B = numpy.zeros((order,block_order)) T = numpy.zeros((block_order,block_order)) - n = order*block_order - #print('double size=',MPI.DOUBLE.Get_size()) - W = MPI.Win.Allocate(n * MPI.DOUBLE.Get_size(), 1, MPI.INFO_NULL, comm) - #print('win size=',W.Get_attr(MPI.WIN_SIZE)) - memory = W.tomemory() - #print('len memory=',len(memory)) - B = numpy.ndarray([order,block_order],dtype=numpy.double,buffer=memory) + TA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=numpy.double) + A[:,:] = TA[:,:] + WA.Lock_all() for k in range(0,iterations+1): if k<1: @@ -159,24 +157,27 @@ def main(): t0 = MPI.Wtime() for phase in range(0,np): - recv_from = (me + phase ) % np - send_to = (me - phase + np) % np - #if k==0: - # print('i am ',me,' receiving from ',recv_from,' sending to ',send_to) - - lo = block_order * send_to - hi = block_order * (send_to+1) - comm.Sendrecv(sendbuf=A[lo:hi,:],dest=send_to,sendtag=phase,recvbuf=T,source=recv_from,recvtag=phase) + recv_from = (me + phase) % np + bsize = block_order * block_order + WA.Get(T, recv_from, [bsize * recv_from, bsize, MPI.DOUBLE]) + WA.Flush_all() + lo = block_order * recv_from hi = block_order * (recv_from+1) B[lo:hi,:] += T.T + comm.Barrier() A += 1.0 + WA.Sync() + comm.Barrier() comm.Barrier() t1 = MPI.Wtime() trans_time = t1 - t0 + WA.Unlock_all() + WA.Free() + # ******************************************************************** # ** Analyze and output results. # ******************************************************************** From 25e362650f851a52cf946c916144f2de844e3c3f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Nov 2021 15:41:15 +0200 Subject: [PATCH 160/325] Fortran transpose w/ pointers --- FORTRAN/transpose-pointer.F90 | 223 ++++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 FORTRAN/transpose-pointer.F90 diff --git a/FORTRAN/transpose-pointer.F90 b/FORTRAN/transpose-pointer.F90 new file mode 100644 index 000000000..96e0ca735 --- /dev/null +++ b/FORTRAN/transpose-pointer.F90 @@ -0,0 +1,223 @@ +! +! 
Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! +! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! +! ******************************************************************* + +program main + use iso_fortran_env + use prk + implicit none + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations ! number of times to do the transpose + integer(kind=INT32) :: order ! order of a the matrix + real(kind=REAL64), allocatable, target :: A(:) ! buffer to hold original matrix + real(kind=REAL64), allocatable, target :: B(:) ! buffer to hold transposed matrix + real(kind=REAL64), pointer :: PA(:,:) ! pointer to original matrix buffer + real(kind=REAL64), pointer :: PB(:,:) ! pointer to transposed matrix buffer + integer(kind=INT64) :: bytes ! combined size of matrices + ! runtime variables + integer(kind=INT32) :: i, j, k + integer(kind=INT32) :: it, jt, tile_size + real(kind=REAL64) :: abserr, addit, temp ! squared error + real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters + real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance + + ! ******************************************************************** + ! read and test input parameters + ! 
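!
! (What this pointer variant exercises: A and B are rank-1 allocatables,
!  and the 2-D views are created with F2008 pointer bounds remapping,
!
!    allocate( A(order*order) )
!    PA(1:order,1:order) => A
!
!  so that PA(i,j) aliases element (j-1)*order+i of A in the usual
!  column-major layout.)
!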
******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a40)') 'Fortran Serial Matrix transpose: B = A^T' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + stop 1 + endif + + ! same default as the C implementation + tile_size = 32 + if (command_argument_count().gt.2) then + call get_command_argument(3,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') tile_size + endif + if ((tile_size .lt. 1).or.(tile_size.gt.order)) then + write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& + ' must be >= 1 and <= ',order + tile_size = order ! no tiling + endif + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! ******************************************************************** + + allocate( A(order*order), B(order*order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 + endif + + PA(1:order,1:order) => A + PB(1:order,1:order) => B + + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8)') 'Matrix order = ', order + write(*,'(a,i8)') 'Tile size = ', tile_size + + t0 = 0 + + if (tile_size.lt.order) then + do jt=1,order,tile_size + do it=1,order,tile_size + do j=jt,min(order,jt+tile_size-1) + do i=it,min(order,it+tile_size-1) + PA(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + PB(i,j) = 0.0 + enddo + enddo + enddo + enddo + else + do j=1,order + do i=1,order + PA(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + PB(i,j) = 0.0 + enddo + enddo + endif + + do k=0,iterations + + if (k.eq.1) then + t0 = prk_get_wtime() + endif + + ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix + if (tile_size.lt.order) then + do jt=1,order,tile_size + do it=1,order,tile_size + do j=jt,min(order,jt+tile_size-1) + do i=it,min(order,it+tile_size-1) + PB(j,i) = PB(j,i) + PA(i,j) + PA(i,j) = PA(i,j) + 1.0 + enddo + enddo + enddo + enddo + else + do j=1,order + do i=1,order + PB(j,i) = PB(j,i) + PA(i,j) + PA(i,j) = PA(i,j) + 1.0 + enddo + enddo + endif + + enddo ! iterations + + t1 = prk_get_wtime() + + trans_time = t1 - t0 + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + abserr = 0.0 + ! this will overflow if iterations>>1000 + addit = (0.5*iterations) * (iterations+1) + do j=1,order + do i=1,order + temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & + * real(iterations+1,REAL64) + abserr = abserr + abs(PB(i,j) - (temp+addit)) + enddo + enddo + + deallocate( B ) + deallocate( A ) + + if (abserr .lt. 
epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + stop 1 + endif + +end program main + From b4442fad083ae6f588f58c2581fe4e6c462cd1c1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Nov 2021 12:43:55 +0200 Subject: [PATCH 161/325] a2a w/ get working --- FORTRAN/transpose-get-a2a-mpi.F90 | 262 ++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 FORTRAN/transpose-get-a2a-mpi.F90 diff --git a/FORTRAN/transpose-get-a2a-mpi.F90 b/FORTRAN/transpose-get-a2a-mpi.F90 new file mode 100644 index 000000000..b26c71982 --- /dev/null +++ b/FORTRAN/transpose-get-a2a-mpi.F90 @@ -0,0 +1,262 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! +! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! MPI by Jeff Hammond, November 2021 +! 
******************************************************************* + +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi + +program main + use iso_fortran_env + use mpi_f08 + use prk + use prk_mpi + implicit none + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations + integer(kind=INT32) :: order, block_order + type(MPI_Win) :: WA ! MPI window for A (original matrix) + type(c_ptr) :: XA ! MPI baseptr / C pointer for A + real(kind=REAL64), pointer :: A(:,:) ! Fortran pointer to A + real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix + real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile + real(kind=REAL64), parameter :: one=1.0d0 + ! runtime variables + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, r, lo, hi + !integer(kind=INT32) :: it, jt, tile_size + real(kind=REAL64) :: abserr, addit, temp + real(kind=REAL64) :: t0, t1, trans_time, avgtime + real(kind=REAL64), parameter :: epsilon=1.d-8 + ! MPI stuff + integer(kind=INT32) :: me, np, provided + integer(kind=MPI_ADDRESS_KIND) :: wsize, woff + integer(kind=INT32) :: dsize + + call MPI_Init_thread(MPI_THREAD_SINGLE,provided) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + if (me.eq.0) then + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + call MPI_Abort(MPI_COMM_WORLD, 2) + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + call MPI_Abort(MPI_COMM_WORLD, 3) + endif + + write(*,'(a23,i8)') 'Number of MPI procs = ', np + write(*,'(a23,i8)') 'Number of iterations = ', iterations + write(*,'(a23,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + + block_order = int(order / np) + + call MPI_Barrier(MPI_COMM_WORLD) + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! 
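!
! (A is placed in an MPI window so other ranks can read it with MPI_Get:
!  MPI_Win_allocate hands back a C pointer to the window memory, and
!  c_f_pointer turns that into the usual 2-D Fortran view,
!
!    call MPI_Win_allocate(wsize, dsize, MPI_INFO_NULL, MPI_COMM_WORLD, XA, WA)
!    call c_f_pointer(XA, A, [block_order, order])
!
!  after which A behaves like an ordinary array while remaining remotely
!  accessible through WA.)
!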
******************************************************************** + + dsize = storage_size(one)/8 + ! MPI_Win_allocate(size, disp_unit, info, comm, baseptr, win, ierror) + wsize = block_order * order * dsize + call MPI_Win_allocate(size=wsize, disp_unit=dsize, & + info=MPI_INFO_NULL, comm=MPI_COMM_WORLD, baseptr=XA, win=WA) + call MPI_Win_lock_all(0,WA) + + call c_f_pointer(XA,A,[block_order,order]) + + allocate( B(block_order,order), T(block_order,order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 + endif + + ! Fill the original matrix + do concurrent (i=1:order, j=1:block_order) + A(j,i) = me * block_order + (i-1)*order + (j-1) + end do + call MPI_Win_sync(WA) + B = 0 + + t0 = 0.0d0 + + do k=0,iterations + + if (k.eq.1) then + call MPI_Barrier(MPI_COMM_WORLD) + t0 = MPI_Wtime() + endif + + ! B += A^T + call MPI_Barrier(MPI_COMM_WORLD) + do r=0,np-1 + woff = block_order * block_order * me + lo = block_order * r + 1 + hi = block_order * (r+1) + call MPI_Get(origin_addr=T(:,lo:hi), origin_count=block_order*block_order, & + origin_datatype=MPI_DOUBLE_PRECISION, & + target_rank=r, target_disp=woff, target_count=block_order*block_order, & + target_datatype=MPI_DOUBLE_PRECISION, win=WA) + end do + call MPI_Win_flush_local_all(WA) + do r=0,np-1 + lo = block_order * r + 1 + hi = block_order * (r+1) + B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) + end do + ! A += 1 + A = A + one + call MPI_Win_sync(WA) + + enddo ! iterations + + call MPI_Barrier(MPI_COMM_WORLD) + t1 = MPI_Wtime() + + trans_time = t1 - t0 + + deallocate( T ) + call MPI_Win_unlock_all(WA) + call MPI_Win_free( WA) + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + abserr = 0.0; + addit = (0.5*iterations) * (iterations+1.0) + do j=1,block_order + do i=1,order + temp = (order*(me*block_order+j-1)+(i-1)) * (iterations+1)+addit + abserr = abserr + abs(B(j,i) - temp) + enddo + enddo + call MPI_Allreduce(MPI_IN_PLACE,abserr,1,MPI_DOUBLE_PRECISION, & + MPI_SUM,MPI_COMM_WORLD) + + deallocate( B ) + + if (me.eq.0) then + if (abserr .lt. epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(one)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + !call MPI_Abort(MPI_COMM_WORLD,1) + endif + endif + + call MPI_Finalize() + +end program main + From 1cebf6b92dcc801eaa17f0d7c919ee8e9bf13bde Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Nov 2021 12:51:10 +0200 Subject: [PATCH 162/325] use proper amount of buffering in RMA transpose --- FORTRAN/transpose-get-a2a-mpi.F90 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/FORTRAN/transpose-get-a2a-mpi.F90 b/FORTRAN/transpose-get-a2a-mpi.F90 index b26c71982..205664797 100644 --- a/FORTRAN/transpose-get-a2a-mpi.F90 +++ b/FORTRAN/transpose-get-a2a-mpi.F90 @@ -103,7 +103,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! 
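!
! (Buffering note for this change: with only one MPI_Get outstanding at a
!  time, the temporary T needs to hold just a single block_order x
!  block_order block rather than a whole panel, and each transfer is
!  completed with MPI_Win_flush_local(r, WA) before the local
!  transpose-accumulate of that block.)
!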
runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -171,7 +171,7 @@ program main call c_f_pointer(XA,A,[block_order,order]) - allocate( B(block_order,order), T(block_order,order), stat=err) + allocate( B(block_order,order), T(block_order,block_order), stat=err) if (err .ne. 0) then write(*,'(a,i3)') 'allocation returned ',err stop 1 @@ -182,6 +182,7 @@ program main A(j,i) = me * block_order + (i-1)*order + (j-1) end do call MPI_Win_sync(WA) + call MPI_Barrier(MPI_COMM_WORLD) B = 0 t0 = 0.0d0 @@ -195,21 +196,20 @@ program main ! B += A^T call MPI_Barrier(MPI_COMM_WORLD) - do r=0,np-1 + do q=0,np-1 + r = mod(me+q,np) woff = block_order * block_order * me - lo = block_order * r + 1 - hi = block_order * (r+1) - call MPI_Get(origin_addr=T(:,lo:hi), origin_count=block_order*block_order, & + call MPI_Get(origin_addr=T(:,:), origin_count=block_order*block_order, & origin_datatype=MPI_DOUBLE_PRECISION, & target_rank=r, target_disp=woff, target_count=block_order*block_order, & target_datatype=MPI_DOUBLE_PRECISION, win=WA) - end do - call MPI_Win_flush_local_all(WA) - do r=0,np-1 + call MPI_Win_flush_local(r,WA) lo = block_order * r + 1 hi = block_order * (r+1) - B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) + B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,:)) end do + ! nobody should update A before everyone has fetched it + call MPI_Barrier(MPI_COMM_WORLD) ! A += 1 A = A + one call MPI_Win_sync(WA) From ac31fb0725e1c9cf0925dafd259a579fd6ea005e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Nov 2021 13:01:30 +0200 Subject: [PATCH 163/325] mpi acc version of transpose working --- FORTRAN/Makefile | 2 +- FORTRAN/transpose-acc-mpi.F90 | 261 ++++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 FORTRAN/transpose-acc-mpi.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index e31bfdc10..198da6333 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -72,7 +72,7 @@ taskloop: stencil-taskloop-openmp transpose-taskloop-openmp nstream-taskloop-ope coarray: nstream-coarray p2p-coarray stencil-coarray transpose-coarray -mpi: nstream-mpi transpose-mpi transpose-p2p-mpi +mpi: nstream-mpi transpose-mpi transpose-p2p-mpi transpose-get-mpi transpose-acc-mpi mpi-openmp: nstream-mpi-openmp diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 new file mode 100644 index 000000000..5115c3b68 --- /dev/null +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -0,0 +1,261 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! 
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! +! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! MPI by Jeff Hammond, November 2021 +! ******************************************************************* + +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi + +program main + use iso_fortran_env + use mpi_f08 + use prk + use prk_mpi + implicit none + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations + integer(kind=INT32) :: order, block_order + type(MPI_Win) :: WB ! MPI window for B (transpose matrix) + type(c_ptr) :: XB ! MPI baseptr / C pointer for B + real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix + real(kind=REAL64), pointer :: B(:,:) ! Fortran pointer to B + real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile + real(kind=REAL64), parameter :: one=1.0d0 + ! runtime variables + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, q, r, lo, hi + !integer(kind=INT32) :: it, jt, tile_size + real(kind=REAL64) :: abserr, addit, temp + real(kind=REAL64) :: t0, t1, trans_time, avgtime + real(kind=REAL64), parameter :: epsilon=1.d-8 + ! MPI stuff + integer(kind=INT32) :: me, np, provided + integer(kind=MPI_ADDRESS_KIND) :: wsize, woff + integer(kind=INT32) :: dsize + + call MPI_Init_thread(MPI_THREAD_SINGLE,provided) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + + ! ******************************************************************** + ! read and test input parameters + ! 
******************************************************************** + + if (me.eq.0) then + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> ' + call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + call MPI_Abort(MPI_COMM_WORLD, 2) + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order + call MPI_Abort(MPI_COMM_WORLD, 3) + endif + if (mod(order,np).ne.0) then + write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np + call MPI_Abort(MPI_COMM_WORLD, 4) + endif + + write(*,'(a23,i8)') 'Number of MPI procs = ', np + write(*,'(a23,i8)') 'Number of iterations = ', iterations + write(*,'(a23,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + + block_order = int(order / np) + + call MPI_Barrier(MPI_COMM_WORLD) + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! ******************************************************************** + + dsize = storage_size(one)/8 + ! MPI_Win_allocate(size, disp_unit, info, comm, baseptr, win, ierror) + wsize = block_order * order * dsize + call MPI_Win_allocate(size=wsize, disp_unit=dsize, & + info=MPI_INFO_NULL, comm=MPI_COMM_WORLD, baseptr=XB, win=WB) + call MPI_Win_lock_all(0,WB) + + call c_f_pointer(XB,B,[block_order,order]) + + allocate( A(block_order,order), T(block_order,block_order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 + endif + + ! Fill the original matrix + do concurrent (i=1:order, j=1:block_order) + A(j,i) = me * block_order + (i-1)*order + (j-1) + end do + call MPI_Win_sync(WB) + call MPI_Barrier(MPI_COMM_WORLD) + B = 0 + + t0 = 0.0d0 + + do k=0,iterations + + if (k.eq.1) then + call MPI_Barrier(MPI_COMM_WORLD) + t0 = MPI_Wtime() + endif + + ! B += A^T + do q=0,np-1 + r = mod(me+q,np) + lo = block_order * r + 1 + hi = block_order * (r+1) + T = transpose( A(:,lo:hi) ) + woff = block_order * block_order * me + call MPI_Accumulate(origin_addr=T(:,:), origin_count=block_order*block_order, & + origin_datatype=MPI_DOUBLE_PRECISION, & + target_rank=r, target_disp=woff, target_count=block_order*block_order, & + target_datatype=MPI_DOUBLE_PRECISION, op=MPI_SUM, win=WB) + call MPI_Win_flush_local(r,WB) + end do + ! A += 1 + A = A + one + + enddo ! iterations + + call MPI_Barrier(MPI_COMM_WORLD) + t1 = MPI_Wtime() + + trans_time = t1 - t0 + + deallocate( A,T ) + call MPI_Win_unlock_all(WB) + + ! ******************************************************************** + ! ** Analyze and output results. + ! 
******************************************************************** + + abserr = 0.0; + addit = (0.5*iterations) * (iterations+1.0) + do j=1,block_order + do i=1,order + temp = (order*(me*block_order+j-1)+(i-1)) * (iterations+1)+addit + abserr = abserr + abs(B(j,i) - temp) + enddo + enddo + call MPI_Allreduce(MPI_IN_PLACE,abserr,1,MPI_DOUBLE_PRECISION, & + MPI_SUM,MPI_COMM_WORLD) + + call MPI_Win_free(WB) + + if (me.eq.0) then + if (abserr .lt. epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(one)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + !call MPI_Abort(MPI_COMM_WORLD,1) + endif + endif + + call MPI_Finalize() + +end program main + From 9d1eaa52b75ff13e2345fdd5ef97d037a81236dc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Nov 2021 14:01:43 +0200 Subject: [PATCH 164/325] tranpose mpi cleanup --- FORTRAN/Makefile | 2 +- ...transpose-mpi.F90 => transpose-a2a-mpi.F90} | 8 ++++++-- FORTRAN/transpose-acc-mpi.F90 | 15 ++++++++------- ...e-get-a2a-mpi.F90 => transpose-get-mpi.F90} | 12 ++++++++---- FORTRAN/transpose-p2p-mpi.F90 | 18 ++++++++++-------- 5 files changed, 33 insertions(+), 22 deletions(-) rename FORTRAN/{transpose-mpi.F90 => transpose-a2a-mpi.F90} (97%) rename FORTRAN/{transpose-get-a2a-mpi.F90 => transpose-get-mpi.F90} (96%) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 198da6333..8d44d28e5 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -72,7 +72,7 @@ taskloop: stencil-taskloop-openmp transpose-taskloop-openmp nstream-taskloop-ope coarray: nstream-coarray p2p-coarray stencil-coarray transpose-coarray -mpi: nstream-mpi transpose-mpi transpose-p2p-mpi transpose-get-mpi transpose-acc-mpi +mpi: nstream-mpi transpose-a2a-mpi transpose-p2p-mpi transpose-get-mpi transpose-acc-mpi mpi-openmp: nstream-mpi-openmp diff --git a/FORTRAN/transpose-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 similarity index 97% rename from FORTRAN/transpose-mpi.F90 rename to FORTRAN/transpose-a2a-mpi.F90 index b2f22e7c2..11d098e6b 100644 --- a/FORTRAN/transpose-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -100,8 +100,8 @@ program main real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables - integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, r, lo, hi + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, r, lo, hi !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -142,6 +142,10 @@ program main write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order call MPI_Abort(MPI_COMM_WORLD, 3) endif + if (mod(order,np).ne.0) then + write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np + call MPI_Abort(MPI_COMM_WORLD, 4) + endif write(*,'(a23,i8)') 'Number of MPI procs = ', np write(*,'(a23,i8)') 'Number of iterations = ', iterations diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 5115c3b68..0c3e37952 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -102,8 +102,8 @@ program main real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile real(kind=REAL64), parameter :: one=1.0d0 ! 
runtime variables - integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, q, r, lo, hi !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -185,9 +185,9 @@ program main do concurrent (i=1:order, j=1:block_order) A(j,i) = me * block_order + (i-1)*order + (j-1) end do + B = 0 call MPI_Win_sync(WB) call MPI_Barrier(MPI_COMM_WORLD) - B = 0 t0 = 0.0d0 @@ -198,24 +198,25 @@ program main t0 = MPI_Wtime() endif - ! B += A^T + woff = block_order * block_order * me do q=0,np-1 r = mod(me+q,np) lo = block_order * r + 1 hi = block_order * (r+1) + ! B += A^T T = transpose( A(:,lo:hi) ) - woff = block_order * block_order * me call MPI_Accumulate(origin_addr=T(:,:), origin_count=block_order*block_order, & origin_datatype=MPI_DOUBLE_PRECISION, & target_rank=r, target_disp=woff, target_count=block_order*block_order, & target_datatype=MPI_DOUBLE_PRECISION, op=MPI_SUM, win=WB) call MPI_Win_flush_local(r,WB) + ! A += 1 + A(:,lo:hi) = A(:,lo:hi) + 1 end do - ! A += 1 - A = A + one enddo ! iterations + call MPI_Win_flush_all(WB) call MPI_Barrier(MPI_COMM_WORLD) t1 = MPI_Wtime() diff --git a/FORTRAN/transpose-get-a2a-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 similarity index 96% rename from FORTRAN/transpose-get-a2a-mpi.F90 rename to FORTRAN/transpose-get-mpi.F90 index 205664797..4da287053 100644 --- a/FORTRAN/transpose-get-a2a-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -102,8 +102,8 @@ program main real(kind=REAL64), allocatable :: T(:,:) ! temporary to hold tile real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables - integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT64) :: bytes + integer(kind=INT32) :: i, j, k, q, r, lo, hi !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -146,6 +146,10 @@ program main write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order call MPI_Abort(MPI_COMM_WORLD, 3) endif + if (mod(order,np).ne.0) then + write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np + call MPI_Abort(MPI_COMM_WORLD, 4) + endif write(*,'(a23,i8)') 'Number of MPI procs = ', np write(*,'(a23,i8)') 'Number of iterations = ', iterations @@ -194,11 +198,11 @@ program main t0 = MPI_Wtime() endif - ! B += A^T + woff = block_order * block_order * me call MPI_Barrier(MPI_COMM_WORLD) + ! B += A^T do q=0,np-1 r = mod(me+q,np) - woff = block_order * block_order * me call MPI_Get(origin_addr=T(:,:), origin_count=block_order*block_order, & origin_datatype=MPI_DOUBLE_PRECISION, & target_rank=r, target_disp=woff, target_count=block_order*block_order, & diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index ef4d96511..e376c99eb 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -101,7 +101,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! 
runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, lo, hi, phase + integer(kind=INT32) :: i, j, k, lo, hi, q real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -142,6 +142,10 @@ program main write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order call MPI_Abort(MPI_COMM_WORLD, 3) endif + if (mod(order,np).ne.0) then + write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np + call MPI_Abort(MPI_COMM_WORLD, 4) + endif write(*,'(a23,i8)') 'Number of MPI procs = ', np write(*,'(a23,i8)') 'Number of iterations = ', iterations @@ -180,18 +184,16 @@ program main endif ! B += A^T - !call MPI_Alltoall(A, block_order*block_order, MPI_DOUBLE_PRECISION, & - ! T, block_order*block_order, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) - do phase=0,np-1 - recv_from = mod( (me + phase ), np) - send_to = mod( (me - phase + np), np) + do q=0,np-1 + recv_from = mod( (me + q ), np) + send_to = mod( (me - q + np), np) lo = block_order * send_to + 1 hi = block_order * (send_to+1) call MPI_Sendrecv(A(:,lo:hi), block_order*block_order, MPI_DOUBLE_PRECISION, & - send_to,phase, & + send_to,q, & T,block_order*block_order, MPI_DOUBLE_PRECISION, & - recv_from, phase, MPI_COMM_WORLD, MPI_STATUS_IGNORE) + recv_from, q, MPI_COMM_WORLD, MPI_STATUS_IGNORE) lo = block_order * recv_from + 1 hi = block_order * (recv_from+1) B(:,lo:hi) = B(:,lo:hi) + transpose(T) From b0aecdf4154361ef423fb23a1e02660a5a12bc9f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 25 Nov 2021 12:03:52 +0200 Subject: [PATCH 165/325] add ignore stuff --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 74164a40e..7843446ba 100644 --- a/.gitignore +++ b/.gitignore @@ -311,6 +311,11 @@ FORTRAN/transpose-pretty FORTRAN/transpose-stdpar FORTRAN/transpose-taskloop-openmp FORTRAN/transpose-tasks-openmp +FORTRAN/transpose-a2a-mpi +FORTRAN/transpose-acc-mpi +FORTRAN/transpose-get-mpi +FORTRAN/transpose-p2p-mpi +FORTRAN/transpose-pointer GO/dgemm GO/hello GO/nstream From 6a4c06841861394bf762f65dd7f7c45bc390f69f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 26 Nov 2021 16:14:44 +0200 Subject: [PATCH 166/325] rename variables --- FORTRAN/transpose-pointer.F90 | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/FORTRAN/transpose-pointer.F90 b/FORTRAN/transpose-pointer.F90 index 96e0ca735..72cac8cc7 100644 --- a/FORTRAN/transpose-pointer.F90 +++ b/FORTRAN/transpose-pointer.F90 @@ -64,10 +64,10 @@ program main ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix - real(kind=REAL64), allocatable, target :: A(:) ! buffer to hold original matrix - real(kind=REAL64), allocatable, target :: B(:) ! buffer to hold transposed matrix - real(kind=REAL64), pointer :: PA(:,:) ! pointer to original matrix buffer - real(kind=REAL64), pointer :: PB(:,:) ! pointer to transposed matrix buffer + real(kind=REAL64), allocatable, target :: TA(:) ! buffer to hold original matrix + real(kind=REAL64), allocatable, target :: TB(:) ! buffer to hold transposed matrix + real(kind=REAL64), pointer :: A(:,:) ! pointer to original matrix buffer + real(kind=REAL64), pointer :: B(:,:) ! pointer to transposed matrix buffer integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k @@ -121,14 +121,14 @@ program main ! 
** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order*order), B(order*order), stat=err) + allocate( TA(order*order), TB(order*order), stat=err) if (err .ne. 0) then write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - PA(1:order,1:order) => A - PB(1:order,1:order) => B + A(1:order,1:order) => TA + B(1:order,1:order) => TB write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8)') 'Matrix order = ', order @@ -141,8 +141,8 @@ program main do it=1,order,tile_size do j=jt,min(order,jt+tile_size-1) do i=it,min(order,it+tile_size-1) - PA(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - PB(i,j) = 0.0 + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0.0 enddo enddo enddo @@ -150,8 +150,8 @@ program main else do j=1,order do i=1,order - PA(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - PB(i,j) = 0.0 + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0.0 enddo enddo endif @@ -168,8 +168,8 @@ program main do it=1,order,tile_size do j=jt,min(order,jt+tile_size-1) do i=it,min(order,it+tile_size-1) - PB(j,i) = PB(j,i) + PA(i,j) - PA(i,j) = PA(i,j) + 1.0 + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1.0 enddo enddo enddo @@ -177,8 +177,8 @@ program main else do j=1,order do i=1,order - PB(j,i) = PB(j,i) + PA(i,j) - PA(i,j) = PA(i,j) + 1.0 + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1.0 enddo enddo endif @@ -200,7 +200,7 @@ program main do i=1,order temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & * real(iterations+1,REAL64) - abserr = abserr + abs(PB(i,j) - (temp+addit)) + abserr = abserr + abs(B(i,j) - (temp+addit)) enddo enddo From 938ebd33c7fc444f0ba36933f8f782b55ee38356 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 08:47:42 +0200 Subject: [PATCH 167/325] F18 workarounds --- FORTRAN/Makefile | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 8d44d28e5..bb4d5f97e 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -42,7 +42,7 @@ ifeq ($(findstring gfortran,$(FC)),gfortran) endif # PGI and LLVM Flang ifeq ($(findstring flang,$(FC)),flang) - EXTRA = target openacc + EXTRA = openacc #FCFLAGS += -DPGI endif ifeq ($(findstring pgf,$(FC)),pgf) @@ -90,15 +90,15 @@ cufortran: nstream-cufortran transpose-cufortran blas: dgemm-blas -%: %.F90 prk.mod +%: %.F90 prk.mod prk_mod.o $(FC) $(FCFLAGS) $< prk_mod.o -o $@ -prk.mod: prk_mod.F90 +prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o -stencil: stencil.F90 stencil_serial.F90 prk.mod - #$(FC) $(FCFLAGS) -c stencil_serial.F90 prk_mod.o -o stencil_serial.o - $(FC) $(FCFLAGS) $< prk_mod.o -o $@ +stencil: stencil.F90 prk.mod + $(FC) $(FCFLAGS) -c stencil_serial.F90 + $(FC) $(FCFLAGS) stencil.F90 stencil_serial.o prk_mod.o -o $@ dgemm-pretty: dgemm-pretty.F90 prk.mod $(FC) $(FCFLAGS) $< prk_mod.o $(BLASFLAGS) $(STDPARFLAG) -o $@ @@ -141,6 +141,7 @@ dgemm-blas: dgemm-blas.F90 prk.mod clean: -rm -f prk.mod + -rm -f prk.f18.mod -rm -f *.o -rm -f *.i90 -rm -f *.dbg @@ -151,6 +152,7 @@ clean: -rm -f *.dwarf -rm -rf *.dSYM # Mac -rm -f p2p stencil transpose nstream dgemm + -rm -f transpose-pointer -rm -f p2p-innerloop -rm -f *-pretty -rm -f *-blas @@ -163,3 +165,4 @@ clean: -rm -f *-stdpar -rm -f *-cufortran -rm -f pic pic_soa + -rm -f a.out From a423a197ea0983c9fcfc3347fb1aa5c53945147d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: 
Thu, 16 Dec 2021 10:47:30 +0200 Subject: [PATCH 168/325] new LLVM --- common/make.defs.llvm | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 583d9a668..730e1fa08 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -11,7 +11,8 @@ # C99 is required in some implementations. CC=${LLVM_PATH}clang${CLANG_VERSION} -std=c11 -pthread # All of the Fortran code is written for the 2008 standard and requires preprocessing. -FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Mfreeform -L/opt/llvm/pgi-flang/lib -Wl,-rpath=/opt/llvm/pgi-flang/lib +#FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Mfreeform -L/opt/llvm/pgi-flang/lib -Wl,-rpath=/opt/llvm/pgi-flang/lib +FC=${LLVM_PATH}flang-new # C++11 may not be required but does no harm here. CXX=${LLVM_PATH}clang++${CLANG_VERSION} -std=c++2a -pthread # @@ -20,7 +21,7 @@ CXX=${LLVM_PATH}clang++${CLANG_VERSION} -std=c++2a -pthread # -mtune=native is appropriate for most cases. # -march=native is appropriate if you want portable binaries. # -DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math +DEFAULT_OPT_FLAGS=-g -O3 -ffast-math #DEFAULT_OPT_FLAGS+=-mllvm -polly -mllvm -polly-vectorizer=stripmine # # If you want to be specific, get the architecture options from: @@ -33,7 +34,7 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math # DEFAULT_OPT_FLAGS+=-Rpass=loop-vectorize #DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed DEFAULT_OPT_FLAGS+=-Wall #-Werror -DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations +#DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations #DEFAULT_OPT_FLAGS+=-mavx -mfma # # OpenMP flags @@ -42,7 +43,7 @@ OPENMPFLAG=-fopenmp OPENMPSIMDFLAG=-fopenmp-simd OFFLOADFLAG=-fopenmp OFFLOADFLAG+=-DGPU_SCHEDULE="" -#OPENACCFLAG= # Flang does not support OpenACC +OPENACCFLAG=-fopenacc # Klondike weirdness # OPENMPFLAG+=-L/opt/intel/compilers_and_libraries_2018.0.082/linux/compiler/lib/intel64_lin -liomp5 # Mac weirdness @@ -61,8 +62,9 @@ OPENCLFLAG=-framework OpenCL #OPENCLDIR=/etc/alternatives #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -#OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces +#OPENCLFLAG+=-Wno-ignored-attributes +#OPENCLFLAG+=-Wno-deprecated-declarations +#OPENCLFLAG+=-Wno-missing-braces # oneAPI #OPENCLDIR=/opt/intel/oneapi/compiler/latest/linux #OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL @@ -201,6 +203,8 @@ UPCXXFLAG+=-mtune=native -ffast-math # BLASFLAG=-DACCELERATE -framework Accelerate CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +#BLASFLAG=-lblas +#CBLASFLAG=-lcblas # # CUDA flags # @@ -263,9 +267,10 @@ PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib # # see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details # single-node -#COARRAYFLAG=-fcoarray=single -lcaf_single +COARRAYFLAG=-fcoarray=single -lcaf_single # multi-node -COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +#COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +#COARRAYFLAG=-fcoarray=lib -L/usr/lib/x86_64-linux-gnu/open-coarrays/mpich/lib -lcaf_mpi # # MEMKIND (used in C1z) # From 62adbdffbc6094be0c5bcc20b05d9cbf43388cf5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 11:02:43 +0200 Subject: [PATCH 169/325] new Fortran argument parser --- FORTRAN/nstream.F90 | 38 +-------- FORTRAN/prk_mod.F90 | 186 
++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 35 deletions(-) diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90 index d8e4c184a..50c1e7b54 100644 --- a/FORTRAN/nstream.F90 +++ b/FORTRAN/nstream.F90 @@ -67,13 +67,10 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -93,37 +90,8 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran Serial STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) write(*,'(a,i12)') 'Number of iterations = ', iterations write(*,'(a,i12)') 'Vector length = ', length diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 6d06d5c16..62b3a1615 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -9,6 +9,192 @@ function prk_get_wtime() result(t) t = real(c,REAL64) / real(r,REAL64) end function prk_get_wtime + subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? + iterations, & ! everything + length, offset, & ! nstream + order, tile_size, & ! transpose, stencil, dgemm + stencil, radius) ! not supported in implementations yet + use iso_fortran_env + implicit none + character(len=*), intent(in) :: kernel + integer(kind=INT32), intent(out) :: iterations + integer(kind=INT64), intent(out), optional :: length, offset ! nstream + integer(kind=INT32), intent(out), optional :: order, tile_size ! transpose, stencil, dgemm + integer(kind=INT32), intent(out), optional :: radius ! stencil + character(len=4), intent(out), optional :: stencil ! 
stencil + + integer :: argc,arglen,err,a,p,q + character(len=64) :: argtmp + + iterations = 10 + if (present(length)) then + length = 1024*1024*32 + endif + if (present(offset)) then + offset = 0 + endif + if (present(order)) then + order = 1024 + endif + if (present(tile_size)) then + tile_size = 32 + endif + if (present(stencil)) then + stencil = 'star' + endif + if (present(radius)) then + radius = 2 + endif + +#ifndef PRK_NO_ARGUMENTS + if (kernel(1:7).eq.'nstream') then + if (present(length)) then + length = 0 + else + print*,'You cannot parse nstream arguments without length' + stop + endif + else if ( (kernel(1:9).eq.'transpose') & + .or.(kernel(1:7).eq.'stencil') & + .or.(kernel(1:5).eq.'dgemm') ) then + if (present(order)) then + order = 0 + else + print*,'You cannot parse ',kernel,' arguments without order' + stop + endif + else + print*,kernel,'is not supported yet' + stop + endif + + argc = command_argument_count() + + if (argc.lt.2 ) then + write(*,'(a17,i2)') 'argument count = ', command_argument_count() + if (kernel(1:7).eq.'nstream') then + write(*,'(a62)') 'Old Usage: <# iterations> []' + write(*,'(a87)') 'New Usage: iterations=<# iterations> length= [offset=]' + else if ( (kernel(1:9).eq.'transpose') & + .or.(kernel(1:7).eq.'stencil') & + .or.(kernel(1:5).eq.'dgemm') ) then + write(*,'(a53)') 'Old Usage: <# iterations> []' + write(*,'(a73)') 'New Usage: iterations=<# iterations> order= [tile_size=]' + endif + STOP + endif + + do a=1,argc + call get_command_argument(a,argtmp,arglen,err) + if (err.eq.0) then + p = index(argtmp,"=") + if (p.eq.0) then + if (a.eq.1) then + read(argtmp,'(i10)') iterations + else if (a.eq.2) then + if (present(length)) then + read(argtmp,'(i15)') length + else if (present(order)) then + read(argtmp,'(i7)') order + endif + else if (a.eq.3) then + if (present(offset)) then + read(argtmp,'(i15)') offset + endif + if (present(tile_size)) then + read(argtmp,'(i3)') tile_size + endif + else + print*,'too many positional arguments:',argc + endif + else ! found an = + ! look for iterations + q = index(argtmp(1:p-1),"it") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i10)') iterations + endif + ! look for length + if (present(length)) then + q = index(argtmp(1:p-1),"len") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i15)') length + endif + endif + ! look for offset + if (present(offset)) then + q = index(argtmp(1:p-1),"off") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i15)') offset + endif + endif + ! look for order + if (present(order)) then + q = index(argtmp(1:p-1),"ord") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i7)') order + endif + endif + ! look for tile_size + if (present(tile_size)) then + q = index(argtmp(1:p-1),"tile") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i3)') tile_size + endif + endif + ! look for radius + if (present(radius)) then + q = index(argtmp(1:p-1),"rad") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i1)') radius + endif + endif + endif + endif + enddo + + ! check all the relevant arguments for validity + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be positive : ', iterations + stop 1 + endif + + if (present(length)) then + if (length .lt. 1) then + write(*,'(a,i15)') 'ERROR: length must be positive : ', length + stop 1 + endif + endif + + if (present(order)) then + if (order .lt. 1) then + write(*,'(a,i7)') 'ERROR: order must be positive : ', order + stop 1 + endif + endif + + if (present(radius)) then + if (radius .lt. 
1) then + write(*,'(a,i3)') 'ERROR: radius must be positive : ', radius + stop 1 + endif + endif + + if (present(offset)) then + if (offset .lt. 0) then + write(*,'(a,i15)') 'ERROR: offset must be nonnegative : ', offset + stop 1 + endif + endif + + if (present(tile_size)) then + if (tile_size .lt. 0) then + write(*,'(a,i3)') 'ERROR: tile_size must be nonnegative : ', tile_size + stop 1 + endif + endif +#endif + end subroutine + subroutine initialize_w(is_star,r,W) use iso_fortran_env implicit none From e2fee124207ed40be256100d8d2c97b22e3bf052 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 11:12:51 +0200 Subject: [PATCH 170/325] new arg parse for nstream and transpose --- FORTRAN/nstream.F90 | 17 ++------------- FORTRAN/prk_mod.F90 | 43 ++++++++++++++++++------------------- FORTRAN/transpose.F90 | 49 ++++++++----------------------------------- 3 files changed, 33 insertions(+), 76 deletions(-) diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90 index 50c1e7b54..a52624394 100644 --- a/FORTRAN/nstream.F90 +++ b/FORTRAN/nstream.F90 @@ -90,7 +90,6 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran Serial STREAM triad: A = B + scalar * C' - call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) write(*,'(a,i12)') 'Number of iterations = ', iterations @@ -101,21 +100,9 @@ program main ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 62b3a1615..06e346fef 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -78,8 +78,8 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? else if ( (kernel(1:9).eq.'transpose') & .or.(kernel(1:7).eq.'stencil') & .or.(kernel(1:5).eq.'dgemm') ) then - write(*,'(a53)') 'Old Usage: <# iterations> []' - write(*,'(a73)') 'New Usage: iterations=<# iterations> order= [tile_size=]' + write(*,'(a57)') 'Old Usage: <# iterations> []' + write(*,'(a84)') 'New Usage: iterations=<# iterations> order= [tile_size=]' endif STOP endif @@ -158,38 +158,39 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? stop 1 endif + ! nstream if (present(length)) then if (length .lt. 1) then write(*,'(a,i15)') 'ERROR: length must be positive : ', length stop 1 endif + if (present(offset)) then + if (offset .lt. 0) then + write(*,'(a,i15)') 'ERROR: offset must be nonnegative : ', offset + stop 1 + endif + endif endif + ! transpose, stencil, dgemm if (present(order)) then if (order .lt. 1) then write(*,'(a,i7)') 'ERROR: order must be positive : ', order stop 1 endif - endif - - if (present(radius)) then - if (radius .lt. 1) then - write(*,'(a,i3)') 'ERROR: radius must be positive : ', radius - stop 1 - endif - endif - - if (present(offset)) then - if (offset .lt. 0) then - write(*,'(a,i15)') 'ERROR: offset must be nonnegative : ', offset - stop 1 + if (present(tile_size)) then + if ((tile_size .lt. 
1).or.(tile_size.gt.order)) then + write(*,'(a18,i3,a22,i5)') 'WARNING: tile_size ',tile_size,& + ' must be between 1 and ',order + tile_size = order ! no tiling + endif endif - endif - - if (present(tile_size)) then - if (tile_size .lt. 0) then - write(*,'(a,i3)') 'ERROR: tile_size must be nonnegative : ', tile_size - stop 1 + ! stencil + if (present(radius)) then + if (radius .lt. 1) then + write(*,'(a,i3)') 'ERROR: radius must be positive : ', radius + stop 1 + endif endif endif #endif diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index e76fc7900..8870133dd 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -57,10 +57,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -81,38 +78,14 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Serial Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** @@ -125,12 +98,6 @@ program main stop 1 endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - t0 = 0 - if (tile_size.lt.order) then do jt=1,order,tile_size do it=1,order,tile_size @@ -151,6 +118,8 @@ program main enddo endif + t0 = 0 + do k=0,iterations if (k.eq.1) then From 1c6dfd7165a9a4bb186273e81f0379b6060310e3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 11:21:06 +0200 Subject: [PATCH 171/325] new arg parse for stencil --- FORTRAN/prk_mod.F90 | 2 +- FORTRAN/stencil.F90 | 92 +++++++++++---------------------------------- 2 files changed, 23 insertions(+), 71 deletions(-) diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 06e346fef..cd6224e5e 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -188,7 +188,7 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? ! stencil if (present(radius)) then if (radius .lt. 
1) then - write(*,'(a,i3)') 'ERROR: radius must be positive : ', radius + write(*,'(a27)') 'ERROR: radius must be between 1 and 9' stop 1 endif endif diff --git a/FORTRAN/stencil.F90 b/FORTRAN/stencil.F90 index 8ca6116be..b4c8cc711 100644 --- a/FORTRAN/stencil.F90 +++ b/FORTRAN/stencil.F90 @@ -168,41 +168,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a43)') 'Fortran Serial Stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif - - tiling = .false. - tile_size = n - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - if ((tile_size .lt. 1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -211,48 +177,32 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius - - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 - endif - - allocate( A(n,n), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(n,n), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + tiling = (tile_size.ne.n) - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 - - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' + write(*,'(a22,a8)') 'Type of stencil = ','star' stencil_size = 4*r+1 else - write(*,'(a,a)') 'Type of stencil = grid' + write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a)') 'Untiled' + write(*,'(a10)') 'Tiling off' + endif + + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 endif call initialize_w(is_star,r,W) @@ -283,14 +233,16 @@ program main enddo ! 
iterations t1 = prk_get_wtime() + stencil_time = t1 - t0 + norm = 0.0d0 do j=r,n-r do i=r,n-r norm = norm + abs(B(i,j)) enddo enddo - stencil_time = t1 - t0 + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** From a548912d6ab22ab64456d1dbf08e1cdd56bb27d8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 15:11:24 +0200 Subject: [PATCH 172/325] new arg parse --- FORTRAN/nstream-coarray.F90 | 73 ++++++---------------- FORTRAN/nstream-cufortran.cuf | 83 ++++++------------------- FORTRAN/nstream-ga.F90 | 62 +++++++------------ FORTRAN/nstream-mpi.F90 | 60 ++++++------------ FORTRAN/nstream-openacc.F90 | 82 ++++++------------------- FORTRAN/nstream-openmp-target.F90 | 76 +++++++---------------- FORTRAN/nstream-openmp.F90 | 95 ++++++++--------------------- FORTRAN/nstream-pretty.F90 | 69 ++++----------------- FORTRAN/nstream-stdpar.F90 | 78 +++++------------------ FORTRAN/nstream-taskloop-openmp.F90 | 93 ++++++++-------------------- FORTRAN/nstream.F90 | 24 ++++---- 11 files changed, 205 insertions(+), 590 deletions(-) diff --git a/FORTRAN/nstream-coarray.F90 b/FORTRAN/nstream-coarray.F90 index 843f7d210..cca208f97 100644 --- a/FORTRAN/nstream-coarray.F90 +++ b/FORTRAN/nstream-coarray.F90 @@ -68,15 +68,12 @@ program main use prk implicit none integer :: me, np, p - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations - integer(kind=INT64) :: length - integer(kind=INT32) :: co_iterations[*] - integer(kind=INT64) :: co_length[*] + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset + integer(kind=INT32) :: co_iterations[*] + integer(kind=INT64) :: co_length[*] real(kind=REAL64), allocatable :: A(:)[:] real(kind=REAL64), allocatable :: B(:)[:] real(kind=REAL64), allocatable :: C(:)[:] @@ -101,34 +98,13 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a48)') 'Fortran coarray STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a49)') 'Usage: ./nstream <# iterations> ' - error stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - error stop 1 - endif - - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - error stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - write(*,'(a,i12)') 'Number of images = ', np - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length - endif + write(*,'(a23,i12)') 'Number of images = ', np + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset - if (me.eq.1) then ! co_broadcast is 2018 and not available in all coarray implementations do p=1,np co_iterations[p] = iterations @@ -146,37 +122,24 @@ program main ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length)[*], stat=err) - if (err .ne. 
0) then - write(*,'(a,i3)') 'allocation of A returned ',err - error stop 1 - endif - - allocate( B(length)[*], stat=err ) + allocate( A(length)[*], B(length)[*], C(length)[*], stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err error stop 1 endif - allocate( C(length)[*], stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - error stop 1 - endif - - scalar = 3 - - t0 = 0 - do concurrent (i=1:length) A(i) = 0 B(i) = 2 C(i) = 2 enddo - sync all ! barrier to ensure initialization is finished at all PEs + sync all - do k=0,iterations + scalar = 3 + t0 = 0 + + do k=0,iterations if (k.eq.1) then sync all ! barrier t0 = prk_get_wtime() @@ -218,9 +181,7 @@ program main enddo endif - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then if (me.eq.1) then diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf index 4a0586491..51ea47858 100644 --- a/FORTRAN/nstream-cufortran.cuf +++ b/FORTRAN/nstream-cufortran.cuf @@ -86,13 +86,10 @@ program main use nstream use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations, block_size - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations, block_size + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable, managed :: A(:) real(kind=REAL64), allocatable, managed :: B(:) real(kind=REAL64), allocatable, managed :: C(:) @@ -114,41 +111,12 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a45)') 'CUDA Fortran STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a64)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset,gpu_block_size=block_size) - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - block_size = 256 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') block_size - if (block_size .lt. 0) then - write(*,'(a,i5)') 'ERROR: block_size must be positive : ', block_size - stop 1 - endif - endif - - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length - write(*,'(a,i12)') 'GPU block size = ', block_size + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset + write(*,'(a23,i12)') 'GPU block size = ', block_size tblock = dim3(block_size,1,1) grid = dim3(ceiling(real(length)/tblock%x),1,1) @@ -157,37 +125,26 @@ program main ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 
0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif - allocate( C(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - stop 1 - endif - - scalar = 3 - - t0 = 0 - do i=1,length A(i) = 0 B(i) = 2 C(i) = 2 enddo - do k=0,iterations + scalar = 3 - if (k.eq.1) t0 = prk_get_wtime() + t0 = 0 + + do k=0,iterations + if (k.eq.1) then + t0 = prk_get_wtime() + endif call kernel<<>>(scalar, A, B, C) @@ -213,9 +170,7 @@ program main asum = asum + abs(A(i)-ar) enddo - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then write(*,'(a35)') 'Failed Validation on output array' @@ -228,8 +183,8 @@ program main avgtime = nstream_time/iterations; bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & - 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & - 'Avg time (s): ', avgtime + 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & + 'Avg time (s): ', avgtime endif end program main diff --git a/FORTRAN/nstream-ga.F90 b/FORTRAN/nstream-ga.F90 index 1838dd594..fc9343b6a 100644 --- a/FORTRAN/nstream-ga.F90 +++ b/FORTRAN/nstream-ga.F90 @@ -68,14 +68,13 @@ program main use iso_fortran_env use mpi_f08 + use prk implicit none #include "global.fh" -!#include 'ga-mpi.fh' ! unused +#include 'ga-mpi.fh' ! unused #include "mafdecls.fh" ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! MPI - should always use 32-bit INTEGER integer(kind=INT32), parameter :: requested = MPI_THREAD_SERIALIZED integer(kind=INT32) :: provided @@ -93,8 +92,8 @@ program main real(kind=REAL64), parameter :: one = 1.d0 real(kind=REAL64), parameter :: two = 2.d0 ! problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset integer(kind=INT64) :: bytes, max_mem real(kind=REAL64) :: scalar ! runtime variables @@ -102,7 +101,7 @@ program main integer(kind=INT32) :: k real(kind=REAL64) :: asum, ar, br, cr, atmp real(kind=REAL64) :: t0, t1, nstream_time, avgtime - real(kind=REAL64), parameter :: epsilon=1.d-8 + real(kind=REAL64), parameter :: epsilon=1.D-8 if (storage_size(length).ne.storage_size(me)) then write(*,'(a50)') 'You must compile with 64-bit INTEGER!' @@ -112,39 +111,7 @@ program main ! read and test input parameters ! ******************************************************************** - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 
0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif - - call mpi_init_thread(requested,provided) + call MPI_Init_thread(requested,provided) ! ask GA to allocate enough memory for 4 vectors, just to be safe max_mem = length * 4 * ( storage_size(scalar) / 8 ) @@ -172,12 +139,29 @@ program main if (me.eq.0) then write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a54)') 'Fortran Global Arrays STREAM triad: A = B + scalar * C' + + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) + write(*,'(a22,i12)') 'Number of GA procs = ', np write(*,'(a22,i12)') 'Number of iterations = ', iterations write(*,'(a22,i12)') 'Vector length = ', length write(*,'(a22,i12)') 'Offset = ', offset endif +#if 1 + call ga_brdcst(0,iterations,4,0) + call ga_brdcst(0,length,8,0) + call ga_brdcst(0,offset,8,0) +#else + block + integer :: comm + call ga_mpi_comm_pgroup_default(comm) + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, comm) + call MPI_Bcast(length, 1, MPI_INTEGER8, 0, comm) + call MPI_Bcast(offset, 1, MPI_INTEGER8, 0, comm) + end block +#endif + call ga_sync() ! ******************************************************************** diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index e428a8efc..f94ac2af4 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -40,10 +40,10 @@ ! a third vector. ! ! USAGE: The program takes as input the number -! of iterations to loop over the triad vectors and -! the length of the vectors. +! of iterations to loop over the triad vectors, the length of the +! vectors, and the offset between vectors ! -! <# iterations> +! <# iterations> ! ! The output consists of diagnostics to make sure the ! algorithm worked, and of timing statistics. @@ -69,14 +69,12 @@ program main use omp_lib #endif use mpi_f08 + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -112,34 +110,15 @@ program main write(*,'(a44)') 'Fortran MPI STREAM triad: A = B + scalar * C' #endif - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a49)') 'Usage: ./nstream <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 
1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) + write(*,'(a23,i12)') 'Number of MPI procs = ', np #ifdef _OPENMP - write(*,'(a23,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a23,i12)') 'Number of threads = ', omp_get_max_threads() #endif - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Number of iterations = ', iterations write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(length, 1, MPI_INTEGER8, 0, MPI_COMM_WORLD) @@ -154,15 +133,11 @@ program main call MPI_Abort(MPI_COMM_WORLD, 10) endif - scalar = 3 - - t0 = 0 - #ifdef _OPENMP - !$omp parallel default(none) & - !$omp& shared(A,B,C,t0,t1) & - !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k) + !$omp parallel default(none) & + !$omp& shared(A,B,C,nstream_time) & + !$omp& firstprivate(length,iterations) & + !$omp& private(i,k,scalar,t0,t1) #endif #if defined(_OPENMP) @@ -194,8 +169,11 @@ program main call MPI_Barrier(MPI_COMM_WORLD) !$omp end master + scalar = 3 + + t0 = 0 + do k=0,iterations - ! start timer after a warmup iteration if (k.eq.1) then call MPI_Barrier(MPI_COMM_WORLD) #ifdef _OPENMP diff --git a/FORTRAN/nstream-openacc.F90 b/FORTRAN/nstream-openacc.F90 index 9f71b155a..952e4ed35 100644 --- a/FORTRAN/nstream-openacc.F90 +++ b/FORTRAN/nstream-openacc.F90 @@ -67,13 +67,10 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -93,68 +90,22 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a48)') 'Fortran OpenACC STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif - - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length - write(*,'(a,i12)') 'Offset = ', offset + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset ! 
******************************************************************** ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif - allocate( C(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - stop 1 - endif - - scalar = 3 - - t0 = 0 - !$acc data copy(A) copyin(B,C) !$acc parallel loop @@ -164,9 +115,14 @@ program main C(i) = 2 enddo - do k=0,iterations + scalar = 3 - if (k.eq.1) t0 = prk_get_wtime() + t0 = 0 + + do k=0,iterations + if (k.eq.1) then + t0 = prk_get_wtime() + endif #if 1 !$acc parallel loop @@ -206,9 +162,7 @@ program main asum = asum + abs(A(i)-ar) enddo - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then write(*,'(a35)') 'Failed Validation on output array' @@ -221,8 +175,8 @@ program main avgtime = nstream_time/iterations; bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & - 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & - 'Avg time (s): ', avgtime + 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & + 'Avg time (s): ', avgtime endif end program main diff --git a/FORTRAN/nstream-openmp-target.F90 b/FORTRAN/nstream-openmp-target.F90 index 8b16e5a71..adda23a4a 100644 --- a/FORTRAN/nstream-openmp-target.F90 +++ b/FORTRAN/nstream-openmp-target.F90 @@ -43,7 +43,7 @@ ! of iterations to loop over the triad vectors, the length of the ! vectors, and the offset between vectors ! -! <# iterations> +! <# iterations> ! ! The output consists of diagnostics to make sure the ! algorithm worked, and of timing statistics. @@ -66,14 +66,12 @@ program main use iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -93,58 +91,23 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a54)') 'Fortran OpenMP TARGET STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> ' - stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 
1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - write(*,'(a,i12)') 'OpenMP default device = ', omp_get_default_device() - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Matrix length = ', length + write(*,'(a23,i12)') 'OpenMP default device = ', omp_get_default_device() + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif - allocate( C(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - stop 1 - endif - - scalar = 3 - - t0 = 0 - !$omp parallel do do i=1,length A(i) = 0 @@ -153,11 +116,16 @@ program main enddo !$omp end parallel do + scalar = 3 + + t0 = 0 + !$omp target data map(tofrom: A) map(to: B,C) map(to:length) do k=0,iterations - - if (k.eq.1) t0 = omp_get_wtime() + if (k.eq.1) then + t0 = omp_get_wtime() + endif !$omp target teams distribute parallel do simd GPU_SCHEDULE do i=1,length @@ -191,9 +159,7 @@ program main enddo !$omp end parallel do - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then write(*,'(a35)') 'Failed Validation on output array' @@ -206,8 +172,8 @@ program main avgtime = nstream_time/iterations; bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & - 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & - 'Avg time (s): ', avgtime + 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & + 'Avg time (s): ', avgtime endif end program main diff --git a/FORTRAN/nstream-openmp.F90 b/FORTRAN/nstream-openmp.F90 index 052b3005d..f6bd84467 100644 --- a/FORTRAN/nstream-openmp.F90 +++ b/FORTRAN/nstream-openmp.F90 @@ -66,14 +66,12 @@ program main use iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -93,73 +91,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran OpenMP STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 
1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif - - write(*,'(a,i12)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length - write(*,'(a,i12)') 'Offset = ', offset + write(*,'(a23,i12)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif - scalar = 3 - - t0 = 0 - - !$omp parallel default(none) & - !$omp& shared(A,B,C,t0,t1) & - !$omp& firstprivate(length,iterations,offset,scalar) & - !$omp& private(i,k) + !$omp parallel default(none) & + !$omp& shared(A,B,C,nstream_time) & + !$omp& firstprivate(length,iterations,offset) & + !$omp& private(i,k,t0,t1,scalar) !$omp do do i=1,length @@ -169,6 +121,10 @@ program main enddo !$omp end do + scalar = 3 + + t0 = 0 + ! need this because otherwise no barrier between initialization ! and iteration 0 (warmup), which will lead to incorrectness. !$omp barrier @@ -188,11 +144,14 @@ program main !$omp end do enddo ! iterations + !$omp barrier t1 = omp_get_wtime() - !$omp end parallel - + !$omp master nstream_time = t1 - t0 + !$omp end master + + !$omp end parallel ! ******************************************************************** ! ** Analyze and output results. @@ -212,9 +171,7 @@ program main enddo !$omp end parallel do - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then write(*,'(a35)') 'Failed Validation on output array' @@ -227,8 +184,8 @@ program main avgtime = nstream_time/iterations; bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & - 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & - 'Avg time (s): ', avgtime + 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & + 'Avg time (s): ', avgtime endif end program main diff --git a/FORTRAN/nstream-pretty.F90 b/FORTRAN/nstream-pretty.F90 index 39240f632..65a46ea95 100644 --- a/FORTRAN/nstream-pretty.F90 +++ b/FORTRAN/nstream-pretty.F90 @@ -67,13 +67,10 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! 
problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -92,61 +89,19 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran Pretty STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length - write(*,'(a,i12)') 'Offset = ', offset + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif @@ -180,9 +135,7 @@ program main asum = sum(abs(A-ar)) - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then write(*,'(a35)') 'Failed Validation on output array' @@ -195,8 +148,8 @@ program main avgtime = nstream_time/iterations; bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & - 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & - 'Avg time (s): ', avgtime + 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & + 'Avg time (s): ', avgtime endif end program main diff --git a/FORTRAN/nstream-stdpar.F90 b/FORTRAN/nstream-stdpar.F90 index de4d98671..3e747787f 100644 --- a/FORTRAN/nstream-stdpar.F90 +++ b/FORTRAN/nstream-stdpar.F90 @@ -67,13 +67,10 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! 
problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -93,77 +90,36 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran stdpar STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif - - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Vector length = ', length - write(*,'(a,i12)') 'Offset = ', offset + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif - allocate( C(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - stop 1 - endif - - scalar = 3 - - t0 = 0 - do concurrent (i=1:length) A(i) = 0 B(i) = 2 C(i) = 2 enddo - do k=0,iterations + scalar = 3 - if (k.eq.1) t0 = prk_get_wtime() + t0 = 0 + + do k=0,iterations + if (k.eq.1) then + t0 = prk_get_wtime() + endif do concurrent (i=1:length) A(i) = A(i) + B(i) + scalar * C(i) @@ -190,9 +146,7 @@ program main asum = asum + abs(A(i)-ar) enddo - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. epsilon) then write(*,'(a35)') 'Failed Validation on output array' diff --git a/FORTRAN/nstream-taskloop-openmp.F90 b/FORTRAN/nstream-taskloop-openmp.F90 index 716d5a83a..2cce93824 100644 --- a/FORTRAN/nstream-taskloop-openmp.F90 +++ b/FORTRAN/nstream-taskloop-openmp.F90 @@ -66,14 +66,12 @@ program main use iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! 
problem definition - integer(kind=INT32) :: iterations, offset - integer(kind=INT64) :: length + integer(kind=INT32) :: iterations + integer(kind=INT64) :: length, offset real(kind=REAL64), allocatable :: A(:) real(kind=REAL64), allocatable :: B(:) real(kind=REAL64), allocatable :: C(:) @@ -93,73 +91,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran OpenMP TASKLOOP STREAM triad: A = B + scalar * C' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./nstream <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset) - length = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') length - if (length .lt. 1) then - write(*,'(a,i5)') 'ERROR: length must be nonnegative : ', length - stop 1 - endif - - offset = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') offset - if (offset .lt. 0) then - write(*,'(a,i5)') 'ERROR: offset must be positive : ', offset - stop 1 - endif - endif - - write(*,'(a,i12)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i12)') 'Number of iterations = ', iterations - write(*,'(a,i12)') 'Matrix length = ', length - write(*,'(a,i12)') 'Offset = ', offset + write(*,'(a23,i12)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a23,i12)') 'Number of iterations = ', iterations + write(*,'(a23,i12)') 'Vector length = ', length + write(*,'(a23,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space and perform the computation ! ******************************************************************** - allocate( A(length), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(length), stat=err ) + allocate( A(length), B(length), C(length), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a20,i3)') 'allocation returned ',err stop 1 endif - allocate( C(length), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - stop 1 - endif - - scalar = 3 - - t0 = 0 - - !$omp parallel default(none) & - !$omp& shared(A,B,C,t0,t1) & - !$omp& firstprivate(length,iterations,offset,scalar) & - !$omp& private(i,k) + !$omp parallel default(none) & + !$omp& shared(A,B,C,nstream_time) & + !$omp& firstprivate(length,iterations,offset) & + !$omp& private(i,k,t0,t1,scalar) !$omp master !$omp taskloop firstprivate(length,offset) shared(A,B,C) private(i) @@ -170,11 +122,16 @@ program main enddo !$omp end taskloop + scalar = 3 + + t0 = 0 + !$omp taskwait do k=0,iterations - - if (k.eq.1) t0 = omp_get_wtime() + if (k.eq.1) then + t0 = omp_get_wtime() + endif !$omp taskloop firstprivate(length,offset) shared(A,B,C) private(i) do i=1,length @@ -211,9 +168,7 @@ program main enddo !$omp end parallel do - deallocate( C ) - deallocate( B ) - deallocate( A ) + deallocate( A,B,C ) if (abs(asum) .gt. 
epsilon) then
       write(*,'(a35)') 'Failed Validation on output array'
@@ -226,8 +181,8 @@ program main
     avgtime = nstream_time/iterations;
     bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)') &
-        'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
-        'Avg time (s): ', avgtime
+         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
+         'Avg time (s): ', avgtime
   endif

 end program main
diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90
index a52624394..85fff7971 100644
--- a/FORTRAN/nstream.F90
+++ b/FORTRAN/nstream.F90
@@ -92,9 +92,9 @@ program main

   call prk_get_arguments('nstream',iterations=iterations,length=length,offset=offset)

-  write(*,'(a,i12)') 'Number of iterations = ', iterations
-  write(*,'(a,i12)') 'Vector length = ', length
-  write(*,'(a,i12)') 'Offset = ', offset
+  write(*,'(a23,i12)') 'Number of iterations = ', iterations
+  write(*,'(a23,i12)') 'Vector length = ', length
+  write(*,'(a23,i12)') 'Offset = ', offset

   ! ********************************************************************
   ! ** Allocate space and perform the computation
@@ -102,14 +102,10 @@ program main

   allocate( A(length), B(length), C(length), stat=err)
   if (err .ne. 0) then
-    write(*,'(a,i3)') 'allocation returned ',err
+    write(*,'(a20,i3)') 'allocation returned ',err
     stop 1
   endif

-  scalar = 3
-
-  t0 = 0
-
 #if 0
   forall (i=1:length)
     A(i) = 0
@@ -124,6 +120,10 @@ program main
   enddo
 #endif

+  scalar = 3
+
+  t0 = 0
+
   do k=0,iterations
     if (k.eq.1) then
       t0 = prk_get_wtime()
@@ -160,9 +160,7 @@ program main
     asum = asum + abs(A(i)-ar)
   enddo

-  deallocate( C )
-  deallocate( B )
-  deallocate( A )
+  deallocate( A,B,C )

   if (abs(asum) .gt. epsilon) then
     write(*,'(a35)') 'Failed Validation on output array'
@@ -175,8 +173,8 @@ program main
     avgtime = nstream_time/iterations;
     bytes = 4 * int(length,INT64) * storage_size(A)/8
     write(*,'(a12,f15.3,1x,a12,e15.6)') &
-        'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
-        'Avg time (s): ', avgtime
+         'Rate (MB/s): ', 1.d-6*bytes/avgtime, &
+         'Avg time (s): ', avgtime
   endif

 end program main

From 53a5c2208b438f534a2de43c5a99bb33fe6cfc97 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 16 Dec 2021 15:11:36 +0200
Subject: [PATCH 173/325] new arg parse
---
 FORTRAN/dgemm-blas.F90            | 50 +++-------
 FORTRAN/dgemm-ga.F90              | 52 ++++++++++------------
 FORTRAN/dgemm-openmp-target.F90   | 71 ++++++------------------
 FORTRAN/dgemm-openmp.F90          | 67 +++++-----------------
 FORTRAN/dgemm-pretty.F90          | 49 +++-------
 FORTRAN/dgemm-stdpar.F90          | 67 +++++-----------------
 FORTRAN/dgemm-taskloop-openmp.F90 | 73 +++++++------------------
 FORTRAN/dgemm.F90                 | 65 +++++----------------
 8 files changed, 94 insertions(+), 400 deletions(-)

diff --git a/FORTRAN/dgemm-blas.F90 b/FORTRAN/dgemm-blas.F90
index 171ca5113..3f1a54f2a 100644
--- a/FORTRAN/dgemm-blas.F90
+++ b/FORTRAN/dgemm-blas.F90
@@ -1,5 +1,6 @@
 !
 ! Copyright (c) 2017, Intel Corporation
+! Copyright (c) 2021, NVIDIA
 !
 ! Redistribution and use in source and binary forms, with or without
 ! modification, are permitted provided that the following conditions
@@ -58,10 +59,7 @@ program main
 #endif
   use prk
   implicit none
-  ! for argument parsing
   integer :: err
-  integer :: arglen
-  character(len=32) :: argtmp
   ! problem definition
   integer(kind=INT32) :: iterations                ! number of times to do the kernel
   integer(kind=INT32) :: order                     ! order of the matrix
@@ -75,7 +73,7 @@ program main
   integer(kind=INT32) :: i,j,k
   real(kind=REAL64) :: checksum, reference, residuum
   real(kind=REAL64) :: t0, t1, dgemm_time, avgtime ! 
timing parameters - real(kind=REAL64), parameter :: epsilon=1.0d-8 ! error tolerance + real(kind=REAL64), parameter :: epsilon=1.0d-8 ! error tolerance ! ******************************************************************** ! read and test input parameters @@ -84,27 +82,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a59)') 'Fortran BLAS Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a66)') 'Usage: ./dgemm-pretty <# iterations> ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order) #ifdef _OPENMP write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() @@ -116,21 +94,9 @@ program main ! ** Allocate space for the input and output matrices ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), C(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -155,16 +121,12 @@ program main t1 = prk_get_wtime() - dgemm_time = t1 - t0 ! ******************************************************************** ! ** Analyze and output results. ! ******************************************************************** - deallocate( A ) - deallocate( B ) - forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 @@ -174,7 +136,7 @@ program main enddo enddo - deallocate( C ) + deallocate( A,B,C ) residuum = abs(checksum-reference)/reference if (residuum .lt. epsilon) then diff --git a/FORTRAN/dgemm-ga.F90 b/FORTRAN/dgemm-ga.F90 index fd5565509..cd52c43e6 100644 --- a/FORTRAN/dgemm-ga.F90 +++ b/FORTRAN/dgemm-ga.F90 @@ -55,14 +55,13 @@ program main use iso_fortran_env use mpi_f08 + use prk implicit none #include "global.fh" +#include 'ga-mpi.fh' ! unused #include "mafdecls.fh" -!#include 'ga-mpi.fh' ! unused ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! MPI - should always use 32-bit INTEGER integer(kind=INT32), parameter :: requested = MPI_THREAD_SERIALIZED integer(kind=INT32) :: provided @@ -92,28 +91,6 @@ program main ! read and test input parameters ! ******************************************************************** - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./dgemm-ga <# iterations> ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 
1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - call mpi_init_thread(requested,provided) !call ga_initialize() @@ -145,19 +122,32 @@ program main if (me.eq.0) then write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a68)') 'Fortran Global Arrays Dense matrix-matrix multiplication: C += A x B' - write(*,'(a22,i12)') 'Number of GA procs = ', np - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order + + call prk_get_arguments('dgemm',iterations=iterations,order=order,tile_size=tile_size) + + write(*,'(a22,i12)') 'Number of GA procs = ', np + write(*,'(a22,i12)') 'Number of iterations = ', iterations + write(*,'(a22,i12)') 'Matrix order = ', order endif +#if 1 + call ga_brdcst(0,iterations,4,0) + call ga_brdcst(0,order, 4,0) +#else + block + integer :: comm + call ga_mpi_comm_pgroup_default(comm) + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, comm) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, comm) + end block +#endif + call ga_sync() ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - t0 = 0.0d0 - !print*,'order=',order ! must cast int32 order to integer... ok = ga_create(MT_DBL, int(order), int(order),'A',-1,-1, A) @@ -207,6 +197,8 @@ program main call ga_print(A) endif + t0 = 0.0d0 + do k=0,iterations ! start timer after a warmup iteration diff --git a/FORTRAN/dgemm-openmp-target.F90 b/FORTRAN/dgemm-openmp-target.F90 index 13fb821c1..7ed137a4a 100644 --- a/FORTRAN/dgemm-openmp-target.F90 +++ b/FORTRAN/dgemm-openmp-target.F90 @@ -55,11 +55,9 @@ program main use iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the kernel integer(kind=INT32) :: order ! order of the matrix @@ -82,63 +80,24 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a70)') 'Fortran OpenMP TARGET Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a65)') 'Usage: ./dgemm-pretty <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order,tile_size=tile_size) - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.lt.1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size, & - ' must be >= 1 and <= ',order - tile_size = order ! 
no tiling + write(*,'(a22,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and output matrices ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), C(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -168,17 +127,15 @@ program main enddo t1 = omp_get_wtime() - dgemm_time = t1 - t0 !$omp end target data + dgemm_time = t1 - t0 + ! ******************************************************************** ! ** Analyze and output results. ! ******************************************************************** - deallocate( A ) - deallocate( B ) - forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 @@ -190,7 +147,7 @@ program main enddo !$omp end parallel do simd - deallocate( C ) + deallocate( A,B,C ) residuum = abs(checksum-reference)/reference if (residuum .lt. epsilon) then diff --git a/FORTRAN/dgemm-openmp.F90 b/FORTRAN/dgemm-openmp.F90 index 0405e6b1d..17ccb29cf 100644 --- a/FORTRAN/dgemm-openmp.F90 +++ b/FORTRAN/dgemm-openmp.F90 @@ -99,10 +99,7 @@ program main use omp_lib use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the kernel integer(kind=INT32) :: order ! order of the matrix @@ -125,63 +122,24 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a61)') 'Fortran OpenMP Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a66)') 'Usage: ./dgemm-pretty <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order,tile_size=tile_size) - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.lt.1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size, & - ' must be >= 1 and <= ',order - tile_size = order ! 
no tiling + write(*,'(a22,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and output matrices ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), C(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -223,9 +181,6 @@ program main ! ** Analyze and output results. ! ******************************************************************** - deallocate( A ) - deallocate( B ) - forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 @@ -237,7 +192,7 @@ program main enddo !$omp end parallel do - deallocate( C ) + deallocate( A,B,C ) residuum = abs(checksum-reference)/reference if (residuum .lt. epsilon) then diff --git a/FORTRAN/dgemm-pretty.F90 b/FORTRAN/dgemm-pretty.F90 index 592bfa6e3..25d4b6140 100644 --- a/FORTRAN/dgemm-pretty.F90 +++ b/FORTRAN/dgemm-pretty.F90 @@ -60,10 +60,7 @@ program main #endif use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the kernel integer(kind=INT32) :: order ! order of the matrix @@ -85,27 +82,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a61)') 'Fortran Pretty Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./dgemm-pretty <# iterations> ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order) write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8)') 'Matrix order = ', order @@ -114,30 +91,17 @@ program main ! ** Allocate space for the input and output matrices ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), C(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 
0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err - stop 1 - endif - - ! Fill the original matrix do i=1, order A(:,i) = real(i-1,REAL64) B(:,i) = real(i-1,REAL64) + C(:,i) = real(0,REAL64) enddo - C = 0 t0 = 0 @@ -153,14 +117,11 @@ program main ! ** Analyze and output results. ! ******************************************************************** - deallocate( B ) - deallocate( A ) - forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = sum(C) - deallocate( C ) + deallocate( A,B,C ) residuum = abs(checksum-reference)/reference if (residuum .lt. epsilon) then diff --git a/FORTRAN/dgemm-stdpar.F90 b/FORTRAN/dgemm-stdpar.F90 index 84e078dc3..9cb8a49e1 100644 --- a/FORTRAN/dgemm-stdpar.F90 +++ b/FORTRAN/dgemm-stdpar.F90 @@ -54,7 +54,6 @@ subroutine prk_dgemm(order, tile_size, A, B, C) use iso_fortran_env - use prk implicit none integer(kind=INT32), intent(in) :: order, tile_size real(kind=REAL64), intent(in) :: A(order,order) @@ -89,11 +88,9 @@ end subroutine prk_dgemm program main use iso_fortran_env + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the kernel integer(kind=INT32) :: order ! order of the matrix @@ -116,62 +113,23 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a61)') 'Fortran STDPAR Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a66)') 'Usage: ./dgemm-pretty <# iterations> []' - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order,tile_size=tile_size) - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.lt.1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size, & - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and output matrices ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), C(order,order), stat=err) if (err .ne. 
0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -198,9 +156,6 @@ program main ! ** Analyze and output results. ! ******************************************************************** - deallocate( A ) - deallocate( B ) - forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 @@ -210,7 +165,7 @@ program main enddo enddo - deallocate( C ) + deallocate( A,B,C ) residuum = abs(checksum-reference)/reference if (residuum .lt. epsilon) then diff --git a/FORTRAN/dgemm-taskloop-openmp.F90 b/FORTRAN/dgemm-taskloop-openmp.F90 index db02c0402..e663cbf72 100644 --- a/FORTRAN/dgemm-taskloop-openmp.F90 +++ b/FORTRAN/dgemm-taskloop-openmp.F90 @@ -106,10 +106,7 @@ program main use omp_lib use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the kernel integer(kind=INT32) :: order ! order of the matrix @@ -123,7 +120,7 @@ program main integer(kind=INT32) :: i,j,k real(kind=REAL64) :: checksum, reference, residuum real(kind=REAL64) :: t0, t1, dgemm_time, avgtime ! timing parameters - real(kind=REAL64), parameter :: epsilon=1.0d-8 ! error tolerance + real(kind=REAL64), parameter :: epsilon=1.0d-8 ! error tolerance ! ******************************************************************** ! read and test input parameters @@ -132,63 +129,24 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a70)') 'Fortran OpenMP TASKLOOP Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./dgemm-pretty <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order,tile_size=tile_size) - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.lt.1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size, & - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and output matrices ! 
******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), C(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - allocate( C(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of C returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -208,7 +166,9 @@ program main t0 = 0 do k=0,iterations - if (k.eq.1) t0 = prk_get_wtime() + if (k.eq.1) then + t0 = prk_get_wtime() + endif call prk_dgemm(order, tile_size, A, B, C) enddo @@ -223,9 +183,6 @@ program main ! ** Analyze and output results. ! ******************************************************************** - deallocate( A ) - deallocate( B ) - forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 @@ -237,7 +194,7 @@ program main enddo !$omp end parallel do simd - deallocate( C ) + deallocate( A,B,C ) residuum = abs(checksum-reference)/reference if (residuum .lt. epsilon) then diff --git a/FORTRAN/dgemm.F90 b/FORTRAN/dgemm.F90 index 1167f589d..47e968e2c 100644 --- a/FORTRAN/dgemm.F90 +++ b/FORTRAN/dgemm.F90 @@ -105,10 +105,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the kernel integer(kind=INT32) :: order ! order of the matrix @@ -131,62 +128,23 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a61)') 'Fortran Serial Dense matrix-matrix multiplication: C += A x B' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a66)') 'Usage: ./dgemm-pretty <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('dgemm',iterations=iterations,order=order,tile_size=tile_size) - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.lt.1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size, & - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and output matrices ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 
0) then
-    write(*,'(a,i3)') 'allocation of A returned ',err
-    stop 1
-  endif
-
-  allocate( B(order,order), stat=err )
+  allocate( A(order,order), B(order,order), C(order,order), stat=err)
   if (err .ne. 0) then
-    write(*,'(a,i3)') 'allocation of B returned ',err
-    stop 1
-  endif
-
-  allocate( C(order,order), stat=err )
-  if (err .ne. 0) then
-    write(*,'(a,i3)') 'allocation of C returned ',err
+    write(*,'(a,i3)') 'allocation returned ',err
     stop 1
   endif
@@ -213,9 +171,6 @@ program main
   ! ** Analyze and output results.
   ! ********************************************************************

-  deallocate( A )
-  deallocate( B )
-
   forder = real(order,REAL64)
   reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1)
   checksum = 0.0d0
@@ -225,7 +180,7 @@ program main
     enddo
   enddo

-  deallocate( C )
+  deallocate( A,B,C )

   residuum = abs(checksum-reference)/reference
   if (residuum .lt. epsilon) then

From 9ae235e21850fe6bf2c4c8d9368717956e3771c3 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Thu, 16 Dec 2021 15:11:43 +0200
Subject: [PATCH 174/325] new arg parse
---
 FORTRAN/stencil-coarray.F90         | 131 +++++++---------------------
 FORTRAN/stencil-openacc.F90         |  97 +++++---------------
 FORTRAN/stencil-openmp-target.F90   | 104 +++++-----------------
 FORTRAN/stencil-openmp.F90          |  98 +++++----------------
 FORTRAN/stencil-pretty.F90          |  86 ++++++------------
 FORTRAN/stencil-stdpar.F90          | 105 +++++-----------------
 FORTRAN/stencil-taskloop-openmp.F90 |  98 +++++----------------
 FORTRAN/stencil.F90                 |   8 +-
 8 files changed, 172 insertions(+), 555 deletions(-)

diff --git a/FORTRAN/stencil-coarray.F90 b/FORTRAN/stencil-coarray.F90
index 3c7e8e3b1..61c248cc7 100644
--- a/FORTRAN/stencil-coarray.F90
+++ b/FORTRAN/stencil-coarray.F90
@@ -1,5 +1,6 @@
 !
 ! Copyright (c) 2013, Intel Corporation
+! Copyright (c) 2021, NVIDIA
 !
 ! Redistribution and use in source and binary forms, with or without
 ! modification, are permitted provided that the following conditions
@@ -147,10 +148,7 @@ program main
   use iso_fortran_env
   use prk
   implicit none
-  ! for argument parsing
   integer :: err
-  integer :: arglen
-  character(len=32) :: argtmp
   ! problem definition
   integer(kind=INT32) :: iterations      ! number of times to run the pipeline algorithm
   integer(kind=INT32) :: n               ! linear grid dimension
@@ -182,49 +180,14 @@ program main
   if (me == 1) then
     write(*,'(a25)') 'Parallel Research Kernels'
     write(*,'(a44)') 'Fortran coarray stencil execution on 2D grid'
-  endif

-  if (command_argument_count().lt.2) then
-    if (me == 1) then
-      write(*,'(a17,i1)') 'argument count = ', command_argument_count()
-      write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', &
-                           ' [tile_size]'
-    endif
-    stop 1
-  endif
+    call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size)

-  iterations = 1
-  call get_command_argument(1,argtmp,arglen,err)
-  if (err.eq.0) read(argtmp,'(i32)') iterations
-  if (iterations .lt. 1) then
-    if (me == 1) then
-      write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations
-    endif
-    stop 1
   endif

-  n = 1
-  call get_command_argument(2,argtmp,arglen,err)
-  if (err.eq.0) read(argtmp,'(i32)') n
-  if (n .lt. 1) then
-    if (me == 1) then
-      write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n
-    endif
-    stop 1
-  endif
-
-  tiling = .false.
-  tile_size = 0
-  if (command_argument_count().gt.2) then
-    call get_command_argument(3,argtmp,arglen,err)
-    if (err.eq.0) read(argtmp,'(i32)') tile_size
-    if ((tile_size .lt. 
1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call co_broadcast(iterations,source_image=1) + call co_broadcast(n,source_image=1) + call co_broadcast(tile_size,source_image=1) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -233,24 +196,30 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius + tiling = (tile_size.ne.n) - if (r .lt. 1) then - if (me == 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' + if (me == 1) then + write(*,'(a,i8)') 'Number of images = ',num_images() + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8)') 'Grid size = ', n + write(*,'(a,i8)') 'Radius of stencil = ', r + if (is_star) then + write(*,'(a,a)') 'Type of stencil = star' + stencil_size = 4*r+1 + else + write(*,'(a,a)') 'Type of stencil = grid' + stencil_size = (2*r+1)**2 endif - stop 1 - else if ((2*r+1) .gt. n) then - if (me == 1) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n + if (tiling) then + write(*,'(a,i5)') 'Tile size = ', tile_size + else + write(*,'(a)') 'Untiled' endif - stop 1 endif -! Collectives are part of Fortran 2015 -! call co_broadcast(n,source_image=1) -! call co_broadcast(iterations,source_image=1) + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** dims(1) = int(sqrt(real(np))) dims(2) = int(np/dims(1)) @@ -281,42 +250,12 @@ program main if(modulo(n,nr) > 0) nr_g = nr_g + 1 if(modulo(n,nc) > 0) nc_g = nc_g + 1 - allocate( A(1-r:nr_g+r,1-r:nc_g+r)[dims(1),*], stat=err) + allocate( A(1-r:nr_g+r,1-r:nc_g+r)[dims(1),*], B(1:nr_g,1:nc_g), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(1:nr_g,1:nc_g), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 - - if (me == 1) then - write(*,'(a,i8)') 'Number of images = ',num_images() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r - if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' - stencil_size = 4*r+1 - else - write(*,'(a,a)') 'Type of stencil = grid' - stencil_size = (2*r+1)**2 - endif - !write(*,'(a)') 'Data type = double precision' - !write(*,'(a)') 'Compact representation of stencil loop body' - if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size - else - write(*,'(a)') 'Untiled' - endif - endif - call initialize_w(is_star,r,W) ! Getting the remote size of the upper and left images @@ -469,6 +408,7 @@ program main enddo ! iterations t1 = prk_get_wtime() + stencil_time = t1 - t0 sync all @@ -484,29 +424,22 @@ program main if(coords(1) == dims(1)) end_i = nr - r if(coords(2) == dims(2)) end_j = nc - r + norm = 0.d0 do j=start_j,end_j do i=start_i,end_i norm = norm + abs(B(i,j)) enddo enddo - stencil_time = t1 - t0 -! Collectives are part of Fortran 2015 -! 
call co_sum(norm,result_image=1) - sync all - if(me == 1) then - do i=2,np - norm = norm + norm[i] - enddo - norm = norm / real(active_points,REAL64) - endif + active_points = int(n-2*r,INT64)**2 + call co_sum(norm,result_image=1) + norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! Jeff: valgrind says that this is branching on uninitialized value (norm), ! but this is nonsense since norm is initialized to 0 at line 167. diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index a6aec7726..c656ec9f1 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -145,10 +145,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension @@ -175,41 +172,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran OpenACC Stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif - - tiling = .false. - tile_size = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - if ((tile_size .lt. 1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -218,48 +181,32 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius - - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 - endif - - allocate( A(n,n), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(n,n), stat=err ) - if (err .ne. 
0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + tiling = (tile_size.ne.n) - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 - - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' + write(*,'(a22,a8)') 'Type of stencil = ', 'star' stencil_size = 4*r+1 else - write(*,'(a,a)') 'Type of stencil = grid' + write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a)') 'Untiled' + write(*,'(a10)') 'Tiling off' + endif + + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 endif call initialize_w(is_star,r,W) @@ -294,6 +241,7 @@ program main enddo ! iterations t1 = prk_get_wtime() + stencil_time = t1 - t0 !$acc end data @@ -304,15 +252,14 @@ program main enddo enddo - stencil_time = t1 - t0 + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! verify correctness reference_norm = real(iterations+1,REAL64) * (cx + cy); diff --git a/FORTRAN/stencil-openmp-target.F90 b/FORTRAN/stencil-openmp-target.F90 index 6dcfe97b6..23a5a8868 100644 --- a/FORTRAN/stencil-openmp-target.F90 +++ b/FORTRAN/stencil-openmp-target.F90 @@ -152,10 +152,7 @@ program main use omp_lib use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension @@ -183,41 +180,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a43)') 'Fortran OpenMP TARGET Stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif - - tiling = .false. - tile_size = 0 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - if ((tile_size .lt. 
1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -226,54 +189,36 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius - - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 - endif - - allocate( A(n,n), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(n,n), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 + tiling = (tile_size.ne.n) - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' + write(*,'(a22,a8)') 'Type of stencil = ', 'star' stencil_size = 4*r+1 else - write(*,'(a,a)') 'Type of stencil = grid' + write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a)') 'Untiled' + write(*,'(a10)') 'Tiling off' + endif + + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 endif call initialize_w(is_star,r,W) - ! HOST - ! intialize the input and output arrays !$omp parallel do collapse(2) default(none) shared(n,A,B) private(i,j) do j=1,n do i=1,n @@ -289,13 +234,13 @@ program main do k=0,iterations - if (k.eq.1) t0 = omp_get_wtime() + if (k.eq.1) then + t0 = omp_get_wtime() + endif - ! DEVICE ! Apply the stencil operator call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - ! DEVICE ! add constant to solution to force refresh of neighbor data, if any !$omp target teams distribute parallel do simd collapse(2) GPU_SCHEDULE do j=1,n @@ -313,7 +258,6 @@ program main stencil_time = t1 - t0 - ! HOST ! compute L1 norm in parallel !$omp parallel do collapse(2) & !$omp& default(none) shared(n,B) private(i,j) & @@ -324,14 +268,14 @@ program main enddo enddo !$omp end parallel do + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! 
verify correctness reference_norm = real(iterations+1,REAL64) * (cx + cy); diff --git a/FORTRAN/stencil-openmp.F90 b/FORTRAN/stencil-openmp.F90 index 83f191c65..507fbd49c 100644 --- a/FORTRAN/stencil-openmp.F90 +++ b/FORTRAN/stencil-openmp.F90 @@ -147,10 +147,7 @@ program main use omp_lib use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension @@ -177,41 +174,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a43)') 'Fortran OpenMP Stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif - - tiling = .false. - tile_size = n - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - if ((tile_size .lt. 1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -220,49 +183,33 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius - - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 - endif + tiling = (tile_size.ne.n) - allocate( A(n,n), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(n,n), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 - - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' + write(*,'(a22,a8)') 'Type of stencil = ', 'star' stencil_size = 4*r+1 else - write(*,'(a,a)') 'Type of stencil = grid' + write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a)') 'Untiled' + write(*,'(a10)') 'Tiling off' + endif + + ! ******************************************************************** + ! 
** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 endif call initialize_w(is_star,r,W) @@ -323,15 +270,14 @@ program main !$omp end parallel - stencil_time = t1 - t0 + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! verify correctness reference_norm = real(iterations+1,REAL64) * (cx + cy); diff --git a/FORTRAN/stencil-pretty.F90 b/FORTRAN/stencil-pretty.F90 index 68386f5c5..44ba2e4ff 100644 --- a/FORTRAN/stencil-pretty.F90 +++ b/FORTRAN/stencil-pretty.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2013, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -145,14 +146,13 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension integer(kind=INT32) :: stencil_size ! number of points in stencil + integer(kind=INT32) :: tile_size ! loop nest block factor + logical :: tiling ! boolean indication loop nest blocking logical :: is_star ! true = star, false = grid integer(kind=INT32), parameter :: r=RADIUS ! radius of stencil real(kind=REAL64) :: W(-r:r,-r:r) ! weights of points in the stencil @@ -173,28 +173,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a43)') 'Fortran pretty stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -203,45 +182,34 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius + tiling = (tile_size.ne.n) - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r + if (is_star) then + write(*,'(a22,a8)') 'Type of stencil = ', 'star' + stencil_size = 4*r+1 + else + write(*,'(a22,a8)') 'Type of stencil = ','grid' + stencil_size = (2*r+1)**2 endif - - allocate( A(n,n), stat=err) - if (err .ne. 
0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 + if (tiling) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - allocate( B(n,n), stat=err ) + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 - - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r - if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' - stencil_size = 4*r+1 - else - write(*,'(a,a)') 'Type of stencil = grid' - stencil_size = (2*r+1)**2 - endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' - call initialize_w(is_star,r,W) ! initialize the input and output arrays @@ -281,18 +249,18 @@ program main enddo ! iterations t1 = prk_get_wtime() + stencil_time = t1 - t0 norm = sum(sum(abs(B(r+1:n-r,r+1:n-r)),1)) - stencil_time = t1 - t0 + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! Jeff: valgrind says that this is branching on uninitialized value (norm), ! but this is nonsense since norm is initialized to 0.0 at line 167. diff --git a/FORTRAN/stencil-stdpar.F90 b/FORTRAN/stencil-stdpar.F90 index c8ce01f9e..01336bfaa 100644 --- a/FORTRAN/stencil-stdpar.F90 +++ b/FORTRAN/stencil-stdpar.F90 @@ -137,10 +137,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension @@ -168,41 +165,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a43)') 'Fortran stdpar Stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif - - tiling = .false. - tile_size = n - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - if ((tile_size .lt. 1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! 
TODO: parse runtime input for star/grid #ifdef STAR @@ -211,31 +174,7 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius - - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 - endif - - allocate( A(n,n), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(n,n), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 + tiling = (tile_size.ne.n) write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Grid size = ', n @@ -247,12 +186,20 @@ program main write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - !write(*,'(a)') 'Data type = double precision' - !write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a22,i8)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a22)') 'Untiled' + write(*,'(a10)') 'Tiling off' + endif + + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 endif call initialize_w(is_star,r,W) @@ -272,37 +219,27 @@ program main call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) ! add constant to solution to force refresh of neighbor data, if any - !do concurrent (j=1:n, i=1:n) - ! A(i,j) = A(i,j) + 1.d0 - !enddo - do j=1,n - do i=1,n - A(i,j) = A(i,j) + 1.d0 - enddo + do concurrent (j=1:n, i=1:n) + A(i,j) = A(i,j) + 1.d0 enddo enddo ! iterations t1 = prk_get_wtime() + stencil_time = t1 - t0 - !do concurrent (j=r:n-r, i=r:n-r) - ! norm = norm + abs(B(i,j)) - !enddo - do j=r,n-r - do i=r,n-r - norm = norm + abs(B(i,j)) - enddo + do concurrent (j=r:n-r, i=r:n-r) + norm = norm + abs(B(i,j)) enddo - stencil_time = t1 - t0 + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! verify correctness reference_norm = real(iterations+1,REAL64) * (cx + cy); diff --git a/FORTRAN/stencil-taskloop-openmp.F90 b/FORTRAN/stencil-taskloop-openmp.F90 index 495fa0626..de1d76501 100644 --- a/FORTRAN/stencil-taskloop-openmp.F90 +++ b/FORTRAN/stencil-taskloop-openmp.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2013, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -148,10 +149,7 @@ program main use omp_lib use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! 
linear grid dimension @@ -178,41 +176,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a52)') 'Fortran OpenMP TASKLOOP Stencil execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a32,a29)') 'Usage: ./stencil <# iterations> ', & - ' [tile_size]' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - if (n .lt. 1) then - write(*,'(a,i5)') 'ERROR: array dimension must be >= 1 : ', n - stop 1 - endif - - tiling = .false. - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - if ((tile_size .lt. 1).or.(tile_size.gt.n)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',n - else - tiling = .true. - endif - endif + call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -221,49 +185,32 @@ program main is_star = .false. #endif - ! TODO: parse runtime input for radius - - if (r .lt. 1) then - write(*,'(a,i5,a)') 'ERROR: Stencil radius ',r,' should be positive' - stop 1 - else if ((2*r+1) .gt. n) then - write(*,'(a,i5,a,i5)') 'ERROR: Stencil radius ',r,& - ' exceeds grid size ',n - stop 1 - endif - - allocate( A(n,n), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(n,n), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - norm = 0.d0 - active_points = int(n-2*r,INT64)**2 + tiling = (tile_size.ne.n) - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Grid size = ', n - write(*,'(a,i8)') 'Radius of stencil = ', r + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Grid size = ', n + write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a,a)') 'Type of stencil = star' + write(*,'(a22,a8)') 'Type of stencil = ', 'star' stencil_size = 4*r+1 else - write(*,'(a,a)') 'Type of stencil = grid' + write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - write(*,'(a)') 'Data type = double precision' - write(*,'(a)') 'Compact representation of stencil loop body' if (tiling) then - write(*,'(a,i5)') 'Tile size = ', tile_size + write(*,'(a22,i8)') 'Tile size = ', tile_size else - write(*,'(a)') 'Untiled' + write(*,'(a10)') 'Tiling off' + endif + + ! ******************************************************************** + ! ** Allocate space for the input and perform the computation + ! ******************************************************************** + + allocate( A(n,n), B(n,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation returned ',err + stop 1 endif call initialize_w(is_star,r,W) @@ -324,15 +271,14 @@ program main enddo enddo !$omp end parallel do - + active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) !****************************************************************************** !* Analyze and output results. 
!****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! verify correctness reference_norm = real(iterations+1,REAL64) * (cx + cy); diff --git a/FORTRAN/stencil.F90 b/FORTRAN/stencil.F90 index b4c8cc711..49a6f9b1a 100644 --- a/FORTRAN/stencil.F90 +++ b/FORTRAN/stencil.F90 @@ -138,10 +138,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension @@ -183,7 +180,7 @@ program main write(*,'(a22,i8)') 'Grid size = ', n write(*,'(a22,i8)') 'Radius of stencil = ', r if (is_star) then - write(*,'(a22,a8)') 'Type of stencil = ','star' + write(*,'(a22,a8)') 'Type of stencil = ', 'star' stencil_size = 4*r+1 else write(*,'(a22,a8)') 'Type of stencil = ','grid' @@ -249,8 +246,7 @@ program main !* Analyze and output results. !****************************************************************************** - deallocate( B ) - deallocate( A ) + deallocate( A,B ) ! verify correctness reference_norm = real(iterations+1,REAL64) * (cx + cy); From b053f86f6c8fe9ec2813a6af7ca4ec9c6c2c26d2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 15:12:04 +0200 Subject: [PATCH 175/325] add CUF/GPU support --- FORTRAN/prk_mod.F90 | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index cd6224e5e..9b5e2a202 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -12,6 +12,7 @@ end function prk_get_wtime subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? iterations, & ! everything length, offset, & ! nstream + gpu_block_size, & ! nstream GPU only order, tile_size, & ! transpose, stencil, dgemm stencil, radius) ! not supported in implementations yet use iso_fortran_env @@ -19,6 +20,7 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? character(len=*), intent(in) :: kernel integer(kind=INT32), intent(out) :: iterations integer(kind=INT64), intent(out), optional :: length, offset ! nstream + integer(kind=INT32), intent(out), optional :: gpu_block_size ! nstream GPU only integer(kind=INT32), intent(out), optional :: order, tile_size ! transpose, stencil, dgemm integer(kind=INT32), intent(out), optional :: radius ! stencil character(len=4), intent(out), optional :: stencil ! stencil @@ -33,6 +35,9 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? if (present(offset)) then offset = 0 endif + if (present(gpu_block_size)) then + gpu_block_size = 256 + endif if (present(order)) then order = 1024 endif @@ -73,8 +78,13 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? 
if (argc.lt.2 ) then write(*,'(a17,i2)') 'argument count = ', command_argument_count() if (kernel(1:7).eq.'nstream') then - write(*,'(a62)') 'Old Usage: <# iterations> []' - write(*,'(a87)') 'New Usage: iterations=<# iterations> length= [offset=]' + if (present(gpu_block_size)) then + write(*,'(a62)') 'Old Usage: <# iterations> []' + write(*,'(a87)') 'New Usage: iterations=<# iterations> length= [block_size=]' + else + write(*,'(a62)') 'Old Usage: <# iterations> []' + write(*,'(a87)') 'New Usage: iterations=<# iterations> length= [offset=]' + endif else if ( (kernel(1:9).eq.'transpose') & .or.(kernel(1:7).eq.'stencil') & .or.(kernel(1:5).eq.'dgemm') ) then @@ -148,6 +158,14 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? read(argtmp(p+1:arglen),'(i1)') radius endif endif + ! look for gpu_block_size + if (present(gpu_block_size)) then + q = index(argtmp(1:p-1),"block") + if (q.ne.0) then + read(argtmp(p+1:arglen),'(i5)') gpu_block_size + endif + endif + ! end looking endif endif enddo From 9ebd7e8c0feb25b6fb6f423e0add0ca017934db4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 15:12:12 +0200 Subject: [PATCH 176/325] dealloc --- FORTRAN/transpose.F90 | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 8870133dd..13b345a03 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -121,7 +121,6 @@ program main t0 = 0 do k=0,iterations - if (k.eq.1) then t0 = prk_get_wtime() endif @@ -168,8 +167,7 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' From db92ee6f8046e58d6d432df5dfd665a5c753d2c5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Dec 2021 15:12:16 +0200 Subject: [PATCH 177/325] copyright --- FORTRAN/p2p.F90 | 1 + 1 file changed, 1 insertion(+) diff --git a/FORTRAN/p2p.F90 b/FORTRAN/p2p.F90 index 29d328a91..024b49764 100644 --- a/FORTRAN/p2p.F90 +++ b/FORTRAN/p2p.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! 
modification, are permitted provided that the following conditions

From 613cd88358bfadbbcef145a19681fdedde00e227 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Fri, 17 Dec 2021 12:57:38 +0200
Subject: [PATCH 178/325] add BLAS and MPI

---
 FORTRAN/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index bb4d5f97e..adcddef87 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -58,7 +58,7 @@ ifeq ($(findstring xlf,$(FC)),xlf)
   FCFLAGS += $(XLFPP)-DXLF
 endif
 
-all: serial pretty openmp tasks $(EXTRA)
+all: serial pretty openmp tasks blas mpi mpi-openmp $(EXTRA)
 
 serial: p2p stencil transpose nstream dgemm p2p-innerloop pic pic_soa
 

From 77467462ed95bb041c56d274d036195bb2b59248 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Fri, 17 Dec 2021 12:57:49 +0200
Subject: [PATCH 179/325] fix data use issues

---
 FORTRAN/nstream-mpi.F90 | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90
index f94ac2af4..e45735eee 100644
--- a/FORTRAN/nstream-mpi.F90
+++ b/FORTRAN/nstream-mpi.F90
@@ -133,11 +133,13 @@ program main
     call MPI_Abort(MPI_COMM_WORLD, 10)
   endif
 
+  scalar = 3
+
 #ifdef _OPENMP
-  !$omp parallel default(none) &
-  !$omp& shared(A,B,C,nstream_time) &
-  !$omp& firstprivate(length,iterations) &
-  !$omp& private(i,k,scalar,t0,t1)
+  !$omp parallel default(none) &
+  !$omp& shared(A,B,C,nstream_time) &
+  !$omp& firstprivate(length,iterations,scalar) &
+  !$omp& private(i,k,t0,t1)
 #endif
 
 #if defined(_OPENMP)
@@ -169,8 +171,6 @@ program main
   call MPI_Barrier(MPI_COMM_WORLD)
   !$omp end master
 
-  scalar = 3
-
   t0 = 0
 
   do k=0,iterations
@@ -209,6 +209,7 @@ program main
 #endif
   call MPI_Barrier(MPI_COMM_WORLD)
   t1 = MPI_Wtime()
+  nstream_time = t1 - t0
 #ifdef _OPENMP
   !$omp end master
 #endif
@@ -217,8 +218,6 @@ program main
   !$omp end parallel
 #endif
 
-  nstream_time = t1 - t0
-
   ! ********************************************************************
   ! ** Analyze and output results.
   ! ********************************************************************

From 49782d34ff3d0500e1dfef67232bb7478532576b Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Fri, 17 Dec 2021 13:10:48 +0200
Subject: [PATCH 180/325] stencil time init

---
 FORTRAN/stencil-openmp.F90 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/FORTRAN/stencil-openmp.F90 b/FORTRAN/stencil-openmp.F90
index 507fbd49c..267054d3f 100644
--- a/FORTRAN/stencil-openmp.F90
+++ b/FORTRAN/stencil-openmp.F90
@@ -257,6 +257,7 @@ program main
     !$omp barrier
     !$omp master
     t1 = omp_get_wtime()
+    stencil_time = t1 - t0
     !$omp end master
 
   ! compute L1 norm in parallel

From 149f2a98c6a564202bb8d1494b0f9aa5dd8eb0d0 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Fri, 17 Dec 2021 14:06:43 +0200
Subject: [PATCH 181/325] remove unused

---
 FORTRAN/dgemm-pretty.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FORTRAN/dgemm-pretty.F90 b/FORTRAN/dgemm-pretty.F90
index 25d4b6140..fac1a6250 100644
--- a/FORTRAN/dgemm-pretty.F90
+++ b/FORTRAN/dgemm-pretty.F90
@@ -70,7 +70,7 @@ program main
   real(kind=REAL64), allocatable :: C(:,:) ! buffer to hold output matrix
   integer(kind=INT64) :: nflops
   ! runtime variables
-  integer(kind=INT32) :: i,j,k
+  integer(kind=INT32) :: i,k
   real(kind=REAL64) :: checksum, reference, residuum
   real(kind=REAL64) :: t0, t1, dgemm_time, avgtime ! timing parameters
   real(kind=REAL64), parameter :: epsilon=1.0d-8 !
error tolerance From d944730095774e40c174adc19d68d632783dc2e1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:06:58 +0200 Subject: [PATCH 182/325] fix scalar scope --- FORTRAN/nstream-openmp.F90 | 8 ++++---- FORTRAN/nstream-taskloop-openmp.F90 | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/FORTRAN/nstream-openmp.F90 b/FORTRAN/nstream-openmp.F90 index f6bd84467..e98ebc0e6 100644 --- a/FORTRAN/nstream-openmp.F90 +++ b/FORTRAN/nstream-openmp.F90 @@ -108,10 +108,12 @@ program main stop 1 endif + scalar = 3 + !$omp parallel default(none) & - !$omp& shared(A,B,C,nstream_time) & + !$omp& shared(A,B,C,scalar,nstream_time) & !$omp& firstprivate(length,iterations,offset) & - !$omp& private(i,k,t0,t1,scalar) + !$omp& private(i,k,t0,t1) !$omp do do i=1,length @@ -121,8 +123,6 @@ program main enddo !$omp end do - scalar = 3 - t0 = 0 ! need this because otherwise no barrier between initialization diff --git a/FORTRAN/nstream-taskloop-openmp.F90 b/FORTRAN/nstream-taskloop-openmp.F90 index 2cce93824..64d8536fd 100644 --- a/FORTRAN/nstream-taskloop-openmp.F90 +++ b/FORTRAN/nstream-taskloop-openmp.F90 @@ -108,10 +108,12 @@ program main stop 1 endif + scalar = 3 + !$omp parallel default(none) & - !$omp& shared(A,B,C,nstream_time) & + !$omp& shared(A,B,C,scalar,nstream_time) & !$omp& firstprivate(length,iterations,offset) & - !$omp& private(i,k,t0,t1,scalar) + !$omp& private(i,k,t0,t1) !$omp master !$omp taskloop firstprivate(length,offset) shared(A,B,C) private(i) @@ -122,8 +124,6 @@ program main enddo !$omp end taskloop - scalar = 3 - t0 = 0 !$omp taskwait @@ -144,12 +144,12 @@ program main enddo ! iterations t1 = omp_get_wtime() + nstream_time = t1 - t0 + !$omp end master !$omp end parallel - nstream_time = t1 - t0 - ! ******************************************************************** ! ** Analyze and output results. ! 
******************************************************************** From 966fa63db460908abdbb745da4a32b9ff5598b42 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:07:08 +0200 Subject: [PATCH 183/325] fix issues --- FORTRAN/pic-openmp.F90 | 18 +++++++++++------- FORTRAN/pic.F90 | 18 +++++++++++------- FORTRAN/pic_soa-openmp.F90 | 18 +++++++++++------- FORTRAN/pic_soa.F90 | 18 +++++++++++------- 4 files changed, 44 insertions(+), 28 deletions(-) diff --git a/FORTRAN/pic-openmp.F90 b/FORTRAN/pic-openmp.F90 index 55dce51c4..6d9b0b6b7 100644 --- a/FORTRAN/pic-openmp.F90 +++ b/FORTRAN/pic-openmp.F90 @@ -62,7 +62,8 @@ program pic type(particle_t), allocatable, dimension(:) :: particles type(particle_t) :: part integer(kind=INT64) :: L, n, k, m, init_mode - integer(kind=INT64) :: ip, iterations, iter + integer(kind=INT32) :: iterations, iter + integer(kind=INT64) :: ip real(kind=REAL64) :: rho real(kind=REAL64) :: t0, pic_time character(len=32) :: argtmp @@ -145,16 +146,18 @@ program pic print *, 'Number of particles placed = ', n + t0 = 0 + block real(kind=REAL64) :: fx, fy, ax, ay - do iter = 0_INT64, iterations - if(iter == 1) then + do iter = 0, iterations + if(iter == 1) then #ifdef _OPENMP - t0 = omp_get_wtime() + t0 = omp_get_wtime() #else - t0 = prk_get_wtime() + t0 = prk_get_wtime() #endif - endif + endif !$omp parallel do private(part, fx, fy, ax, ay) do ip=1,n part = particles(ip) @@ -334,7 +337,8 @@ end subroutine computeCoulomb integer function verifyParticle(part, iterations, Qgrid, L) type(particle_t), intent(in) :: part - integer(kind=REAL64), intent(in) :: iterations, L + integer(kind=INT32), intent(in) :: iterations + integer(kind=REAL64), intent(in) :: L real(kind=REAL64), allocatable, dimension(:,:), intent(in) :: Qgrid integer(kind=INT64) :: x, y real(kind=REAL64) :: x_final, y_final, x_periodic, y_periodic, disp diff --git a/FORTRAN/pic.F90 b/FORTRAN/pic.F90 index 55dce51c4..6d9b0b6b7 100644 --- a/FORTRAN/pic.F90 +++ b/FORTRAN/pic.F90 @@ -62,7 +62,8 @@ program pic type(particle_t), allocatable, dimension(:) :: particles type(particle_t) :: part integer(kind=INT64) :: L, n, k, m, init_mode - integer(kind=INT64) :: ip, iterations, iter + integer(kind=INT32) :: iterations, iter + integer(kind=INT64) :: ip real(kind=REAL64) :: rho real(kind=REAL64) :: t0, pic_time character(len=32) :: argtmp @@ -145,16 +146,18 @@ program pic print *, 'Number of particles placed = ', n + t0 = 0 + block real(kind=REAL64) :: fx, fy, ax, ay - do iter = 0_INT64, iterations - if(iter == 1) then + do iter = 0, iterations + if(iter == 1) then #ifdef _OPENMP - t0 = omp_get_wtime() + t0 = omp_get_wtime() #else - t0 = prk_get_wtime() + t0 = prk_get_wtime() #endif - endif + endif !$omp parallel do private(part, fx, fy, ax, ay) do ip=1,n part = particles(ip) @@ -334,7 +337,8 @@ end subroutine computeCoulomb integer function verifyParticle(part, iterations, Qgrid, L) type(particle_t), intent(in) :: part - integer(kind=REAL64), intent(in) :: iterations, L + integer(kind=INT32), intent(in) :: iterations + integer(kind=REAL64), intent(in) :: L real(kind=REAL64), allocatable, dimension(:,:), intent(in) :: Qgrid integer(kind=INT64) :: x, y real(kind=REAL64) :: x_final, y_final, x_periodic, y_periodic, disp diff --git a/FORTRAN/pic_soa-openmp.F90 b/FORTRAN/pic_soa-openmp.F90 index cda3cc646..41f600e81 100644 --- a/FORTRAN/pic_soa-openmp.F90 +++ b/FORTRAN/pic_soa-openmp.F90 @@ -66,7 +66,8 @@ program pic real(kind=REAL64), allocatable, dimension(:,:) :: Qgrid type(particles_t):: particles 
integer(kind=INT64) :: L, n, k, m, init_mode - integer(kind=INT64) :: ip, iterations, iter + integer(kind=INT32) :: iterations, iter + integer(kind=INT64) :: ip real(kind=REAL64) :: rho real(kind=REAL64) :: t0, pic_time character(len=32) :: argtmp @@ -149,16 +150,18 @@ program pic print *, 'Number of particles placed = ', n + t0 = 0 + block real(kind=REAL64) :: fx, fy, ax, ay, xval, yval, vx, vy, qval - do iter = 0_INT64, iterations - if(iter == 1) then + do iter = 0, iterations + if(iter == 1) then #ifdef _OPENMP - t0 = omp_get_wtime() + t0 = omp_get_wtime() #else - t0 = prk_get_wtime() + t0 = prk_get_wtime() #endif - endif + endif !$omp parallel do private(xval, yval, vx, vy, qval, fx, fy, ax, ay) do ip=1,n xval = particles%x(ip) @@ -353,7 +356,8 @@ end subroutine computeCoulomb integer function verifyParticle(part, iterations, Qgrid, L) type(particle_t), intent(in) :: part - integer(kind=REAL64), intent(in) :: iterations, L + integer(kind=INT32), intent(in) :: iterations + integer(kind=REAL64), intent(in) :: L real(kind=REAL64), allocatable, dimension(:,:), intent(in) :: Qgrid integer(kind=INT64) :: x, y real(kind=REAL64) :: x_final, y_final, x_periodic, y_periodic, disp diff --git a/FORTRAN/pic_soa.F90 b/FORTRAN/pic_soa.F90 index cda3cc646..41f600e81 100644 --- a/FORTRAN/pic_soa.F90 +++ b/FORTRAN/pic_soa.F90 @@ -66,7 +66,8 @@ program pic real(kind=REAL64), allocatable, dimension(:,:) :: Qgrid type(particles_t):: particles integer(kind=INT64) :: L, n, k, m, init_mode - integer(kind=INT64) :: ip, iterations, iter + integer(kind=INT32) :: iterations, iter + integer(kind=INT64) :: ip real(kind=REAL64) :: rho real(kind=REAL64) :: t0, pic_time character(len=32) :: argtmp @@ -149,16 +150,18 @@ program pic print *, 'Number of particles placed = ', n + t0 = 0 + block real(kind=REAL64) :: fx, fy, ax, ay, xval, yval, vx, vy, qval - do iter = 0_INT64, iterations - if(iter == 1) then + do iter = 0, iterations + if(iter == 1) then #ifdef _OPENMP - t0 = omp_get_wtime() + t0 = omp_get_wtime() #else - t0 = prk_get_wtime() + t0 = prk_get_wtime() #endif - endif + endif !$omp parallel do private(xval, yval, vx, vy, qval, fx, fy, ax, ay) do ip=1,n xval = particles%x(ip) @@ -353,7 +356,8 @@ end subroutine computeCoulomb integer function verifyParticle(part, iterations, Qgrid, L) type(particle_t), intent(in) :: part - integer(kind=REAL64), intent(in) :: iterations, L + integer(kind=INT32), intent(in) :: iterations + integer(kind=REAL64), intent(in) :: L real(kind=REAL64), allocatable, dimension(:,:), intent(in) :: Qgrid integer(kind=INT64) :: x, y real(kind=REAL64) :: x_final, y_final, x_periodic, y_periodic, disp From a719890b287791e9bf8e97883d5f35d107808bd2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:07:23 +0200 Subject: [PATCH 184/325] remove unused --- FORTRAN/transpose-pretty.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/transpose-pretty.F90 b/FORTRAN/transpose-pretty.F90 index b2ba96d05..d40956c57 100644 --- a/FORTRAN/transpose-pretty.F90 +++ b/FORTRAN/transpose-pretty.F90 @@ -64,7 +64,7 @@ program main real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables - integer(kind=INT32) :: i,j,k + integer(kind=INT32) :: k integer(kind=INT64) :: j2, o2 ! for loop over order**2 real(kind=REAL64) :: abserr ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! 
timing parameters From c56809774938101f84b705e72d4a6246119d6f05 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:07:36 +0200 Subject: [PATCH 185/325] remove unused --- FORTRAN/stencil-openmp-target.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/stencil-openmp-target.F90 b/FORTRAN/stencil-openmp-target.F90 index 23a5a8868..f40415284 100644 --- a/FORTRAN/stencil-openmp-target.F90 +++ b/FORTRAN/stencil-openmp-target.F90 @@ -166,7 +166,6 @@ program main real(kind=REAL64), parameter :: cx=1.d0, cy=1.d0 ! runtime variables integer(kind=INT32) :: i, j, k - integer(kind=INT32) :: ii, jj, it, jt integer(kind=INT64) :: flops ! floating point ops per iteration real(kind=REAL64) :: norm, reference_norm ! L1 norm of solution integer(kind=INT64) :: active_points ! interior of grid with respect to stencil @@ -258,6 +257,7 @@ program main stencil_time = t1 - t0 + norm = 0 ! compute L1 norm in parallel !$omp parallel do collapse(2) & !$omp& default(none) shared(n,B) private(i,j) & From a312d56fe6c62770fd2d87b50a22528ad1528e36 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:08:14 +0200 Subject: [PATCH 186/325] fix bounds --- FORTRAN/generate-fortran-stencil.py | 16 ++-- FORTRAN/stencil-openmp.F90 | 14 +-- FORTRAN/stencil-taskloop-openmp.F90 | 16 ++-- FORTRAN/stencil_openmp.F90 | 144 ++++++++++++++-------------- FORTRAN/stencil_pretty.F90 | 72 +++++++------- FORTRAN/stencil_serial.F90 | 72 +++++++------- FORTRAN/stencil_target.F90 | 72 +++++++------- FORTRAN/stencil_taskloop.F90 | 72 +++++++------- 8 files changed, 239 insertions(+), 239 deletions(-) diff --git a/FORTRAN/generate-fortran-stencil.py b/FORTRAN/generate-fortran-stencil.py index aabf96dcb..5bd38826f 100755 --- a/FORTRAN/generate-fortran-stencil.py +++ b/FORTRAN/generate-fortran-stencil.py @@ -17,21 +17,21 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write('integer(kind=INT32) :: i,j\n') if (model=='openmp'): src.write(' !$omp do\n') - src.write(' do i='+str(radius)+',n-'+str(radius)+'-1\n') + src.write(' do i='+str(1+radius)+',n-'+str(radius)+'\n') src.write(' !$omp simd\n') - src.write(' do j='+str(radius)+',n-'+str(radius)+'-1\n') + src.write(' do j='+str(1+radius)+',n-'+str(radius)+'\n') if (model=='target'): src.write(' !$omp teams distribute parallel for simd collapse(2) schedule(static,1)\n') - src.write(' do i='+str(radius)+',n-'+str(radius)+'-1\n') - src.write(' do j='+str(radius)+',n-'+str(radius)+'-1\n') + src.write(' do i='+str(1+radius)+',n-'+str(radius)+'\n') + src.write(' do j='+str(1+radius)+',n-'+str(radius)+'\n') elif (model=='taskloop'): src.write(' !$omp taskloop\n') - src.write(' do i='+str(radius)+',n-'+str(radius)+'-1\n') + src.write(' do i='+str(1+radius)+',n-'+str(radius)+'\n') src.write(' !$omp simd\n') - src.write(' do j='+str(radius)+',n-'+str(radius)+'-1\n') + src.write(' do j='+str(1+radius)+',n-'+str(radius)+'\n') else: - src.write(' do i='+str(radius)+',n-'+str(radius)+'-1\n') - src.write(' do j='+str(radius)+',n-'+str(radius)+'-1\n') + src.write(' do i='+str(1+radius)+',n-'+str(radius)+'\n') + src.write(' do j='+str(1+radius)+',n-'+str(radius)+'\n') src.write(' out(i,j) = out(i,j) &\n') for j in range(0,2*radius+1): if j-radius>=0: opj='+' diff --git a/FORTRAN/stencil-openmp.F90 b/FORTRAN/stencil-openmp.F90 index 267054d3f..e7f51eec6 100644 --- a/FORTRAN/stencil-openmp.F90 +++ b/FORTRAN/stencil-openmp.F90 @@ -214,10 +214,9 @@ program main call initialize_w(is_star,r,W) - !$omp parallel default(none) & - !$omp& 
shared(n,A,B,W,t0,t1,iterations,tiling,tile_size,is_star) & - !$omp& private(i,j,k) & - !$omp& reduction(+:norm) + !$omp parallel default(none) & + !$omp& shared(n,A,B,W,stencil_time,iterations,tiling,tile_size,is_star) & + !$omp& private(i,j,k,t0,t1) ! intialize the input and output arrays !$omp do @@ -259,17 +258,18 @@ program main t1 = omp_get_wtime() stencil_time = t1 - t0 !$omp end master + !$omp end parallel ! compute L1 norm in parallel - !$omp do + norm = 0 + !$omp parallel do reduction(+:norm) do j=r,n-r do i=r,n-r norm = norm + abs(B(i,j)) enddo enddo - !$omp end do + !$omp end parallel do - !$omp end parallel active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) diff --git a/FORTRAN/stencil-taskloop-openmp.F90 b/FORTRAN/stencil-taskloop-openmp.F90 index de1d76501..65276a54e 100644 --- a/FORTRAN/stencil-taskloop-openmp.F90 +++ b/FORTRAN/stencil-taskloop-openmp.F90 @@ -215,9 +215,9 @@ program main call initialize_w(is_star,r,W) - !$omp parallel default(none) & - !$omp& shared(n,A,B,W,t0,t1,iterations,tiling,tile_size,is_star) & - !$omp& private(i,j,k) + !$omp parallel default(none) & + !$omp& shared(n,A,B,W,stencil_time,norm,iterations,tiling,tile_size,is_star) & + !$omp& private(i,j,k,t0,t1) !$omp master !$omp taskloop firstprivate(n) shared(A,B) @@ -257,20 +257,20 @@ program main enddo ! iterations t1 = omp_get_wtime() + stencil_time = t1 - t0 !$omp end master - !$omp end parallel - - stencil_time = t1 - t0 ! compute L1 norm in parallel - !$omp parallel do reduction(+:norm) + norm = 0.0d0 + !$omp do reduction(+:norm) do j=r,n-r do i=r,n-r norm = norm + abs(B(i,j)) enddo enddo - !$omp end parallel do + !$omp end do + !$omp end parallel active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) diff --git a/FORTRAN/stencil_openmp.F90 b/FORTRAN/stencil_openmp.F90 index 0b5ea36bd..01ad2178f 100644 --- a/FORTRAN/stencil_openmp.F90 +++ b/FORTRAN/stencil_openmp.F90 @@ -6,11 +6,11 @@ subroutine star1(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=1,n-1-1 + do i=2,n-1 !$omp simd - do j=1,n-1-1 - do i=1,n-1-1 - do j=1,n-1-1 + do j=2,n-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i+0,j-1) * (-0.5d0) & + in(i-1,j+0) * (-0.5d0) & @@ -31,11 +31,11 @@ subroutine star2(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=2,n-2-1 + do i=3,n-2 !$omp simd - do j=2,n-2-1 - do i=2,n-2-1 - do j=2,n-2-1 + do j=3,n-2 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i+0,j-2) * (-0.125d0) & + in(i+0,j-1) * (-0.25d0) & @@ -60,11 +60,11 @@ subroutine star3(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=3,n-3-1 + do i=4,n-3 !$omp simd - do j=3,n-3-1 - do i=3,n-3-1 - do j=3,n-3-1 + do j=4,n-3 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i+0,j-3) * (-0.05555555555555555d0) & + in(i+0,j-2) * (-0.08333333333333333d0) & @@ -93,11 +93,11 @@ subroutine star4(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=4,n-4-1 + do i=5,n-4 !$omp simd - do j=4,n-4-1 - do i=4,n-4-1 - do j=4,n-4-1 + do j=5,n-4 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i+0,j-4) * (-0.03125d0) & + in(i+0,j-3) * (-0.041666666666666664d0) & @@ -130,11 +130,11 @@ subroutine star5(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=5,n-5-1 + do i=6,n-5 !$omp simd - do j=5,n-5-1 - do i=5,n-5-1 - do j=5,n-5-1 + do j=6,n-5 + do i=6,n-5 
+ do j=6,n-5 out(i,j) = out(i,j) & + in(i+0,j-5) * (-0.02d0) & + in(i+0,j-4) * (-0.025d0) & @@ -171,11 +171,11 @@ subroutine star6(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=6,n-6-1 + do i=7,n-6 !$omp simd - do j=6,n-6-1 - do i=6,n-6-1 - do j=6,n-6-1 + do j=7,n-6 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i+0,j-6) * (-0.013888888888888888d0) & + in(i+0,j-5) * (-0.016666666666666666d0) & @@ -216,11 +216,11 @@ subroutine star7(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=7,n-7-1 + do i=8,n-7 !$omp simd - do j=7,n-7-1 - do i=7,n-7-1 - do j=7,n-7-1 + do j=8,n-7 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i+0,j-7) * (-0.01020408163265306d0) & + in(i+0,j-6) * (-0.011904761904761904d0) & @@ -265,11 +265,11 @@ subroutine star8(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=8,n-8-1 + do i=9,n-8 !$omp simd - do j=8,n-8-1 - do i=8,n-8-1 - do j=8,n-8-1 + do j=9,n-8 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i+0,j-8) * (-0.0078125d0) & + in(i+0,j-7) * (-0.008928571428571428d0) & @@ -318,11 +318,11 @@ subroutine star9(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=9,n-9-1 + do i=10,n-9 !$omp simd - do j=9,n-9-1 - do i=9,n-9-1 - do j=9,n-9-1 + do j=10,n-9 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i+0,j-9) * (-0.006172839506172839d0) & + in(i+0,j-8) * (-0.006944444444444444d0) & @@ -375,11 +375,11 @@ subroutine grid1(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=1,n-1-1 + do i=2,n-1 !$omp simd - do j=1,n-1-1 - do i=1,n-1-1 - do j=1,n-1-1 + do j=2,n-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i-1,j-1) * (-0.25d0) & + in(i+1,j-1) * (-0.25d0) & @@ -400,11 +400,11 @@ subroutine grid2(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=2,n-2-1 + do i=3,n-2 !$omp simd - do j=2,n-2-1 - do i=2,n-2-1 - do j=2,n-2-1 + do j=3,n-2 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i-2,j-2) * (-0.0625d0) & + in(i+1,j-2) * (-0.020833333333333332d0) & @@ -435,11 +435,11 @@ subroutine grid3(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=3,n-3-1 + do i=4,n-3 !$omp simd - do j=3,n-3-1 - do i=3,n-3-1 - do j=3,n-3-1 + do j=4,n-3 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i-3,j-3) * (-0.027777777777777776d0) & + in(i+1,j-3) * (-0.005555555555555556d0) & @@ -486,11 +486,11 @@ subroutine grid4(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=4,n-4-1 + do i=5,n-4 !$omp simd - do j=4,n-4-1 - do i=4,n-4-1 - do j=4,n-4-1 + do j=5,n-4 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i-4,j-4) * (-0.015625d0) & + in(i+1,j-4) * (-0.002232142857142857d0) & @@ -559,11 +559,11 @@ subroutine grid5(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=5,n-5-1 + do i=6,n-5 !$omp simd - do j=5,n-5-1 - do i=5,n-5-1 - do j=5,n-5-1 + do j=6,n-5 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i-5,j-5) * (-0.01d0) & + in(i+1,j-5) * (-0.0011111111111111111d0) & @@ -660,11 +660,11 @@ subroutine grid6(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=6,n-6-1 + do i=7,n-6 !$omp simd - do j=6,n-6-1 - do i=6,n-6-1 - do j=6,n-6-1 + do j=7,n-6 + do i=7,n-6 + do j=7,n-6 
out(i,j) = out(i,j) & + in(i-6,j-6) * (-0.006944444444444444d0) & + in(i+1,j-6) * (-0.0006313131313131314d0) & @@ -795,11 +795,11 @@ subroutine grid7(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=7,n-7-1 + do i=8,n-7 !$omp simd - do j=7,n-7-1 - do i=7,n-7-1 - do j=7,n-7-1 + do j=8,n-7 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i-7,j-7) * (-0.00510204081632653d0) & + in(i+1,j-7) * (-0.0003924646781789639d0) & @@ -970,11 +970,11 @@ subroutine grid8(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=8,n-8-1 + do i=9,n-8 !$omp simd - do j=8,n-8-1 - do i=8,n-8-1 - do j=8,n-8-1 + do j=9,n-8 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i-8,j-8) * (-0.00390625d0) & + in(i+1,j-8) * (-0.00026041666666666666d0) & @@ -1191,11 +1191,11 @@ subroutine grid9(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp do - do i=9,n-9-1 + do i=10,n-9 !$omp simd - do j=9,n-9-1 - do i=9,n-9-1 - do j=9,n-9-1 + do j=10,n-9 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i-9,j-9) * (-0.0030864197530864196d0) & + in(i+1,j-9) * (-0.00018155410312273057d0) & diff --git a/FORTRAN/stencil_pretty.F90 b/FORTRAN/stencil_pretty.F90 index cb4bf8052..60f64fd66 100644 --- a/FORTRAN/stencil_pretty.F90 +++ b/FORTRAN/stencil_pretty.F90 @@ -5,8 +5,8 @@ subroutine star1(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=1,n-1-1 - do j=1,n-1-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i+0,j-1) * (-0.5d0) & + in(i-1,j+0) * (-0.5d0) & @@ -24,8 +24,8 @@ subroutine star2(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=2,n-2-1 - do j=2,n-2-1 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i+0,j-2) * (-0.125d0) & + in(i+0,j-1) * (-0.25d0) & @@ -47,8 +47,8 @@ subroutine star3(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=3,n-3-1 - do j=3,n-3-1 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i+0,j-3) * (-0.05555555555555555d0) & + in(i+0,j-2) * (-0.08333333333333333d0) & @@ -74,8 +74,8 @@ subroutine star4(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=4,n-4-1 - do j=4,n-4-1 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i+0,j-4) * (-0.03125d0) & + in(i+0,j-3) * (-0.041666666666666664d0) & @@ -105,8 +105,8 @@ subroutine star5(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=5,n-5-1 - do j=5,n-5-1 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i+0,j-5) * (-0.02d0) & + in(i+0,j-4) * (-0.025d0) & @@ -140,8 +140,8 @@ subroutine star6(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=6,n-6-1 - do j=6,n-6-1 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i+0,j-6) * (-0.013888888888888888d0) & + in(i+0,j-5) * (-0.016666666666666666d0) & @@ -179,8 +179,8 @@ subroutine star7(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=7,n-7-1 - do j=7,n-7-1 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i+0,j-7) * (-0.01020408163265306d0) & + in(i+0,j-6) * (-0.011904761904761904d0) & @@ 
-222,8 +222,8 @@ subroutine star8(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=8,n-8-1 - do j=8,n-8-1 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i+0,j-8) * (-0.0078125d0) & + in(i+0,j-7) * (-0.008928571428571428d0) & @@ -269,8 +269,8 @@ subroutine star9(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=9,n-9-1 - do j=9,n-9-1 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i+0,j-9) * (-0.006172839506172839d0) & + in(i+0,j-8) * (-0.006944444444444444d0) & @@ -320,8 +320,8 @@ subroutine grid1(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=1,n-1-1 - do j=1,n-1-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i-1,j-1) * (-0.25d0) & + in(i+1,j-1) * (-0.25d0) & @@ -339,8 +339,8 @@ subroutine grid2(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=2,n-2-1 - do j=2,n-2-1 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i-2,j-2) * (-0.0625d0) & + in(i+1,j-2) * (-0.020833333333333332d0) & @@ -368,8 +368,8 @@ subroutine grid3(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=3,n-3-1 - do j=3,n-3-1 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i-3,j-3) * (-0.027777777777777776d0) & + in(i+1,j-3) * (-0.005555555555555556d0) & @@ -413,8 +413,8 @@ subroutine grid4(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=4,n-4-1 - do j=4,n-4-1 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i-4,j-4) * (-0.015625d0) & + in(i+1,j-4) * (-0.002232142857142857d0) & @@ -480,8 +480,8 @@ subroutine grid5(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=5,n-5-1 - do j=5,n-5-1 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i-5,j-5) * (-0.01d0) & + in(i+1,j-5) * (-0.0011111111111111111d0) & @@ -575,8 +575,8 @@ subroutine grid6(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=6,n-6-1 - do j=6,n-6-1 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i-6,j-6) * (-0.006944444444444444d0) & + in(i+1,j-6) * (-0.0006313131313131314d0) & @@ -704,8 +704,8 @@ subroutine grid7(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=7,n-7-1 - do j=7,n-7-1 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i-7,j-7) * (-0.00510204081632653d0) & + in(i+1,j-7) * (-0.0003924646781789639d0) & @@ -873,8 +873,8 @@ subroutine grid8(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=8,n-8-1 - do j=8,n-8-1 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i-8,j-8) * (-0.00390625d0) & + in(i+1,j-8) * (-0.00026041666666666666d0) & @@ -1088,8 +1088,8 @@ subroutine grid9(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=9,n-9-1 - do j=9,n-9-1 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i-9,j-9) * (-0.0030864197530864196d0) & + in(i+1,j-9) * (-0.00018155410312273057d0) & diff --git 
a/FORTRAN/stencil_serial.F90 b/FORTRAN/stencil_serial.F90 index cb4bf8052..60f64fd66 100644 --- a/FORTRAN/stencil_serial.F90 +++ b/FORTRAN/stencil_serial.F90 @@ -5,8 +5,8 @@ subroutine star1(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=1,n-1-1 - do j=1,n-1-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i+0,j-1) * (-0.5d0) & + in(i-1,j+0) * (-0.5d0) & @@ -24,8 +24,8 @@ subroutine star2(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=2,n-2-1 - do j=2,n-2-1 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i+0,j-2) * (-0.125d0) & + in(i+0,j-1) * (-0.25d0) & @@ -47,8 +47,8 @@ subroutine star3(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=3,n-3-1 - do j=3,n-3-1 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i+0,j-3) * (-0.05555555555555555d0) & + in(i+0,j-2) * (-0.08333333333333333d0) & @@ -74,8 +74,8 @@ subroutine star4(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=4,n-4-1 - do j=4,n-4-1 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i+0,j-4) * (-0.03125d0) & + in(i+0,j-3) * (-0.041666666666666664d0) & @@ -105,8 +105,8 @@ subroutine star5(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=5,n-5-1 - do j=5,n-5-1 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i+0,j-5) * (-0.02d0) & + in(i+0,j-4) * (-0.025d0) & @@ -140,8 +140,8 @@ subroutine star6(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=6,n-6-1 - do j=6,n-6-1 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i+0,j-6) * (-0.013888888888888888d0) & + in(i+0,j-5) * (-0.016666666666666666d0) & @@ -179,8 +179,8 @@ subroutine star7(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=7,n-7-1 - do j=7,n-7-1 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i+0,j-7) * (-0.01020408163265306d0) & + in(i+0,j-6) * (-0.011904761904761904d0) & @@ -222,8 +222,8 @@ subroutine star8(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=8,n-8-1 - do j=8,n-8-1 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i+0,j-8) * (-0.0078125d0) & + in(i+0,j-7) * (-0.008928571428571428d0) & @@ -269,8 +269,8 @@ subroutine star9(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=9,n-9-1 - do j=9,n-9-1 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i+0,j-9) * (-0.006172839506172839d0) & + in(i+0,j-8) * (-0.006944444444444444d0) & @@ -320,8 +320,8 @@ subroutine grid1(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=1,n-1-1 - do j=1,n-1-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i-1,j-1) * (-0.25d0) & + in(i+1,j-1) * (-0.25d0) & @@ -339,8 +339,8 @@ subroutine grid2(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=2,n-2-1 - do j=2,n-2-1 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i-2,j-2) * (-0.0625d0) & + 
in(i+1,j-2) * (-0.020833333333333332d0) & @@ -368,8 +368,8 @@ subroutine grid3(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=3,n-3-1 - do j=3,n-3-1 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i-3,j-3) * (-0.027777777777777776d0) & + in(i+1,j-3) * (-0.005555555555555556d0) & @@ -413,8 +413,8 @@ subroutine grid4(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=4,n-4-1 - do j=4,n-4-1 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i-4,j-4) * (-0.015625d0) & + in(i+1,j-4) * (-0.002232142857142857d0) & @@ -480,8 +480,8 @@ subroutine grid5(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=5,n-5-1 - do j=5,n-5-1 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i-5,j-5) * (-0.01d0) & + in(i+1,j-5) * (-0.0011111111111111111d0) & @@ -575,8 +575,8 @@ subroutine grid6(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=6,n-6-1 - do j=6,n-6-1 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i-6,j-6) * (-0.006944444444444444d0) & + in(i+1,j-6) * (-0.0006313131313131314d0) & @@ -704,8 +704,8 @@ subroutine grid7(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=7,n-7-1 - do j=7,n-7-1 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i-7,j-7) * (-0.00510204081632653d0) & + in(i+1,j-7) * (-0.0003924646781789639d0) & @@ -873,8 +873,8 @@ subroutine grid8(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=8,n-8-1 - do j=8,n-8-1 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i-8,j-8) * (-0.00390625d0) & + in(i+1,j-8) * (-0.00026041666666666666d0) & @@ -1088,8 +1088,8 @@ subroutine grid9(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - do i=9,n-9-1 - do j=9,n-9-1 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i-9,j-9) * (-0.0030864197530864196d0) & + in(i+1,j-9) * (-0.00018155410312273057d0) & diff --git a/FORTRAN/stencil_target.F90 b/FORTRAN/stencil_target.F90 index f2c3b7785..10072a7cb 100644 --- a/FORTRAN/stencil_target.F90 +++ b/FORTRAN/stencil_target.F90 @@ -7,8 +7,8 @@ subroutine star1(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=1,n-1-1 - do j=1,n-1-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i+0,j-1) * (-0.5d0) & + in(i-1,j+0) * (-0.5d0) & @@ -30,8 +30,8 @@ subroutine star2(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=2,n-2-1 - do j=2,n-2-1 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i+0,j-2) * (-0.125d0) & + in(i+0,j-1) * (-0.25d0) & @@ -57,8 +57,8 @@ subroutine star3(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=3,n-3-1 - do j=3,n-3-1 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i+0,j-3) * (-0.05555555555555555d0) & + in(i+0,j-2) * (-0.08333333333333333d0) & @@ -88,8 +88,8 @@ subroutine star4(n, in, out) real(kind=REAL64), 
intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=4,n-4-1 - do j=4,n-4-1 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i+0,j-4) * (-0.03125d0) & + in(i+0,j-3) * (-0.041666666666666664d0) & @@ -123,8 +123,8 @@ subroutine star5(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=5,n-5-1 - do j=5,n-5-1 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i+0,j-5) * (-0.02d0) & + in(i+0,j-4) * (-0.025d0) & @@ -162,8 +162,8 @@ subroutine star6(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=6,n-6-1 - do j=6,n-6-1 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i+0,j-6) * (-0.013888888888888888d0) & + in(i+0,j-5) * (-0.016666666666666666d0) & @@ -205,8 +205,8 @@ subroutine star7(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=7,n-7-1 - do j=7,n-7-1 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i+0,j-7) * (-0.01020408163265306d0) & + in(i+0,j-6) * (-0.011904761904761904d0) & @@ -252,8 +252,8 @@ subroutine star8(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=8,n-8-1 - do j=8,n-8-1 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i+0,j-8) * (-0.0078125d0) & + in(i+0,j-7) * (-0.008928571428571428d0) & @@ -303,8 +303,8 @@ subroutine star9(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=9,n-9-1 - do j=9,n-9-1 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i+0,j-9) * (-0.006172839506172839d0) & + in(i+0,j-8) * (-0.006944444444444444d0) & @@ -358,8 +358,8 @@ subroutine grid1(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=1,n-1-1 - do j=1,n-1-1 + do i=2,n-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i-1,j-1) * (-0.25d0) & + in(i+1,j-1) * (-0.25d0) & @@ -381,8 +381,8 @@ subroutine grid2(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=2,n-2-1 - do j=2,n-2-1 + do i=3,n-2 + do j=3,n-2 out(i,j) = out(i,j) & + in(i-2,j-2) * (-0.0625d0) & + in(i+1,j-2) * (-0.020833333333333332d0) & @@ -414,8 +414,8 @@ subroutine grid3(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=3,n-3-1 - do j=3,n-3-1 + do i=4,n-3 + do j=4,n-3 out(i,j) = out(i,j) & + in(i-3,j-3) * (-0.027777777777777776d0) & + in(i+1,j-3) * (-0.005555555555555556d0) & @@ -463,8 +463,8 @@ subroutine grid4(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=4,n-4-1 - do j=4,n-4-1 + do i=5,n-4 + do j=5,n-4 out(i,j) = out(i,j) & + in(i-4,j-4) * (-0.015625d0) & + in(i+1,j-4) * (-0.002232142857142857d0) & @@ -534,8 +534,8 @@ subroutine grid5(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute 
parallel for simd collapse(2) schedule(static,1) - do i=5,n-5-1 - do j=5,n-5-1 + do i=6,n-5 + do j=6,n-5 out(i,j) = out(i,j) & + in(i-5,j-5) * (-0.01d0) & + in(i+1,j-5) * (-0.0011111111111111111d0) & @@ -633,8 +633,8 @@ subroutine grid6(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=6,n-6-1 - do j=6,n-6-1 + do i=7,n-6 + do j=7,n-6 out(i,j) = out(i,j) & + in(i-6,j-6) * (-0.006944444444444444d0) & + in(i+1,j-6) * (-0.0006313131313131314d0) & @@ -766,8 +766,8 @@ subroutine grid7(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=7,n-7-1 - do j=7,n-7-1 + do i=8,n-7 + do j=8,n-7 out(i,j) = out(i,j) & + in(i-7,j-7) * (-0.00510204081632653d0) & + in(i+1,j-7) * (-0.0003924646781789639d0) & @@ -939,8 +939,8 @@ subroutine grid8(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=8,n-8-1 - do j=8,n-8-1 + do i=9,n-8 + do j=9,n-8 out(i,j) = out(i,j) & + in(i-8,j-8) * (-0.00390625d0) & + in(i+1,j-8) * (-0.00026041666666666666d0) & @@ -1158,8 +1158,8 @@ subroutine grid9(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp teams distribute parallel for simd collapse(2) schedule(static,1) - do i=9,n-9-1 - do j=9,n-9-1 + do i=10,n-9 + do j=10,n-9 out(i,j) = out(i,j) & + in(i-9,j-9) * (-0.0030864197530864196d0) & + in(i+1,j-9) * (-0.00018155410312273057d0) & diff --git a/FORTRAN/stencil_taskloop.F90 b/FORTRAN/stencil_taskloop.F90 index 77735c322..d0783491a 100644 --- a/FORTRAN/stencil_taskloop.F90 +++ b/FORTRAN/stencil_taskloop.F90 @@ -6,9 +6,9 @@ subroutine star1(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=1,n-1-1 + do i=2,n-1 !$omp simd - do j=1,n-1-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i+0,j-1) * (-0.5d0) & + in(i-1,j+0) * (-0.5d0) & @@ -29,9 +29,9 @@ subroutine star2(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=2,n-2-1 + do i=3,n-2 !$omp simd - do j=2,n-2-1 + do j=3,n-2 out(i,j) = out(i,j) & + in(i+0,j-2) * (-0.125d0) & + in(i+0,j-1) * (-0.25d0) & @@ -56,9 +56,9 @@ subroutine star3(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=3,n-3-1 + do i=4,n-3 !$omp simd - do j=3,n-3-1 + do j=4,n-3 out(i,j) = out(i,j) & + in(i+0,j-3) * (-0.05555555555555555d0) & + in(i+0,j-2) * (-0.08333333333333333d0) & @@ -87,9 +87,9 @@ subroutine star4(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=4,n-4-1 + do i=5,n-4 !$omp simd - do j=4,n-4-1 + do j=5,n-4 out(i,j) = out(i,j) & + in(i+0,j-4) * (-0.03125d0) & + in(i+0,j-3) * (-0.041666666666666664d0) & @@ -122,9 +122,9 @@ subroutine star5(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=5,n-5-1 + do i=6,n-5 !$omp simd - do j=5,n-5-1 + do j=6,n-5 out(i,j) = out(i,j) & + in(i+0,j-5) * (-0.02d0) & + in(i+0,j-4) * (-0.025d0) & @@ -161,9 +161,9 @@ subroutine star6(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=6,n-6-1 + do i=7,n-6 !$omp simd - do j=6,n-6-1 + do j=7,n-6 out(i,j) = out(i,j) & + in(i+0,j-6) * (-0.013888888888888888d0) & + in(i+0,j-5) * 
(-0.016666666666666666d0) & @@ -204,9 +204,9 @@ subroutine star7(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=7,n-7-1 + do i=8,n-7 !$omp simd - do j=7,n-7-1 + do j=8,n-7 out(i,j) = out(i,j) & + in(i+0,j-7) * (-0.01020408163265306d0) & + in(i+0,j-6) * (-0.011904761904761904d0) & @@ -251,9 +251,9 @@ subroutine star8(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=8,n-8-1 + do i=9,n-8 !$omp simd - do j=8,n-8-1 + do j=9,n-8 out(i,j) = out(i,j) & + in(i+0,j-8) * (-0.0078125d0) & + in(i+0,j-7) * (-0.008928571428571428d0) & @@ -302,9 +302,9 @@ subroutine star9(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=9,n-9-1 + do i=10,n-9 !$omp simd - do j=9,n-9-1 + do j=10,n-9 out(i,j) = out(i,j) & + in(i+0,j-9) * (-0.006172839506172839d0) & + in(i+0,j-8) * (-0.006944444444444444d0) & @@ -357,9 +357,9 @@ subroutine grid1(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=1,n-1-1 + do i=2,n-1 !$omp simd - do j=1,n-1-1 + do j=2,n-1 out(i,j) = out(i,j) & + in(i-1,j-1) * (-0.25d0) & + in(i+1,j-1) * (-0.25d0) & @@ -380,9 +380,9 @@ subroutine grid2(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=2,n-2-1 + do i=3,n-2 !$omp simd - do j=2,n-2-1 + do j=3,n-2 out(i,j) = out(i,j) & + in(i-2,j-2) * (-0.0625d0) & + in(i+1,j-2) * (-0.020833333333333332d0) & @@ -413,9 +413,9 @@ subroutine grid3(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=3,n-3-1 + do i=4,n-3 !$omp simd - do j=3,n-3-1 + do j=4,n-3 out(i,j) = out(i,j) & + in(i-3,j-3) * (-0.027777777777777776d0) & + in(i+1,j-3) * (-0.005555555555555556d0) & @@ -462,9 +462,9 @@ subroutine grid4(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=4,n-4-1 + do i=5,n-4 !$omp simd - do j=4,n-4-1 + do j=5,n-4 out(i,j) = out(i,j) & + in(i-4,j-4) * (-0.015625d0) & + in(i+1,j-4) * (-0.002232142857142857d0) & @@ -533,9 +533,9 @@ subroutine grid5(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=5,n-5-1 + do i=6,n-5 !$omp simd - do j=5,n-5-1 + do j=6,n-5 out(i,j) = out(i,j) & + in(i-5,j-5) * (-0.01d0) & + in(i+1,j-5) * (-0.0011111111111111111d0) & @@ -632,9 +632,9 @@ subroutine grid6(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=6,n-6-1 + do i=7,n-6 !$omp simd - do j=6,n-6-1 + do j=7,n-6 out(i,j) = out(i,j) & + in(i-6,j-6) * (-0.006944444444444444d0) & + in(i+1,j-6) * (-0.0006313131313131314d0) & @@ -765,9 +765,9 @@ subroutine grid7(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=7,n-7-1 + do i=8,n-7 !$omp simd - do j=7,n-7-1 + do j=8,n-7 out(i,j) = out(i,j) & + in(i-7,j-7) * (-0.00510204081632653d0) & + in(i+1,j-7) * (-0.0003924646781789639d0) & @@ -938,9 +938,9 @@ subroutine grid8(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=8,n-8-1 + do i=9,n-8 !$omp simd - do j=8,n-8-1 + do j=9,n-8 out(i,j) = out(i,j) & + in(i-8,j-8) * (-0.00390625d0) & + in(i+1,j-8) * (-0.00026041666666666666d0) & @@ -1157,9 +1157,9 @@ subroutine grid9(n, in, out) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j !$omp taskloop - do i=9,n-9-1 + do i=10,n-9 !$omp 
simd - do j=9,n-9-1 + do j=10,n-9 out(i,j) = out(i,j) & + in(i-9,j-9) * (-0.0030864197530864196d0) & + in(i+1,j-9) * (-0.00018155410312273057d0) & From e3b67968a8a6c8976050ef874d2cba47aaf1e693 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:41:41 +0200 Subject: [PATCH 187/325] p2p parse --- FORTRAN/p2p.F90 | 53 ++----------------- FORTRAN/prk_mod.F90 | 125 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 120 insertions(+), 58 deletions(-) diff --git a/FORTRAN/p2p.F90 b/FORTRAN/p2p.F90 index 024b49764..15e7fc896 100644 --- a/FORTRAN/p2p.F90 +++ b/FORTRAN/p2p.F90 @@ -74,10 +74,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: m, n @@ -98,55 +95,14 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran Serial pipeline execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a39)') 'Usage: ./synch_p2p <# iterations> ', & - ' ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = m - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - mc = m - call get_command_argument(4,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') mc - - nc = n - call get_command_argument(5,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') nc - endif - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 - endif - - if (((mc.lt.1).or.(mc.gt.m)).or.((mc.lt.1).or.(mc.gt.m))) then - write(*,'(a,i5)') 'WARNING: chunking invalid - ignoring' - mc = m - nc = n - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n,tilex=mc,tiley=nc) chunk = ((mc/=m).or.(nc/=n)) write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8,i8)') 'Grid sizes = ', m, n - write(*,'(a,i8,i8)') 'Size of chunking = ', mc, nc + if (chunk) then + write(*,'(a,i8,i8)') 'Size of chunking = ', mc, nc + endif allocate( grid(m,n), stat=err) if (err .ne. 0) then @@ -170,7 +126,6 @@ program main do k=0,iterations - ! start timer after a warmup iteration if (k.eq.1) t0 = prk_get_wtime() if (chunk) then diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 9b5e2a202..26021839f 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -14,6 +14,8 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? length, offset, & ! nstream gpu_block_size, & ! nstream GPU only order, tile_size, & ! transpose, stencil, dgemm + dimx, dimy, & ! p2p + tilex, tiley, & ! p2p stencil, radius) ! not supported in implementations yet use iso_fortran_env implicit none @@ -22,9 +24,12 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? integer(kind=INT64), intent(out), optional :: length, offset ! nstream integer(kind=INT32), intent(out), optional :: gpu_block_size ! nstream GPU only integer(kind=INT32), intent(out), optional :: order, tile_size ! 
transpose, stencil, dgemm + integer(kind=INT32), intent(out), optional :: dimx, dimy ! p2p + integer(kind=INT32), intent(out), optional :: tilex, tiley ! p2p integer(kind=INT32), intent(out), optional :: radius ! stencil character(len=4), intent(out), optional :: stencil ! stencil + integer(kind=INT32), parameter :: deadbeef = -559038737 ! 3735928559 as int32 integer :: argc,arglen,err,a,p,q character(len=64) :: argtmp @@ -50,24 +55,37 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? if (present(radius)) then radius = 2 endif + if (present(dimx)) then + dimx = 1024 + endif + if (present(dimy)) then + dimy = deadbeef + endif + if (present(tilex)) then + tilex = 0 + endif + if (present(tiley)) then + tiley = deadbeef + endif #ifndef PRK_NO_ARGUMENTS if (kernel(1:7).eq.'nstream') then - if (present(length)) then - length = 0 - else + if (.not.present(length)) then print*,'You cannot parse nstream arguments without length' stop endif else if ( (kernel(1:9).eq.'transpose') & .or.(kernel(1:7).eq.'stencil') & .or.(kernel(1:5).eq.'dgemm') ) then - if (present(order)) then - order = 0 - else + if (.not.present(order)) then print*,'You cannot parse ',kernel,' arguments without order' stop endif + else if (kernel(1:3).eq.'p2p') then + if (.not.present(dimx)) then + print*,'You cannot parse ',kernel,' arguments without dimx' + stop + endif else print*,kernel,'is not supported yet' stop @@ -80,7 +98,8 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? if (kernel(1:7).eq.'nstream') then if (present(gpu_block_size)) then write(*,'(a62)') 'Old Usage: <# iterations> []' - write(*,'(a87)') 'New Usage: iterations=<# iterations> length= [block_size=]' + write(*,'(a87)') 'New Usage: iterations=<# iterations> length=', & + '[block_size=]' else write(*,'(a62)') 'Old Usage: <# iterations> []' write(*,'(a87)') 'New Usage: iterations=<# iterations> length= [offset=]' @@ -90,6 +109,18 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? .or.(kernel(1:5).eq.'dgemm') ) then write(*,'(a57)') 'Old Usage: <# iterations> []' write(*,'(a84)') 'New Usage: iterations=<# iterations> order= [tile_size=]' + else if (kernel(1:3).eq.'p2p') then + if (present(dimy)) then + write(*,'(a75)') 'Old Usage: <# iterations> ' + write(*,'(a57)') ' [ ]' + write(*,'(a46)') 'New Usage: iterations=<# iterations>' + write(*,'(a61)') ' dimx= dimy=' + write(*,'(a69)') ' [tilex= tiley=]' + else + write(*,'(a57)') 'Old Usage: <# iterations> []' + write(*,'(a84)') 'New Usage: iterations=<# iterations> dimx=', & + '[tilex=]' + endif endif STOP endif @@ -106,13 +137,27 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? read(argtmp,'(i15)') length else if (present(order)) then read(argtmp,'(i7)') order + else if (present(dimx)) then + read(argtmp,'(i7)') dimx endif else if (a.eq.3) then if (present(offset)) then read(argtmp,'(i15)') offset - endif - if (present(tile_size)) then + else if (present(tile_size)) then read(argtmp,'(i3)') tile_size + else if (present(dimy)) then + read(argtmp,'(i7)') dimy + else if (.not.present(dimy).and.present(tilex)) then + read(argtmp,'(i7)') tilex + endif + elseif (a.eq.4) then + if (present(dimx).and.present(dimy).and.present(tilex)) then + read(argtmp,'(i7)') tilex + endif + elseif (a.eq.5) then + if (present(dimx).and.present(dimy).and. & + present(tilex).and.present(tiley)) then + read(argtmp,'(i7)') tiley endif else print*,'too many positional arguments:',argc @@ -165,6 +210,34 @@ subroutine prk_get_arguments(kernel, & ! 
which kernel am i parsing? read(argtmp(p+1:arglen),'(i5)') gpu_block_size endif endif + ! look for dimx + if (present(dimx)) then + q = index(argtmp(1:p-1),"dimx") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i7)') dimx + endif + ! look for tilex + if (present(tilex)) then + q = index(argtmp(1:p-1),"tilex") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i3)') tilex + endif + endif + ! look for dimy + if (present(dimy)) then + q = index(argtmp(1:p-1),"dimy") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i7)') dimy + endif + ! look for tiley + if (present(tiley)) then + q = index(argtmp(1:p-1),"tiley") + if (q.eq.1) then + read(argtmp(p+1:arglen),'(i3)') tiley + endif + endif + endif + endif ! end looking endif endif @@ -211,6 +284,40 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? endif endif endif + + ! p2p + if (present(dimx)) then + if (dimx.lt.1) then + write(*,'(a,i7)') 'ERROR: dimx must be positive : ', dimx + stop 1 + endif + if (present(tilex)) then + if ((tilex.lt.1).or.(tilex.gt.dimx)) then + write(*,'(a,i7)') 'WARNING: tilex invalid - ignoring' + tilex = dimx + endif + endif + if (present(dimy)) then + ! user did not provide it, so we assume square array + if (dimy.eq.deadbeef) then + dimy = dimx + endif + if (dimy.lt.1) then + write(*,'(a,i7)') 'ERROR: dimy must be positive : ', dimy + stop 1 + endif + if (present(tiley)) then + ! user did not provide it, so we assume square array + if (tiley.eq.deadbeef) then + tiley = tilex + endif + if ((tiley.lt.1).or.(tiley.gt.dimy)) then + write(*,'(a,i7)') 'WARNING: tiley invalid - ignoring' + tiley = dimy + endif + endif + endif + endif #endif end subroutine From 1a27165125691f4f1ca8fb6530bd4fdf2022fba3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 17 Dec 2021 14:59:14 +0200 Subject: [PATCH 188/325] p2p-* input parse --- FORTRAN/p2p-async-openacc.F90 | 53 ++++------------------------- FORTRAN/p2p-coarray.F90 | 51 +++++----------------------- FORTRAN/p2p-doacross-openmp.F90 | 40 ++++------------------ FORTRAN/p2p-innerloop-openacc.F90 | 36 +++----------------- FORTRAN/p2p-innerloop-openmp.F90 | 40 +++++----------------- FORTRAN/p2p-innerloop.F90 | 35 +++----------------- FORTRAN/p2p-openacc.F90 | 39 +++------------------- FORTRAN/p2p-openmp-target.F90 | 34 ++----------------- FORTRAN/p2p-tasks-openmp.F90 | 55 +++++-------------------------- FORTRAN/p2p.F90 | 8 ++--- 10 files changed, 57 insertions(+), 334 deletions(-) diff --git a/FORTRAN/p2p-async-openacc.F90 b/FORTRAN/p2p-async-openacc.F90 index 8104f6b9e..2f4f94de6 100644 --- a/FORTRAN/p2p-async-openacc.F90 +++ b/FORTRAN/p2p-async-openacc.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -56,7 +57,6 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) use iso_fortran_env - use prk implicit none integer(kind=INT32), intent(in) :: m,n integer(kind=INT32), intent(in) :: startm,endm @@ -74,11 +74,9 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) program main use iso_fortran_env + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! 
number of times to run the pipeline algorithm integer(kind=INT32) :: m, n @@ -99,45 +97,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a52)') 'Fortran ORNL-ACC TASKS pipeline execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,2a39)') 'Usage: ./synch_p2p <# iterations> ', & - ' ', & - ' ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = m - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - mc = m - call get_command_argument(4,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') mc - - nc = n - call get_command_argument(5,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') nc - endif - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n,tilex=mc,tiley=nc) ! mc=m or nc=n disables chunking in that dimension, which means ! there is no task parallelism to exploit @@ -148,9 +108,9 @@ program main mc = max(1,mc) nc = max(1,nc) - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', m, n - write(*,'(a,i8,i8)') 'Size of chunking = ', mc, nc + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a27,i8,i8)') 'Size of chunking = ', mc, nc allocate( grid(m,n), stat=err) if (err .ne. 0) then @@ -191,7 +151,6 @@ program main enddo !$acc async(grid(1,1)) wait(grid(lic,ljc)) grid(1,1) = -grid(m,n) - enddo t1 = prk_get_wtime() diff --git a/FORTRAN/p2p-coarray.F90 b/FORTRAN/p2p-coarray.F90 index 2766b6291..664b7ce9c 100644 --- a/FORTRAN/p2p-coarray.F90 +++ b/FORTRAN/p2p-coarray.F90 @@ -61,17 +61,14 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: m, n, m_local, max_m_local real(kind=REAL64) :: corner_val ! verification value at top right corner of grid real(kind=REAL64), allocatable :: grid(:,:)[:] ! array holding grid values ! runtime variables - integer(kind=INT32) :: i, j, k + integer(kind=INT32) :: i, j, k integer :: me, np, prev, next !, stat real(kind=REAL64) :: t0, t1, pipeline_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -85,41 +82,11 @@ program main np = num_images() ! co_broadcast is part of Fortran 2015, so we will not assume it yet. 
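! [Editorial note - illustrative sketch, not part of the patch] With the
! prk_get_arguments interface added in the previous patch, each p2p variant in
! this patch replaces its hand-rolled command-line parsing with a call of the form
      use prk
      integer(kind=INT32) :: iterations, m, n, mc, nc
      call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n,tilex=mc,tiley=nc)
! The parser accepts both the old positional form (iterations, x dimension,
! then optional y dimension and tile sizes) and the new keyword form, e.g.
!      ./p2p iterations=10 dimx=1000 dimy=1000 tilex=64 tiley=64
! where the example values are illustrative; dimy and tiley fall back to square
! defaults through the deadbeef sentinel when they are not supplied.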
- if(me == 1) then - write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a45)') 'Fortran coarray pipeline execution on 2D grid' - endif - - if (command_argument_count().lt.3) then - if(me == 1) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a39)') 'Usage: ./synch_p2p <# iterations> ', & - ' ' - endif - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = 1 - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 + if (me == 1) then + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a45)') 'Fortran coarray pipeline execution on 2D grid' endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n) ! co_max is part of Fortran 2015, so we will not assume it. This is present ! in OpenCoarrays and has been for a while, when used with GFortran >= 6. @@ -132,14 +99,14 @@ program main allocate( grid(max_m_local,n)[*], stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of grid returned ',err + write(*,'(a22,i3)') 'allocation of grid returned ',err stop 1 endif if(me == 1) then - write(*,'(a,i8)') 'Number of threads = ', num_images() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a27,i8)') 'Number of threads = ', num_images() + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', m, n endif do j=1,n diff --git a/FORTRAN/p2p-doacross-openmp.F90 b/FORTRAN/p2p-doacross-openmp.F90 index 6feba9475..e2c494b3e 100644 --- a/FORTRAN/p2p-doacross-openmp.F90 +++ b/FORTRAN/p2p-doacross-openmp.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -57,11 +58,9 @@ program main use iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: m, n @@ -79,38 +78,11 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran OpenMP pipeline execution on 2D grid' - if (command_argument_count().lt.3) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a38)') 'Usage: ./synch_p2p <# iterations> ', & - ' ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = 1 - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 
1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n,tilex=mc,tiley=nc) - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a27,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', m, n allocate( grid(m,n), stat=err) if (err .ne. 0) then diff --git a/FORTRAN/p2p-innerloop-openacc.F90 b/FORTRAN/p2p-innerloop-openacc.F90 index 4c670ca77..c662ed911 100644 --- a/FORTRAN/p2p-innerloop-openacc.F90 +++ b/FORTRAN/p2p-innerloop-openacc.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -58,10 +59,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n @@ -80,20 +78,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a55)') 'Fortran OpenACC INNERLOOP pipeline execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ', & - '' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n + call prk_get_arguments('p2p',iterations=iterations,dimx=n) if (n .gt. 16384) then write(*,'(a,i5)') 'WARNING: grid size exceeds 16384: ', n @@ -101,21 +86,8 @@ program main write(*,'(a)') 'unless you compiled with -Mlarge_arrays.' endif - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if (n .lt. 1) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', n - stop 1 - endif - -#ifdef _OPENMP - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() -#endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', n, n + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', n, n allocate( grid(n,n), stat=err) if (err .ne. 0) then diff --git a/FORTRAN/p2p-innerloop-openmp.F90 b/FORTRAN/p2p-innerloop-openmp.F90 index 87262098c..d308d86d6 100644 --- a/FORTRAN/p2p-innerloop-openmp.F90 +++ b/FORTRAN/p2p-innerloop-openmp.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -59,11 +60,9 @@ program main #ifdef _OPENMP use omp_lib #endif + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! 
number of times to run the pipeline algorithm integer(kind=INT32) :: n @@ -88,36 +87,13 @@ program main ' execution on 2D grid' #endif - if (command_argument_count().lt.2) then - write(*,'(a16,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ', & - '' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if (n .lt. 1) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=n) #ifdef _OPENMP - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a27,i8)') 'Number of threads = ', omp_get_max_threads() #endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', n, n + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', n, n allocate( grid(n,n), stat=err) if (err .ne. 0) then @@ -149,9 +125,11 @@ program main enddo !$omp end do + t0 = 0 + do k=0,iterations - ! start timer after a warmup iteration + ! start timer after a warmup iteration if (k.eq.1) then !$omp barrier !$omp master diff --git a/FORTRAN/p2p-innerloop.F90 b/FORTRAN/p2p-innerloop.F90 index b7d884e31..33c06678f 100644 --- a/FORTRAN/p2p-innerloop.F90 +++ b/FORTRAN/p2p-innerloop.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -61,10 +62,7 @@ program main #endif use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n @@ -89,36 +87,13 @@ program main ' execution on 2D grid' #endif - if (command_argument_count().lt.2) then - write(*,'(a16,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ', & - '' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - n = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if (n .lt. 1) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=n) #ifdef _OPENMP - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a27,i8)') 'Number of threads = ', omp_get_max_threads() #endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', n, n + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', n, n allocate( grid(n,n), stat=err) if (err .ne. 0) then diff --git a/FORTRAN/p2p-openacc.F90 b/FORTRAN/p2p-openacc.F90 index eb57bfb09..7e9f0b91a 100644 --- a/FORTRAN/p2p-openacc.F90 +++ b/FORTRAN/p2p-openacc.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! 
Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -58,10 +59,7 @@ program main use iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: m, n @@ -79,39 +77,10 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a45)') 'Fortran OpenACC pipeline execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a39)') 'Usage: ./synch_p2p <# iterations> ', & - ' ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = m - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - endif - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n) - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', n, n allocate( grid(m,n), stat=err) if (err .ne. 0) then diff --git a/FORTRAN/p2p-openmp-target.F90 b/FORTRAN/p2p-openmp-target.F90 index e5d9a31dd..f895918b6 100644 --- a/FORTRAN/p2p-openmp-target.F90 +++ b/FORTRAN/p2p-openmp-target.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -58,10 +59,7 @@ program main use iso_fortran_env use omp_lib implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: m, n @@ -80,36 +78,8 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran OpenMP TARGET pipeline execution on 2D grid' - if (command_argument_count().lt.3) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a38)') 'Usage: ./synch_p2p <# iterations> ', & - ' ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = 1 - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 
1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n) - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() write(*,'(a,i8)') 'Number of iterations = ', iterations write(*,'(a,i8,i8)') 'Grid sizes = ', m, n diff --git a/FORTRAN/p2p-tasks-openmp.F90 b/FORTRAN/p2p-tasks-openmp.F90 index fcebe0c82..741bf2d81 100644 --- a/FORTRAN/p2p-tasks-openmp.F90 +++ b/FORTRAN/p2p-tasks-openmp.F90 @@ -1,5 +1,6 @@ ! ! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions @@ -72,11 +73,9 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) program main use iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: m, n @@ -97,58 +96,21 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a50)') 'Fortran OpenMP TASKS pipeline execution on 2D grid' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a34,a39)') 'Usage: ./synch_p2p <# iterations> ', & - ' ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - - m = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') m - - n = m - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') n - - mc = m - call get_command_argument(4,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') mc - - nc = n - call get_command_argument(5,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') nc - endif - - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - if ((m .lt. 1).or.(n .lt. 1)) then - write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n - stop 1 - endif + call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n,tilex=mc,tiley=nc) ! mc=m or nc=n disables chunking in that dimension, which means ! there is no task parallelism to exploit - if (((mc.lt.1).or.(mc.gt.m)).or.((nc.lt.1).or.(nc.gt.n))) then + if (((mc.lt.1).or.(mc.ge.m)).or.((nc.lt.1).or.(nc.ge.n))) then mc = int(m/omp_get_max_threads()) nc = int(n/omp_get_max_threads()) endif mc = max(1,mc) nc = max(1,nc) - write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', m, n - write(*,'(a,i8,i8)') 'Size of chunking = ', mc, nc + write(*,'(a27,i8)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a27,i8,i8)') 'Size of chunking = ', mc, nc allocate( grid(m,n), stat=err) if (err .ne. 
0) then @@ -204,7 +166,6 @@ program main t1 = omp_get_wtime() pipeline_time = t1 - t0 - !$omp end master !$omp end parallel diff --git a/FORTRAN/p2p.F90 b/FORTRAN/p2p.F90 index 15e7fc896..9a84321cd 100644 --- a/FORTRAN/p2p.F90 +++ b/FORTRAN/p2p.F90 @@ -98,15 +98,15 @@ program main call prk_get_arguments('p2p',iterations=iterations,dimx=m,dimy=n,tilex=mc,tiley=nc) chunk = ((mc/=m).or.(nc/=n)) - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a27,i8)') 'Number of iterations = ', iterations + write(*,'(a27,i8,i8)') 'Grid sizes = ', m, n if (chunk) then - write(*,'(a,i8,i8)') 'Size of chunking = ', mc, nc + write(*,'(a27,i8,i8)') 'Size of chunking = ', mc, nc endif allocate( grid(m,n), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of grid returned ',err + write(*,'(a22,i3)') 'allocation of grid returned ',err stop 1 endif From 8d9c9c4e42b78c07d6c9ddcadfbc392a9c716aba Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 12:05:34 +0200 Subject: [PATCH 189/325] use, intrinsic :: iso_fortran_env Signed-off-by: Jeff Hammond --- FORTRAN/dgemm-blas.F90 | 2 +- FORTRAN/dgemm-ga.F90 | 2 +- FORTRAN/dgemm-openmp-target.F90 | 2 +- FORTRAN/dgemm-openmp.F90 | 4 +-- FORTRAN/dgemm-pretty.F90 | 2 +- FORTRAN/dgemm-stdpar.F90 | 4 +-- FORTRAN/dgemm-taskloop-openmp.F90 | 4 +-- FORTRAN/dgemm.F90 | 4 +-- FORTRAN/nstream-coarray.F90 | 2 +- FORTRAN/nstream-ga.F90 | 2 +- FORTRAN/nstream-mpi.F90 | 2 +- FORTRAN/nstream-openacc.F90 | 2 +- FORTRAN/nstream-openmp-target.F90 | 2 +- FORTRAN/nstream-openmp.F90 | 2 +- FORTRAN/nstream-pretty.F90 | 2 +- FORTRAN/nstream-stdpar.F90 | 2 +- FORTRAN/nstream-taskloop-openmp.F90 | 2 +- FORTRAN/nstream.F90 | 2 +- FORTRAN/p2p-async-openacc.F90 | 4 +-- FORTRAN/p2p-coarray.F90 | 2 +- FORTRAN/p2p-doacross-openmp.F90 | 2 +- FORTRAN/p2p-innerloop-openacc.F90 | 2 +- FORTRAN/p2p-innerloop-openmp.F90 | 2 +- FORTRAN/p2p-innerloop.F90 | 2 +- FORTRAN/p2p-openacc.F90 | 2 +- FORTRAN/p2p-openmp-target.F90 | 2 +- FORTRAN/p2p-tasks-openmp.F90 | 4 +-- FORTRAN/p2p.F90 | 4 +-- FORTRAN/prk_mod.F90 | 8 +++--- FORTRAN/stencil-coarray.F90 | 4 +-- FORTRAN/stencil-openacc.F90 | 4 +-- FORTRAN/stencil-openmp-target.F90 | 4 +-- FORTRAN/stencil-openmp.F90 | 4 +-- FORTRAN/stencil-pretty.F90 | 4 +-- FORTRAN/stencil-stdpar.F90 | 4 +-- FORTRAN/stencil-taskloop-openmp.F90 | 4 +-- FORTRAN/stencil.F90 | 4 +-- FORTRAN/stencil_openmp.F90 | 36 +++++++++++++-------------- FORTRAN/stencil_pretty.F90 | 36 +++++++++++++-------------- FORTRAN/stencil_serial.F90 | 36 +++++++++++++-------------- FORTRAN/stencil_target.F90 | 36 +++++++++++++-------------- FORTRAN/stencil_taskloop.F90 | 36 +++++++++++++-------------- FORTRAN/transpose-a2a-mpi.F90 | 4 +-- FORTRAN/transpose-acc-mpi.F90 | 5 ++-- FORTRAN/transpose-coarray.F90 | 2 +- FORTRAN/transpose-ga.F90 | 2 +- FORTRAN/transpose-get-mpi.F90 | 5 ++-- FORTRAN/transpose-openacc.F90 | 2 +- FORTRAN/transpose-openmp-target.F90 | 2 +- FORTRAN/transpose-openmp.F90 | 2 +- FORTRAN/transpose-p2p-mpi.F90 | 4 +-- FORTRAN/transpose-pointer.F90 | 2 +- FORTRAN/transpose-pretty.F90 | 2 +- FORTRAN/transpose-stdpar.F90 | 2 +- FORTRAN/transpose-taskloop-openmp.F90 | 2 +- FORTRAN/transpose-tasks-openmp.F90 | 2 +- FORTRAN/transpose.F90 | 2 +- 57 files changed, 166 insertions(+), 164 deletions(-) diff --git a/FORTRAN/dgemm-blas.F90 b/FORTRAN/dgemm-blas.F90 index 3f1a54f2a..5ddc39ac1 100644 --- a/FORTRAN/dgemm-blas.F90 +++ b/FORTRAN/dgemm-blas.F90 @@ -53,7 +53,7 @@ ! 
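! [Editorial note, not part of the patch] The mechanical change in this patch,
!      use, intrinsic :: iso_fortran_env   ! instead of: use iso_fortran_env
! adds the intrinsic module-nature specifier, which binds the use statement to
! the compiler's built-in iso_fortran_env module so that a user-defined module
! of the same name cannot be picked up by mistake.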
******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env #ifdef _OPENMP use omp_lib #endif diff --git a/FORTRAN/dgemm-ga.F90 b/FORTRAN/dgemm-ga.F90 index cd52c43e6..35787af1e 100644 --- a/FORTRAN/dgemm-ga.F90 +++ b/FORTRAN/dgemm-ga.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk implicit none diff --git a/FORTRAN/dgemm-openmp-target.F90 b/FORTRAN/dgemm-openmp-target.F90 index 7ed137a4a..e610c9019 100644 --- a/FORTRAN/dgemm-openmp-target.F90 +++ b/FORTRAN/dgemm-openmp-target.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/dgemm-openmp.F90 b/FORTRAN/dgemm-openmp.F90 index 17ccb29cf..f1d8b7d28 100644 --- a/FORTRAN/dgemm-openmp.F90 +++ b/FORTRAN/dgemm-openmp.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* subroutine prk_dgemm(order, tile_size, A, B, C) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: order, tile_size real(kind=REAL64), intent(in) :: A(order,order) @@ -95,7 +95,7 @@ subroutine prk_dgemm(order, tile_size, A, B, C) end subroutine prk_dgemm program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/dgemm-pretty.F90 b/FORTRAN/dgemm-pretty.F90 index fac1a6250..74b6dcf85 100644 --- a/FORTRAN/dgemm-pretty.F90 +++ b/FORTRAN/dgemm-pretty.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env #ifdef NVHPC use cutensorex use cudafor diff --git a/FORTRAN/dgemm-stdpar.F90 b/FORTRAN/dgemm-stdpar.F90 index 9cb8a49e1..0cf4c35c4 100644 --- a/FORTRAN/dgemm-stdpar.F90 +++ b/FORTRAN/dgemm-stdpar.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* subroutine prk_dgemm(order, tile_size, A, B, C) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: order, tile_size real(kind=REAL64), intent(in) :: A(order,order) @@ -87,7 +87,7 @@ subroutine prk_dgemm(order, tile_size, A, B, C) end subroutine prk_dgemm program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/dgemm-taskloop-openmp.F90 b/FORTRAN/dgemm-taskloop-openmp.F90 index e663cbf72..50333144b 100644 --- a/FORTRAN/dgemm-taskloop-openmp.F90 +++ b/FORTRAN/dgemm-taskloop-openmp.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* subroutine prk_dgemm(order, tile_size, A, B, C) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: order, tile_size real(kind=REAL64), intent(in) :: A(order,order) @@ -102,7 +102,7 @@ subroutine prk_dgemm(order, tile_size, A, B, C) end subroutine prk_dgemm program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/dgemm.F90 b/FORTRAN/dgemm.F90 index 47e968e2c..e3d121d11 100644 --- a/FORTRAN/dgemm.F90 +++ b/FORTRAN/dgemm.F90 @@ -53,7 +53,7 @@ ! 
******************************************************************* subroutine prk_dgemm(order, tile_size, A, B, C) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: order, tile_size real(kind=REAL64), intent(in) :: A(order,order) @@ -102,7 +102,7 @@ subroutine prk_dgemm(order, tile_size, A, B, C) end subroutine prk_dgemm program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/nstream-coarray.F90 b/FORTRAN/nstream-coarray.F90 index cca208f97..33f28d0cb 100644 --- a/FORTRAN/nstream-coarray.F90 +++ b/FORTRAN/nstream-coarray.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: me, np, p diff --git a/FORTRAN/nstream-ga.F90 b/FORTRAN/nstream-ga.F90 index fc9343b6a..6e8a7634a 100644 --- a/FORTRAN/nstream-ga.F90 +++ b/FORTRAN/nstream-ga.F90 @@ -66,7 +66,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk implicit none diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index e45735eee..2f4e58937 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env #ifdef _OPENMP use omp_lib #endif diff --git a/FORTRAN/nstream-openacc.F90 b/FORTRAN/nstream-openacc.F90 index 952e4ed35..04fb96c9f 100644 --- a/FORTRAN/nstream-openacc.F90 +++ b/FORTRAN/nstream-openacc.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/nstream-openmp-target.F90 b/FORTRAN/nstream-openmp-target.F90 index adda23a4a..ba6c08121 100644 --- a/FORTRAN/nstream-openmp-target.F90 +++ b/FORTRAN/nstream-openmp-target.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/nstream-openmp.F90 b/FORTRAN/nstream-openmp.F90 index e98ebc0e6..65f001140 100644 --- a/FORTRAN/nstream-openmp.F90 +++ b/FORTRAN/nstream-openmp.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/nstream-pretty.F90 b/FORTRAN/nstream-pretty.F90 index 65a46ea95..1e893d2d4 100644 --- a/FORTRAN/nstream-pretty.F90 +++ b/FORTRAN/nstream-pretty.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/nstream-stdpar.F90 b/FORTRAN/nstream-stdpar.F90 index 3e747787f..2d29f0319 100644 --- a/FORTRAN/nstream-stdpar.F90 +++ b/FORTRAN/nstream-stdpar.F90 @@ -64,7 +64,7 @@ ! 
******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/nstream-taskloop-openmp.F90 b/FORTRAN/nstream-taskloop-openmp.F90 index 64d8536fd..ac33e4026 100644 --- a/FORTRAN/nstream-taskloop-openmp.F90 +++ b/FORTRAN/nstream-taskloop-openmp.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/nstream.F90 b/FORTRAN/nstream.F90 index 85fff7971..989a971f4 100644 --- a/FORTRAN/nstream.F90 +++ b/FORTRAN/nstream.F90 @@ -64,7 +64,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/p2p-async-openacc.F90 b/FORTRAN/p2p-async-openacc.F90 index 2f4f94de6..9efee8c64 100644 --- a/FORTRAN/p2p-async-openacc.F90 +++ b/FORTRAN/p2p-async-openacc.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: m,n integer(kind=INT32), intent(in) :: startm,endm @@ -73,7 +73,7 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) end subroutine program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/p2p-coarray.F90 b/FORTRAN/p2p-coarray.F90 index 664b7ce9c..0f4aafbb5 100644 --- a/FORTRAN/p2p-coarray.F90 +++ b/FORTRAN/p2p-coarray.F90 @@ -58,7 +58,7 @@ ! ******************************************************************** program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/p2p-doacross-openmp.F90 b/FORTRAN/p2p-doacross-openmp.F90 index e2c494b3e..8202811b2 100644 --- a/FORTRAN/p2p-doacross-openmp.F90 +++ b/FORTRAN/p2p-doacross-openmp.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/p2p-innerloop-openacc.F90 b/FORTRAN/p2p-innerloop-openacc.F90 index c662ed911..6183ea536 100644 --- a/FORTRAN/p2p-innerloop-openacc.F90 +++ b/FORTRAN/p2p-innerloop-openacc.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/p2p-innerloop-openmp.F90 b/FORTRAN/p2p-innerloop-openmp.F90 index d308d86d6..42b086d8e 100644 --- a/FORTRAN/p2p-innerloop-openmp.F90 +++ b/FORTRAN/p2p-innerloop-openmp.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env #ifdef _OPENMP use omp_lib #endif diff --git a/FORTRAN/p2p-innerloop.F90 b/FORTRAN/p2p-innerloop.F90 index 33c06678f..9df15fdf1 100644 --- a/FORTRAN/p2p-innerloop.F90 +++ b/FORTRAN/p2p-innerloop.F90 @@ -56,7 +56,7 @@ ! 
******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env #ifdef _OPENMP use omp_lib #endif diff --git a/FORTRAN/p2p-openacc.F90 b/FORTRAN/p2p-openacc.F90 index 7e9f0b91a..f2d25db16 100644 --- a/FORTRAN/p2p-openacc.F90 +++ b/FORTRAN/p2p-openacc.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/p2p-openmp-target.F90 b/FORTRAN/p2p-openmp-target.F90 index f895918b6..8c3bbe7f3 100644 --- a/FORTRAN/p2p-openmp-target.F90 +++ b/FORTRAN/p2p-openmp-target.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib implicit none integer :: err diff --git a/FORTRAN/p2p-tasks-openmp.F90 b/FORTRAN/p2p-tasks-openmp.F90 index 741bf2d81..067132815 100644 --- a/FORTRAN/p2p-tasks-openmp.F90 +++ b/FORTRAN/p2p-tasks-openmp.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: m,n integer(kind=INT32), intent(in) :: startm,endm @@ -71,7 +71,7 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) end subroutine program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/p2p.F90 b/FORTRAN/p2p.F90 index 9a84321cd..06f07caad 100644 --- a/FORTRAN/p2p.F90 +++ b/FORTRAN/p2p.F90 @@ -56,7 +56,7 @@ ! ******************************************************************* subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: m,n integer(kind=INT32), intent(in) :: startm,endm @@ -71,7 +71,7 @@ subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) end subroutine program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/prk_mod.F90 b/FORTRAN/prk_mod.F90 index 26021839f..43fe37053 100644 --- a/FORTRAN/prk_mod.F90 +++ b/FORTRAN/prk_mod.F90 @@ -1,7 +1,7 @@ module prk contains function prk_get_wtime() result(t) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none real(kind=REAL64) :: t integer(kind=INT64) :: c, r @@ -17,7 +17,7 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? dimx, dimy, & ! p2p tilex, tiley, & ! p2p stencil, radius) ! not supported in implementations yet - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none character(len=*), intent(in) :: kernel integer(kind=INT32), intent(out) :: iterations @@ -322,7 +322,7 @@ subroutine prk_get_arguments(kernel, & ! which kernel am i parsing? 
end subroutine subroutine initialize_w(is_star,r,W) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star integer(kind=INT32), intent(in) :: r @@ -352,7 +352,7 @@ subroutine initialize_w(is_star,r,W) end subroutine initialize_w subroutine print_matrix(mat, label) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none real(kind=REAL64), intent(in) :: mat(:,:) integer(kind=INT32), intent(in), optional :: label diff --git a/FORTRAN/stencil-coarray.F90 b/FORTRAN/stencil-coarray.F90 index 61c248cc7..efb5efa98 100644 --- a/FORTRAN/stencil-coarray.F90 +++ b/FORTRAN/stencil-coarray.F90 @@ -64,7 +64,7 @@ ! ************************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -145,7 +145,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index c656ec9f1..a5543e5f3 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -62,7 +62,7 @@ ! ******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -142,7 +142,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/stencil-openmp-target.F90 b/FORTRAN/stencil-openmp-target.F90 index f40415284..c22527368 100644 --- a/FORTRAN/stencil-openmp-target.F90 +++ b/FORTRAN/stencil-openmp-target.F90 @@ -62,7 +62,7 @@ ! ******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -148,7 +148,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/stencil-openmp.F90 b/FORTRAN/stencil-openmp.F90 index e7f51eec6..caaa7974f 100644 --- a/FORTRAN/stencil-openmp.F90 +++ b/FORTRAN/stencil-openmp.F90 @@ -62,7 +62,7 @@ ! ******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -143,7 +143,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/stencil-pretty.F90 b/FORTRAN/stencil-pretty.F90 index 44ba2e4ff..285f1b8e0 100644 --- a/FORTRAN/stencil-pretty.F90 +++ b/FORTRAN/stencil-pretty.F90 @@ -62,7 +62,7 @@ ! 
******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -143,7 +143,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/stencil-stdpar.F90 b/FORTRAN/stencil-stdpar.F90 index 01336bfaa..465962421 100644 --- a/FORTRAN/stencil-stdpar.F90 +++ b/FORTRAN/stencil-stdpar.F90 @@ -62,7 +62,7 @@ ! ******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -134,7 +134,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/stencil-taskloop-openmp.F90 b/FORTRAN/stencil-taskloop-openmp.F90 index 65276a54e..2410c5555 100644 --- a/FORTRAN/stencil-taskloop-openmp.F90 +++ b/FORTRAN/stencil-taskloop-openmp.F90 @@ -62,7 +62,7 @@ ! ******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -145,7 +145,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib use prk implicit none diff --git a/FORTRAN/stencil.F90 b/FORTRAN/stencil.F90 index 49a6f9b1a..484871019 100644 --- a/FORTRAN/stencil.F90 +++ b/FORTRAN/stencil.F90 @@ -62,7 +62,7 @@ ! 
******************************************************************* subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) - use iso_fortran_env + use, intrinsic :: iso_fortran_env implicit none logical, intent(in) :: is_star, tiling integer(kind=INT32), intent(in) :: tile_size, r, n @@ -135,7 +135,7 @@ subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) end subroutine apply_stencil program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err diff --git a/FORTRAN/stencil_openmp.F90 b/FORTRAN/stencil_openmp.F90 index 01ad2178f..ae48ca2fe 100644 --- a/FORTRAN/stencil_openmp.F90 +++ b/FORTRAN/stencil_openmp.F90 @@ -1,5 +1,5 @@ subroutine star1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -24,7 +24,7 @@ subroutine star1(n, in, out) end subroutine subroutine star2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -53,7 +53,7 @@ subroutine star2(n, in, out) end subroutine subroutine star3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -86,7 +86,7 @@ subroutine star3(n, in, out) end subroutine subroutine star4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -123,7 +123,7 @@ subroutine star4(n, in, out) end subroutine subroutine star5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -164,7 +164,7 @@ subroutine star5(n, in, out) end subroutine subroutine star6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -209,7 +209,7 @@ subroutine star6(n, in, out) end subroutine subroutine star7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -258,7 +258,7 @@ subroutine star7(n, in, out) end subroutine subroutine star8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -311,7 +311,7 @@ subroutine star8(n, in, out) end subroutine subroutine star9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -368,7 +368,7 @@ subroutine star9(n, in, out) end subroutine subroutine grid1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -393,7 +393,7 @@ subroutine grid1(n, in, out) end subroutine subroutine grid2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -428,7 +428,7 @@ subroutine grid2(n, in, out) end subroutine subroutine grid3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -479,7 +479,7 @@ subroutine 
grid3(n, in, out) end subroutine subroutine grid4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -552,7 +552,7 @@ subroutine grid4(n, in, out) end subroutine subroutine grid5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -653,7 +653,7 @@ subroutine grid5(n, in, out) end subroutine subroutine grid6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -788,7 +788,7 @@ subroutine grid6(n, in, out) end subroutine subroutine grid7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -963,7 +963,7 @@ subroutine grid7(n, in, out) end subroutine subroutine grid8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -1184,7 +1184,7 @@ subroutine grid8(n, in, out) end subroutine subroutine grid9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) diff --git a/FORTRAN/stencil_pretty.F90 b/FORTRAN/stencil_pretty.F90 index 60f64fd66..f9a71cbc9 100644 --- a/FORTRAN/stencil_pretty.F90 +++ b/FORTRAN/stencil_pretty.F90 @@ -1,5 +1,5 @@ subroutine star1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -18,7 +18,7 @@ subroutine star1(n, in, out) end subroutine subroutine star2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -41,7 +41,7 @@ subroutine star2(n, in, out) end subroutine subroutine star3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -68,7 +68,7 @@ subroutine star3(n, in, out) end subroutine subroutine star4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -99,7 +99,7 @@ subroutine star4(n, in, out) end subroutine subroutine star5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -134,7 +134,7 @@ subroutine star5(n, in, out) end subroutine subroutine star6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -173,7 +173,7 @@ subroutine star6(n, in, out) end subroutine subroutine star7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -216,7 +216,7 @@ subroutine star7(n, in, out) end subroutine subroutine star8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -263,7 +263,7 @@ subroutine star8(n, in, out) end subroutine subroutine star9(n, in, out) 
-use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -314,7 +314,7 @@ subroutine star9(n, in, out) end subroutine subroutine grid1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -333,7 +333,7 @@ subroutine grid1(n, in, out) end subroutine subroutine grid2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -362,7 +362,7 @@ subroutine grid2(n, in, out) end subroutine subroutine grid3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -407,7 +407,7 @@ subroutine grid3(n, in, out) end subroutine subroutine grid4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -474,7 +474,7 @@ subroutine grid4(n, in, out) end subroutine subroutine grid5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -569,7 +569,7 @@ subroutine grid5(n, in, out) end subroutine subroutine grid6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -698,7 +698,7 @@ subroutine grid6(n, in, out) end subroutine subroutine grid7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -867,7 +867,7 @@ subroutine grid7(n, in, out) end subroutine subroutine grid8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -1082,7 +1082,7 @@ subroutine grid8(n, in, out) end subroutine subroutine grid9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) diff --git a/FORTRAN/stencil_serial.F90 b/FORTRAN/stencil_serial.F90 index 60f64fd66..f9a71cbc9 100644 --- a/FORTRAN/stencil_serial.F90 +++ b/FORTRAN/stencil_serial.F90 @@ -1,5 +1,5 @@ subroutine star1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -18,7 +18,7 @@ subroutine star1(n, in, out) end subroutine subroutine star2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -41,7 +41,7 @@ subroutine star2(n, in, out) end subroutine subroutine star3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -68,7 +68,7 @@ subroutine star3(n, in, out) end subroutine subroutine star4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -99,7 +99,7 @@ subroutine star4(n, in, out) end subroutine subroutine star5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit 
none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -134,7 +134,7 @@ subroutine star5(n, in, out) end subroutine subroutine star6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -173,7 +173,7 @@ subroutine star6(n, in, out) end subroutine subroutine star7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -216,7 +216,7 @@ subroutine star7(n, in, out) end subroutine subroutine star8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -263,7 +263,7 @@ subroutine star8(n, in, out) end subroutine subroutine star9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -314,7 +314,7 @@ subroutine star9(n, in, out) end subroutine subroutine grid1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -333,7 +333,7 @@ subroutine grid1(n, in, out) end subroutine subroutine grid2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -362,7 +362,7 @@ subroutine grid2(n, in, out) end subroutine subroutine grid3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -407,7 +407,7 @@ subroutine grid3(n, in, out) end subroutine subroutine grid4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -474,7 +474,7 @@ subroutine grid4(n, in, out) end subroutine subroutine grid5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -569,7 +569,7 @@ subroutine grid5(n, in, out) end subroutine subroutine grid6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -698,7 +698,7 @@ subroutine grid6(n, in, out) end subroutine subroutine grid7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -867,7 +867,7 @@ subroutine grid7(n, in, out) end subroutine subroutine grid8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -1082,7 +1082,7 @@ subroutine grid8(n, in, out) end subroutine subroutine grid9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) diff --git a/FORTRAN/stencil_target.F90 b/FORTRAN/stencil_target.F90 index 10072a7cb..c0041b42f 100644 --- a/FORTRAN/stencil_target.F90 +++ b/FORTRAN/stencil_target.F90 @@ -1,5 +1,5 @@ subroutine star1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) 
:: n @@ -22,7 +22,7 @@ subroutine star1(n, in, out) end subroutine subroutine star2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -49,7 +49,7 @@ subroutine star2(n, in, out) end subroutine subroutine star3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -80,7 +80,7 @@ subroutine star3(n, in, out) end subroutine subroutine star4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -115,7 +115,7 @@ subroutine star4(n, in, out) end subroutine subroutine star5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -154,7 +154,7 @@ subroutine star5(n, in, out) end subroutine subroutine star6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -197,7 +197,7 @@ subroutine star6(n, in, out) end subroutine subroutine star7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -244,7 +244,7 @@ subroutine star7(n, in, out) end subroutine subroutine star8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -295,7 +295,7 @@ subroutine star8(n, in, out) end subroutine subroutine star9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -350,7 +350,7 @@ subroutine star9(n, in, out) end subroutine subroutine grid1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -373,7 +373,7 @@ subroutine grid1(n, in, out) end subroutine subroutine grid2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -406,7 +406,7 @@ subroutine grid2(n, in, out) end subroutine subroutine grid3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -455,7 +455,7 @@ subroutine grid3(n, in, out) end subroutine subroutine grid4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -526,7 +526,7 @@ subroutine grid4(n, in, out) end subroutine subroutine grid5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -625,7 +625,7 @@ subroutine grid5(n, in, out) end subroutine subroutine grid6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -758,7 +758,7 @@ subroutine grid6(n, in, out) end subroutine subroutine grid7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -931,7 +931,7 @@ subroutine grid7(n, in, out) end subroutine subroutine grid8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n @@ -1150,7 +1150,7 
@@ subroutine grid8(n, in, out) end subroutine subroutine grid9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none !$omp declare target integer(kind=INT32), intent(in) :: n diff --git a/FORTRAN/stencil_taskloop.F90 b/FORTRAN/stencil_taskloop.F90 index d0783491a..b80f01eac 100644 --- a/FORTRAN/stencil_taskloop.F90 +++ b/FORTRAN/stencil_taskloop.F90 @@ -1,5 +1,5 @@ subroutine star1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -22,7 +22,7 @@ subroutine star1(n, in, out) end subroutine subroutine star2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -49,7 +49,7 @@ subroutine star2(n, in, out) end subroutine subroutine star3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -80,7 +80,7 @@ subroutine star3(n, in, out) end subroutine subroutine star4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -115,7 +115,7 @@ subroutine star4(n, in, out) end subroutine subroutine star5(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -154,7 +154,7 @@ subroutine star5(n, in, out) end subroutine subroutine star6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -197,7 +197,7 @@ subroutine star6(n, in, out) end subroutine subroutine star7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -244,7 +244,7 @@ subroutine star7(n, in, out) end subroutine subroutine star8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -295,7 +295,7 @@ subroutine star8(n, in, out) end subroutine subroutine star9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -350,7 +350,7 @@ subroutine star9(n, in, out) end subroutine subroutine grid1(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -373,7 +373,7 @@ subroutine grid1(n, in, out) end subroutine subroutine grid2(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -406,7 +406,7 @@ subroutine grid2(n, in, out) end subroutine subroutine grid3(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -455,7 +455,7 @@ subroutine grid3(n, in, out) end subroutine subroutine grid4(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -526,7 +526,7 @@ subroutine grid4(n, in, out) end subroutine subroutine grid5(n, in, out) 
-use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -625,7 +625,7 @@ subroutine grid5(n, in, out) end subroutine subroutine grid6(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -758,7 +758,7 @@ subroutine grid6(n, in, out) end subroutine subroutine grid7(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -931,7 +931,7 @@ subroutine grid7(n, in, out) end subroutine subroutine grid8(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) @@ -1150,7 +1150,7 @@ subroutine grid8(n, in, out) end subroutine subroutine grid9(n, in, out) -use iso_fortran_env +use, intrinsic :: iso_fortran_env implicit none integer(kind=INT32), intent(in) :: n real(kind=REAL64), intent(in) :: in(n,n) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index 11d098e6b..c121b037a 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -56,7 +56,7 @@ module prk_mpi contains subroutine mpi_print_matrix(mat,clabel) - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk implicit none @@ -83,7 +83,7 @@ subroutine mpi_print_matrix(mat,clabel) end module prk_mpi program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk use prk_mpi diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 0c3e37952..9023a006f 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -56,7 +56,7 @@ module prk_mpi contains subroutine mpi_print_matrix(mat,clabel) - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk implicit none @@ -83,7 +83,8 @@ subroutine mpi_print_matrix(mat,clabel) end module prk_mpi program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env + use, intrinsic :: iso_c_binding use mpi_f08 use prk use prk_mpi diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index 4b5ace825..bc15f1238 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -55,7 +55,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none ! for argument parsing diff --git a/FORTRAN/transpose-ga.F90 b/FORTRAN/transpose-ga.F90 index 97c9b0421..8d81c038d 100644 --- a/FORTRAN/transpose-ga.F90 +++ b/FORTRAN/transpose-ga.F90 @@ -53,7 +53,7 @@ ! 
******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 implicit none #include "global.fh" diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index 4da287053..b153117ca 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -56,7 +56,7 @@ module prk_mpi contains subroutine mpi_print_matrix(mat,clabel) - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk implicit none @@ -83,7 +83,8 @@ subroutine mpi_print_matrix(mat,clabel) end module prk_mpi program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env + use, intrinsic :: iso_c_binding use mpi_f08 use prk use prk_mpi diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index b0da3c8e2..02ab0ab9d 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none ! for argument parsing diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 7ea77ad8e..4aa431b18 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib implicit none ! for argument parsing diff --git a/FORTRAN/transpose-openmp.F90 b/FORTRAN/transpose-openmp.F90 index b83106b14..93dab50a8 100644 --- a/FORTRAN/transpose-openmp.F90 +++ b/FORTRAN/transpose-openmp.F90 @@ -53,7 +53,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib implicit none ! for argument parsing diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index e376c99eb..3d72cb36c 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -56,7 +56,7 @@ module prk_mpi contains subroutine mpi_print_matrix(mat,clabel) - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk implicit none @@ -83,7 +83,7 @@ subroutine mpi_print_matrix(mat,clabel) end module prk_mpi program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use mpi_f08 use prk use prk_mpi diff --git a/FORTRAN/transpose-pointer.F90 b/FORTRAN/transpose-pointer.F90 index 72cac8cc7..87c3eaac1 100644 --- a/FORTRAN/transpose-pointer.F90 +++ b/FORTRAN/transpose-pointer.F90 @@ -54,7 +54,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none ! for argument parsing diff --git a/FORTRAN/transpose-pretty.F90 b/FORTRAN/transpose-pretty.F90 index d40956c57..885c4ac3d 100644 --- a/FORTRAN/transpose-pretty.F90 +++ b/FORTRAN/transpose-pretty.F90 @@ -50,7 +50,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none ! for argument parsing diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index c69dc9a9b..26c0e87f5 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -53,7 +53,7 @@ ! 
******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none ! for argument parsing diff --git a/FORTRAN/transpose-taskloop-openmp.F90 b/FORTRAN/transpose-taskloop-openmp.F90 index 21ff1ba7e..3cc0fbc78 100644 --- a/FORTRAN/transpose-taskloop-openmp.F90 +++ b/FORTRAN/transpose-taskloop-openmp.F90 @@ -52,7 +52,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib implicit none ! for argument parsing diff --git a/FORTRAN/transpose-tasks-openmp.F90 b/FORTRAN/transpose-tasks-openmp.F90 index b5a97635c..a0ac9afb9 100644 --- a/FORTRAN/transpose-tasks-openmp.F90 +++ b/FORTRAN/transpose-tasks-openmp.F90 @@ -52,7 +52,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use omp_lib implicit none ! for argument parsing diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 13b345a03..4e398a1bf 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -54,7 +54,7 @@ ! ******************************************************************* program main - use iso_fortran_env + use, intrinsic :: iso_fortran_env use prk implicit none integer :: err From 35bef6dd751a5f30cddffb991cc42c3a455fafc9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 12:17:12 +0200 Subject: [PATCH 190/325] 2022 release supports tasking now --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index adcddef87..4a1315edb 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -34,7 +34,7 @@ ifeq ($(findstring ifort,$(FC)),ifort) endif # OpenMP target stuff is only in IFX now ifeq ($(findstring ifx,$(FC)),ifx) - EXTRA = target + EXTRA = target tasks taskloop endif # GCC (also matches pgfortran so PGI must come after) ifeq ($(findstring gfortran,$(FC)),gfortran) From 1fdfef3310816bd2be971de20994ce2479e64594 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 13:23:02 +0200 Subject: [PATCH 191/325] detect MPI datatypes Signed-off-by: Jeff Hammond --- Cxx11/prk_mpi.h | 69 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/Cxx11/prk_mpi.h b/Cxx11/prk_mpi.h index 8a05e0325..c9aac492a 100644 --- a/Cxx11/prk_mpi.h +++ b/Cxx11/prk_mpi.h @@ -9,6 +9,7 @@ #include #include #include // exclusive_scan +#include #include #include @@ -52,6 +53,19 @@ namespace prk } } + template + MPI_Datatype get_MPI_Datatype(T t) { return MPI_DATATYPE_NULL; } + + template <> + MPI_Datatype get_MPI_Datatype(double d) { return MPI_DOUBLE; } + template <> + MPI_Datatype get_MPI_Datatype(int i) { return MPI_INT; } + template <> + MPI_Datatype get_MPI_Datatype(size_t s) { + static_assert( sizeof(size_t) == sizeof(int64_t) && sizeof(size_t) == sizeof(uint64_t) ); + return ( std::is_signed() ? 
MPI_INT64_T : MPI_UINT64_T ); + } + class state { private: @@ -104,47 +118,61 @@ namespace prk prk::MPI::check( MPI_Barrier(comm) ); } - double min(double in, MPI_Comm comm = MPI_COMM_WORLD) { - double out; - prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_MIN, comm) ); + template + T min(T in, MPI_Comm comm = MPI_COMM_WORLD) { + T out; + MPI_Datatype dt = prk::MPI::get_MPI_Datatype(in); + prk::MPI::check( MPI_Allreduce(&in, &out, 1, dt, MPI_MIN, comm) ); return out; } - double max(double in, MPI_Comm comm = MPI_COMM_WORLD) { - double out; - prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_MAX, comm) ); + template + T max(T in, MPI_Comm comm = MPI_COMM_WORLD) { + T out; + MPI_Datatype dt = prk::MPI::get_MPI_Datatype(in); + prk::MPI::check( MPI_Allreduce(&in, &out, 1, dt, MPI_MAX, comm) ); return out; } - double sum(double in, MPI_Comm comm = MPI_COMM_WORLD) { - double out; - prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_SUM, comm) ); + template + T sum(T in, MPI_Comm comm = MPI_COMM_WORLD) { + T out; + MPI_Datatype dt = prk::MPI::get_MPI_Datatype(in); + prk::MPI::check( MPI_Allreduce(&in, &out, 1, dt, MPI_SUM, comm) ); return out; } - double avg(double in, MPI_Comm comm = MPI_COMM_WORLD) { - double out; - prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_SUM, comm) ); + template + T avg(T in, MPI_Comm comm = MPI_COMM_WORLD) { + T out; + MPI_Datatype dt = prk::MPI::get_MPI_Datatype(in); + prk::MPI::check( MPI_Allreduce(&in, &out, 1, dt, MPI_SUM, comm) ); out /= prk::MPI::size(comm); return out; } - void stats(double in, double * min, double * max, double * avg, MPI_Comm comm = MPI_COMM_WORLD) { - prk::MPI::check( MPI_Allreduce(&in, min, 1, MPI_DOUBLE, MPI_MIN, comm) ); - prk::MPI::check( MPI_Allreduce(&in, max, 1, MPI_DOUBLE, MPI_MAX, comm) ); - prk::MPI::check( MPI_Allreduce(&in, avg, 1, MPI_DOUBLE, MPI_SUM, comm) ); + template + void stats(T in, T * min, T * max, T * avg, MPI_Comm comm = MPI_COMM_WORLD) { + MPI_Datatype dt = prk::MPI::get_MPI_Datatype(in); + prk::MPI::check( MPI_Allreduce(&in, min, 1, dt, MPI_MIN, comm) ); + prk::MPI::check( MPI_Allreduce(&in, max, 1, dt, MPI_MAX, comm) ); + prk::MPI::check( MPI_Allreduce(&in, avg, 1, dt, MPI_SUM, comm) ); *avg /= prk::MPI::size(comm); } - bool is_same(int in, MPI_Comm comm = MPI_COMM_WORLD) { - int min=INT_MAX, max=0; - prk::MPI::check( MPI_Allreduce(&in, &min, 1, MPI_INT, MPI_MIN, comm) ); - prk::MPI::check( MPI_Allreduce(&in, &max, 1, MPI_INT, MPI_MAX, comm) ); + template + bool is_same(T in, MPI_Comm comm = MPI_COMM_WORLD) { + T min=std::numeric_limits::max(); + T max=std::numeric_limits::min(); + MPI_Datatype dt = prk::MPI::get_MPI_Datatype(in); + prk::MPI::check( MPI_Allreduce(&in, &min, 1, dt, MPI_MIN, comm) ); + prk::MPI::check( MPI_Allreduce(&in, &max, 1, dt, MPI_MAX, comm) ); return (min==max); } bool is_same(size_t in, MPI_Comm comm = MPI_COMM_WORLD) { size_t min=SIZE_MAX, max=0; + static_assert( sizeof(size_t) == sizeof(int64_t) && sizeof(size_t) == sizeof(uint64_t) ); MPI_Datatype dt = (std::is_signed() ? MPI_INT64_T : MPI_UINT64_T); prk::MPI::check( MPI_Allreduce(&in, &min, 1, dt, MPI_MIN, comm) ); prk::MPI::check( MPI_Allreduce(&in, &max, 1, dt, MPI_MAX, comm) ); @@ -153,6 +181,7 @@ namespace prk size_t sum(size_t in, MPI_Comm comm = MPI_COMM_WORLD) { size_t out; + static_assert( sizeof(size_t) == sizeof(int64_t) && sizeof(size_t) == sizeof(uint64_t) ); MPI_Datatype dt = (std::is_signed() ? 
MPI_INT64_T : MPI_UINT64_T); prk::MPI::check( MPI_Allreduce(&in, &out, 1, dt, MPI_SUM, comm) ); return out; From c086730dbaedac00254abcd8120d674d18ac8191 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 13:23:13 +0200 Subject: [PATCH 192/325] oneAPI update --- common/make.defs.oneapi | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index cfd040dc8..16980b7d4 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -12,9 +12,9 @@ CC=icx -std=c11 -pthread #EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. -FC=ifx -std08 -fpp +FC=ifx -std18 -fpp # C++11 may not be required but does no harm here. -CXX=icpx -std=c++17 -pthread +CXX=icpx -std=c++20 -pthread #--gcc-toolchain=/opt/gcc/11.2.0 # # Compiler flags # @@ -28,7 +28,7 @@ DEFAULT_OPT_FLAGS=-g -O3 -xHOST # # OpenMP flags # -OPENMPFLAG=-fiopenmp +OPENMPFLAG=-qopenmp OPENMPSIMDFLAG=-qopenmp-simd OFFLOADFLAG=-fopenmp-targets=spir64 OFFLOADFLAG+=-DGPU_SCHEDULE="" @@ -75,6 +75,7 @@ TBBFLAG=-tbb # Parallel STL, Boost, etc. # BOOSTFLAG= +#RANGEFLAG=-DUSE_GCC_RANGES RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -I./pstl/stdlib -I./pstl/include @@ -88,13 +89,13 @@ THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # CBLAS for C++ DGEMM # #CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions -BLASFLAG=-mkl -CBLASFLAG=-DMKL -mkl +BLASFLAG=-qmkl +CBLASFLAG=-DMKL -qmkl #MKLROOT=/opt/intel/inteloneapi/mkl/latest #ONEMKLFLAG=-I$(MKLROOT)/include -DMKL_ILP64 ${MKLROOT}/lib/intel64/libmkl_sycl.a -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_tbb_thread ${TBBFLAG} -lmkl_core -lOpenCL -ldl #ONEMKLFLAG+=-I/opt/intel/oneapi/mkl/latest/include/ -ONEMKLFLAG=-I$(MKLROOT)/include -DMKL_ILP64 ${MKLROOT}/lib/intel64/libmkl_sycl.a -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lOpenCL -ldl -#ONEMKLFLAG=-mkl +#ONEMKLFLAG=-I$(MKLROOT)/include -DMKL_ILP64 ${MKLROOT}/lib/intel64/libmkl_sycl.a -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lOpenCL -ldl +ONEMKLFLAG=-qmkl # # CUDA flags # @@ -124,7 +125,7 @@ MPICC=${MPIDIR}/bin/mpiicc MPICXX=${MPIDIR}/bin/mpiicpc MPIFORT=${MPIDIR}/bin/mpiifort MPIINC=-I${MPIDIR}/include -MPILIB=-L${MPIDIR}/lib -lmpi +MPILIB=-L${MPIDIR}/lib -L${MPIDIR}/lib/release -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi #MPIINC=-I/usr/include/mpich-3.2-x86_64 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi From d828adad749ac1d1591c26468268c3053874054e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 13:29:19 +0200 Subject: [PATCH 193/325] GPU_SCHEDULE and ignore older GF Signed-off-by: Jeff Hammond --- ci/build-run-prk.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/build-run-prk.sh b/ci/build-run-prk.sh index e9cc6877d..26a108577 100644 --- a/ci/build-run-prk.sh +++ b/ci/build-run-prk.sh @@ -658,7 +658,7 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=FORTRAN case "$FC" in gfortran) - for major in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "-6" "-5" "-4" "-3" "-2" "-1" "" ; do + for major in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "" ; do if [ -f "`which gfortran$major`" ]; then export PRK_FC="gfortran$major" echo "Found GCC Fortran: $PRK_FC" @@ -696,6 +696,9 @@ case "$PRK_TARGET" in ;; esac + # ignore this in CI + echo "OFFLOADFLAG+=-DGPU_SCHEDULE=\"\"" >> 
common/make.defs + # Serial ${MAKE} -C ${PRK_TARGET_PATH} p2p p2p-innerloop stencil transpose nstream dgemm $PRK_TARGET_PATH/p2p 10 1024 1024 From fcc72c65de3b6ea9735ab5d6c90847bc91a79ac7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 13:36:33 +0200 Subject: [PATCH 194/325] update compiler version search and disable ranges Signed-off-by: Jeff Hammond --- ci/build-run-prk.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/build-run-prk.sh b/ci/build-run-prk.sh index 26a108577..f3e849a19 100644 --- a/ci/build-run-prk.sh +++ b/ci/build-run-prk.sh @@ -113,7 +113,7 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=C1z case $CC in g*) - for major in "-9" "-8" "-7" "-6" "-5" "" ; do + for major in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "-6" "-5" "" ; do if [ -f "`which ${CC}${major}`" ]; then export PRK_CC="${CC}${major}" echo "Found C: $PRK_CC" @@ -125,7 +125,7 @@ case "$PRK_TARGET" in fi ;; clang*) - for version in "-10" "-9" "-8" "-7" "-6" "-5" "" ; do + for version in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "-6" "-5" "" ; do if [ -f "`which ${CC}${version}`" ]; then export PRK_CC="${CC}${version}" echo "Found C: $PRK_CC" @@ -268,7 +268,7 @@ case "$PRK_TARGET" in if [ "$os" = "Darwin" ] && [ "x$PRK_CXX" = "x" ] ; then brew list brew search llvm - for version in "9" "8" "7" "6" "5" ; do + for version in "14" "13" "12" "11" "10" "9" "8" "7" "6" "5" ; do if [ -f "`which /usr/local/opt/gcc@${version}/bin/g++-${version}`" ]; then export PRK_CXX="`which /usr/local/opt/gcc@${version}/bin/g++-${version}`" echo "Found C++: $PRK_CXX" @@ -277,7 +277,7 @@ case "$PRK_TARGET" in done fi if [ "x$PRK_CXX" = "x" ] ; then - for major in "-9" "-8" "-7" "-6" "-5" "" ; do + for major in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "-6" "-5" "" ; do if [ -f "`which ${CXX}${major}`" ]; then export PRK_CXX="${CXX}${major}" echo "Found C++: $PRK_CXX" @@ -301,7 +301,7 @@ case "$PRK_TARGET" in done fi if [ "x$PRK_CXX" = "x" ] ; then - for version in "-11" "-10" "-9" "-8" "-7" "-6" "-5" "" ; do + for version in "-14" "-13" "-12" "-11" "-10" "-9" "-8" "-7" "-6" "-5" ; do if [ -f "`which ${CXX}${version}`" ]; then export PRK_CXX="${CXX}${version}" echo "Found C++: $PRK_CXX" From d5081e58e13cffc4b03c0ba0e9dd45f88c24acd1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 17:01:49 +0200 Subject: [PATCH 195/325] fix a bunch of ranges stuff --- Cxx11/nstream-pstl.cc | 4 +++- Cxx11/nstream-ranges.cc | 10 ++++++---- Cxx11/nstream-stl.cc | 14 ++++++++------ Cxx11/prk_pstl.h | 6 +++++- Cxx11/prk_ranges.h | 1 + Cxx11/prk_util.h | 4 ---- Cxx11/stencil-pstl.cc | 3 ++- Cxx11/stencil-ranges.cc | 4 +--- Cxx11/stencil-stl.cc | 1 + Cxx11/transpose-pstl.cc | 2 ++ Cxx11/transpose-ranges.cc | 4 +--- Cxx11/transpose-stl.cc | 1 + 12 files changed, 31 insertions(+), 23 deletions(-) diff --git a/Cxx11/nstream-pstl.cc b/Cxx11/nstream-pstl.cc index 7d9225683..3ab96319f 100644 --- a/Cxx11/nstream-pstl.cc +++ b/Cxx11/nstream-pstl.cc @@ -1,5 +1,6 @@ /// /// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -62,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include #include "prk_pstl.h" // See ParallelSTL.md for important information. 
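As context for the change made in this file and the other nstream variants touched by this patch: the hunks that follow drop the project's prk::range helper in favour of C++20 std::ranges::views::iota and then walk the resulting index view with a standard algorithm. A minimal, self-contained sketch of that pattern follows; the file name, problem size, and the plain serial std::for_each call are illustrative assumptions rather than code from the patch, while the -pstl variant of the kernel runs the same loop under a parallel execution policy.

// nstream_iota_sketch.cc -- illustrative only; build with: g++ -std=c++20 nstream_iota_sketch.cc
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <ranges>
#include <vector>

int main()
{
    const size_t length = 1000;        // illustrative problem size
    const double scalar = 3.0;

    std::vector<double> A(length, 0.0);
    std::vector<double> B(length, 2.0);
    std::vector<double> C(length, 2.0);

    // Half-open index range [0, length), as adopted in the patched kernels.
    auto range = std::ranges::views::iota(static_cast<size_t>(0), length);

    // Triad update driven by the index view.
    std::for_each(std::begin(range), std::end(range), [&] (size_t i) {
        A[i] += B[i] + scalar * C[i];
    });

    std::cout << "A[0] = " << A[0] << std::endl;   // expect 8 after one update
    return 0;
}

Expressing the loop over an index view keeps the per-element update in a single lambda while leaving the choice of serial, PSTL, or ranges-library execution to the individual kernel variants.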
@@ -110,7 +112,7 @@ int main(int argc, char * argv[]) std::vector B(length); std::vector C(length); - auto range = prk::range(static_cast(0), length); + auto range = std::ranges::views::iota(static_cast(0), length); double scalar(3); diff --git a/Cxx11/nstream-ranges.cc b/Cxx11/nstream-ranges.cc index 7a4e3da84..7f696688c 100644 --- a/Cxx11/nstream-ranges.cc +++ b/Cxx11/nstream-ranges.cc @@ -1,5 +1,6 @@ /// /// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -62,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include // See ParallelSTL.md for important information. @@ -105,11 +107,11 @@ int main(int argc, char * argv[]) double nstream_time{0}; - prk::vector A(length,0.0); - prk::vector B(length,2.0); - prk::vector C(length,2.0); + std::vector A(length,0.0); + std::vector B(length,2.0); + std::vector C(length,2.0); - auto range = prk::range(0,length); + auto range = std::ranges::views::iota(static_cast(0), length); double scalar(3); diff --git a/Cxx11/nstream-stl.cc b/Cxx11/nstream-stl.cc index 2946f2160..591678e4d 100644 --- a/Cxx11/nstream-stl.cc +++ b/Cxx11/nstream-stl.cc @@ -1,5 +1,6 @@ /// /// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -62,10 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" - -#include "boost/iterator/zip_iterator.hpp" -#include "boost/tuple/tuple.hpp" -#include "boost/tuple/tuple_comparison.hpp" +#include int main(int argc, char * argv[]) { @@ -111,7 +109,7 @@ int main(int argc, char * argv[]) std::vector B(length); std::vector C(length); - //auto range = prk::range(static_cast(0), length); + auto range = std::ranges::views::iota(static_cast(0), length); double scalar(3); @@ -125,7 +123,11 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); -#if 0 +#if 1 + std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + A[i] += B[i] + scalar * C[i]; + }); +#elif 0 // stupid version std::transform( std::begin(A), std::end(A), std::begin(B), std::begin(A), [](auto&& x, auto&& y) { diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h index 7d593b718..ba917c0c4 100644 --- a/Cxx11/prk_pstl.h +++ b/Cxx11/prk_pstl.h @@ -1,5 +1,6 @@ /// /// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -32,12 +33,15 @@ #ifndef PRK_PSTL_H #define PRK_PSTL_H +#include + #if defined(__GNUC__) && (__GNUC__ >= 9) # include # include # include -namespace exec = __pstl::execution; +//namespace exec = __pstl::execution; +namespace exec = std::execution; #elif defined(USE_LLVM_PSTL) diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h index eab958366..081cbd964 100644 --- a/Cxx11/prk_ranges.h +++ b/Cxx11/prk_ranges.h @@ -41,6 +41,7 @@ # include "range/v3/view/iota.hpp" # include "range/v3/view/slice.hpp" # include "range/v3/view/stride.hpp" +# include "range/v3/view/cartesian_product.hpp" #else # error You have not provided a version of ranges to use. 
#endif diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index d316ed14c..e5314fd81 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -69,10 +69,6 @@ #include "prk_simd.h" -#ifdef USE_RANGES -# include "prk_ranges.h" -#endif - // used in OpenMP target and CUDA code because std::min etc are not declare target #ifndef MIN #define MIN(x,y) ((x)<(y)?(x):(y)) diff --git a/Cxx11/stencil-pstl.cc b/Cxx11/stencil-pstl.cc index 56662260e..44f6d0169 100644 --- a/Cxx11/stencil-pstl.cc +++ b/Cxx11/stencil-pstl.cc @@ -1,6 +1,6 @@ - /// /// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -61,6 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_ranges.h" #include "prk_pstl.h" #include "stencil_pstl.hpp" diff --git a/Cxx11/stencil-ranges.cc b/Cxx11/stencil-ranges.cc index 55486a5c8..df8cc364e 100644 --- a/Cxx11/stencil-ranges.cc +++ b/Cxx11/stencil-ranges.cc @@ -61,9 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" - -#include "range/v3/view/cartesian_product.hpp" -#include "range/v3/view/stride.hpp" +#include "prk_ranges.h" #include "stencil_ranges.hpp" diff --git a/Cxx11/stencil-stl.cc b/Cxx11/stencil-stl.cc index b809ff68b..d1553d568 100644 --- a/Cxx11/stencil-stl.cc +++ b/Cxx11/stencil-stl.cc @@ -61,6 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_ranges.h" #include "stencil_stl.hpp" void nothing(const int n, const int t, std::vector & in, std::vector & out) diff --git a/Cxx11/transpose-pstl.cc b/Cxx11/transpose-pstl.cc index 75cdc5089..f9bfd7338 100644 --- a/Cxx11/transpose-pstl.cc +++ b/Cxx11/transpose-pstl.cc @@ -1,5 +1,6 @@ /// /// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -50,6 +51,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_ranges.h" #include "prk_pstl.h" int main(int argc, char * argv[]) diff --git a/Cxx11/transpose-ranges.cc b/Cxx11/transpose-ranges.cc index 9f6d5d5f0..d4e7a2226 100644 --- a/Cxx11/transpose-ranges.cc +++ b/Cxx11/transpose-ranges.cc @@ -54,9 +54,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" - -#include "range/v3/view/cartesian_product.hpp" -#include "range/v3/view/stride.hpp" +#include "prk_ranges.h" int main(int argc, char * argv[]) { diff --git a/Cxx11/transpose-stl.cc b/Cxx11/transpose-stl.cc index a87863aa2..65cc00365 100644 --- a/Cxx11/transpose-stl.cc +++ b/Cxx11/transpose-stl.cc @@ -50,6 +50,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_ranges.h" int main(int argc, char * argv[]) { From d8d12d421496d345ae9a29bdc44ee63c6229983a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 28 Dec 2021 17:03:02 +0200 Subject: [PATCH 196/325] fix a bunch of ranges stuff --- common/make.defs.gcc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 9e088b0c2..7c6d9b188 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -9,9 +9,9 @@ VERSION=-11 CC=gcc${VERSION} -std=c11 -pthread 
#EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. -FC=gfortran${VERSION} -std=f2008 -cpp -fexternal-blas -fblas-matmul-limit=0 +FC=gfortran${VERSION} -std=f2018 -cpp -fexternal-blas -fblas-matmul-limit=0 # C++11 may not be required but does no harm here. -CXX=g++${VERSION} -std=gnu++20 -pthread +CXX=g++${VERSION} -std=gnu++20 -pthread -fmax-errors=1 # # Compiler flags # @@ -135,6 +135,7 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include +#RANGEFLAG=-DUSE_GCC_RANGES PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} #PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/gcc From 1fa82aeaf3e7c39574322c9a8744e804039c42a3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 11:59:37 +0200 Subject: [PATCH 197/325] update this --- common/make.defs.nvhpc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 0eb9330d7..84388a6c1 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,7 +1,8 @@ # # This file shows the NVHPC toolchain options. -#NVHPC_PATH=/proj/nv/Linux_x86_64/21.3 -#NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_x86_64/2021 +#NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11 +#NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021 +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11 #NVHPC_CBIN=${NVHPC_PATH}/compilers/bin # # Base compilers and language options @@ -37,7 +38,7 @@ CUFORTFLAG+=-Minfo=accel # # OpenCL flags # -OPENCLDIR=/usr/local/cuda-11.2/targets/x86_64-linux +OPENCLDIR=/usr/local/cuda-11.4/targets/$$(uname -m)-linux OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces @@ -55,7 +56,7 @@ KOKKOSCXX=${KOKKOSDIR}/bin/nvcc_wrapper KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkoscore RAJADIR= RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} -THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/compilers/include-stdpar +THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar THRUSTFLAG=-I${THRUSTDIR} # # CBLAS for C++ DGEMM @@ -75,8 +76,8 @@ CUDAFLAGS+=--gpu-architecture=sm_75 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' -CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/11.2/targets/x86_64-linux/include -CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/11.2/targets/x86_64-linux/lib +CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/11.4/targets/$$(uname -m)-linux/include +CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/11.4/targets/$$(uname -m)-linux/lib # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 # heavy hammer: CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED @@ -111,7 +112,7 @@ MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi -#MPIINC=-I/usr/include/mpich-3.2-x86_64 +#MPIINC=-I/usr/include/mpich-3.2-$$(uname -m) #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi # # Global Arrays From 0b52d57d7089bf48e89eda89269a6a836c99996c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 
29 Dec 2021 12:13:13 +0200 Subject: [PATCH 198/325] conditionalize OpenMP tasking for PGI/NVHPC --- Cxx11/Makefile | 2 +- Cxx11/prk_openmp.h | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index bc3ccec5f..aec1971c9 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -82,7 +82,7 @@ vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream valarray: transpose-valarray nstream-valarray -openmp: p2p-hyperplane-openmp p2p-tasks-openmp stencil-openmp transpose-openmp nstream-openmp +openmp: p2p-hyperplane-openmp stencil-openmp transpose-openmp nstream-openmp p2p-tasks-openmp target: stencil-openmp-target transpose-openmp-target nstream-openmp-target diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h index 72e1e08d3..50c94eb6c 100644 --- a/Cxx11/prk_openmp.h +++ b/Cxx11/prk_openmp.h @@ -47,26 +47,35 @@ # if (_OPENMP >= 201300) || (__ibmxl_version__ >= 16) # define OMP_SIMD PRAGMA(omp simd) # define OMP_FOR_SIMD PRAGMA(omp for simd) -# define OMP_TASK(x) PRAGMA(omp task x) -# define OMP_TASKLOOP(x) PRAGMA(omp taskloop x ) -# if defined(__INTEL_COMPILER) -# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop x ) +// PGI/NVHPC compilers do not support OpenMP tasking/ordered +# if !( defined(__PGIC__) || defined(__PGI) || defined(__NVCOMPILER) ) +# define OMP_ORDERED(x) PRAGMA(omp ordered x) +# define OMP_TASK(x) PRAGMA(omp task x) +# define OMP_TASKLOOP(x) PRAGMA(omp taskloop x ) +# if defined(__INTEL_COMPILER) +# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop x ) +# else +# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop collapse(n) x ) +# endif +# define OMP_TASKWAIT PRAGMA(omp taskwait) # else -# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop collapse(n) x ) +# define OMP_ORDERED(x) +# define OMP_TASK(x) +# define OMP_TASKLOOP(x) +# define OMP_TASKLOOP_COLLAPSE(n,x) +# define OMP_TASKWAIT # endif -# define OMP_TASKWAIT PRAGMA(omp taskwait) -# define OMP_ORDERED(x) PRAGMA(omp ordered x) # define OMP_TARGET(x) PRAGMA(omp target x) # define OMP_DECLARE_TARGET PRAGMA(omp declare target) # define OMP_END_DECLARE_TARGET PRAGMA(omp end declare target) # else # define OMP_SIMD # define OMP_FOR_SIMD PRAGMA(omp for) +# define OMP_ORDERED(x) # define OMP_TASK(x) # define OMP_TASKLOOP(x) # define OMP_TASKLOOP_COLLAPSE(n,x) # define OMP_TASKWAIT -# define OMP_ORDERED(x) # define OMP_TARGET(x) # define OMP_DECLARE_TARGET # define OMP_END_DECLARE_TARGET From 789e4a408d9c5eefdcc8ca04596958b6f6f9a41d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 12:27:23 +0200 Subject: [PATCH 199/325] ranges update --- Cxx11/prk_ranges.h | 28 ++++++++++++++++++++++++++-- Cxx11/transpose-ranges.cc | 16 +++------------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h index 081cbd964..15c5627d9 100644 --- a/Cxx11/prk_ranges.h +++ b/Cxx11/prk_ranges.h @@ -37,6 +37,7 @@ # include #elif defined(USE_BOOST_IRANGE) # include "boost/range/irange.hpp" +# include "boost/hana/fwd/cartesian_product.hpp" #elif defined(USE_RANGES_TS) # include "range/v3/view/iota.hpp" # include "range/v3/view/slice.hpp" @@ -59,11 +60,11 @@ namespace prk { #endif } -#if UNUSED template auto range(S start, E end, B blocking) { #if defined(USE_GCC_RANGES) -#error FIXME +#warning This implementation does not support tiling! 
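Aside on [PATCH 198/325] above (Cxx11/prk_openmp.h): the point of routing every tasking pragma through a macro is that PGI/NVHPC builds degrade to plain serial loops instead of failing to compile. Below is a stripped-down sketch of the scheme, assuming the usual _Pragma stringification wrapper for PRAGMA; the real header carries more macros plus an OpenMP version check, and the loop body here is made up:

// --- illustrative sketch, not part of the patch ---
// Tasking pragmas expand to nothing on compilers that reject them
// (PGI/NVHPC) and to real pragmas everywhere else.
#define PRAGMA(x) _Pragma(#x)

#if defined(_OPENMP) && !(defined(__PGIC__) || defined(__PGI) || defined(__NVCOMPILER))
# define OMP_TASKLOOP(x) PRAGMA(omp taskloop x)
# define OMP_TASKWAIT    PRAGMA(omp taskwait)
#else
# define OMP_TASKLOOP(x)   /* no-op: tasking unsupported or OpenMP disabled */
# define OMP_TASKWAIT
#endif

void scale(int n, double * grid) {
  OMP_TASKLOOP( firstprivate(n) shared(grid) )
  for (int i = 0; i < n; ++i) {
    grid[i] *= 2.0;                         // illustrative body only
  }
  OMP_TASKWAIT
}
// --- end sketch ---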
+ return std::ranges::views::iota(static_cast(start), end); #elif defined(USE_BOOST_IRANGE) return boost::irange(static_cast(start), end, static_cast(blocking) ); #elif defined(USE_RANGES_TS) @@ -75,7 +76,30 @@ namespace prk { ranges::views::stride(static_cast(blocking)); #endif } + + template + auto range2(S start, E end) { + auto range1 = prk::range(start,end); +#if defined(USE_GCC_RANGES) + return std::ranges::views::iota(static_cast(start), end); +#elif defined(USE_BOOST_IRANGE) + return boost::hana::cartesian_product(range1,range1); +#elif defined(USE_RANGES_TS) + return ranges::views::cartesian_product(range1,range1); +#endif + } + + template + auto range2(S start, E end, B blocking) { + auto range1 = prk::range(start,end,blocking); +#if defined(USE_GCC_RANGES) + return std::ranges::views::iota(static_cast(start), end); +#elif defined(USE_BOOST_IRANGE) + return boost::hana::cartesian_product(range1,range1); +#elif defined(USE_RANGES_TS) + return ranges::views::cartesian_product(range1,range1); #endif + } } // namespace prk diff --git a/Cxx11/transpose-ranges.cc b/Cxx11/transpose-ranges.cc index d4e7a2226..f6e53b822 100644 --- a/Cxx11/transpose-ranges.cc +++ b/Cxx11/transpose-ranges.cc @@ -116,20 +116,10 @@ int main(int argc, char * argv[]) std::iota(A.begin(), A.end(), 0.0); // untiled - auto v = ranges::views::cartesian_product( - ranges::views::iota(0, order), - ranges::views::iota(0, order) - ); - + auto v = prk::range2(0,order); // tiled: s is the strided (outer) view and t is the tile (inner) view - auto s = ranges::views::cartesian_product( - ranges::stride_view(ranges::views::iota(0, order), tile_size), - ranges::stride_view(ranges::views::iota(0, order), tile_size) - ); - auto t = ranges::views::cartesian_product( - ranges::views::iota(0, tile_size), - ranges::views::iota(0, tile_size) - ); + auto s = prk::range2(0, order, tile_size); + auto t = prk::range2(0, tile_size); for (int iter = 0; iter<=iterations; iter++) { From bfb5dbaf4c35645247b62083b1b88e0c37538f26 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 12:29:44 +0200 Subject: [PATCH 200/325] ranges and tbb --- common/make.defs.nvhpc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 84388a6c1..591e006ed 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -46,9 +46,11 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL # # Parallel STL, Boost, etc. 
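Aside on [PATCH 199/325] above: under the range-v3 configuration (USE_RANGES_TS), prk::range2 is just a cartesian product of two iota views, so a doubly nested loop collapses to a single range-for over (i,j) tuples. A small sketch of that idiom, using range-v3 directly rather than the prk:: wrapper; the matrix size is illustrative and range-v3 is assumed to be on the include path:

// --- illustrative sketch, not part of the patch ---
#include <cstdio>
#include <vector>
#include "range/v3/view/cartesian_product.hpp"
#include "range/v3/view/iota.hpp"

int main() {
  const int order = 4;                       // illustrative size
  std::vector<double> A(order * order), B(order * order, 0.0);
  for (int k = 0; k < order * order; ++k) A[k] = k;

  // 2D index space: every (i,j) pair with 0 <= i,j < order
  auto v = ranges::views::cartesian_product(ranges::views::iota(0, order),
                                            ranges::views::iota(0, order));

  // untiled transpose-accumulate, in the style of transpose-ranges.cc
  for (auto ij : v) {
    auto i = std::get<0>(ij);
    auto j = std::get<1>(ij);
    B[i * order + j] += A[j * order + i];
  }

  std::printf("B[1] = %f\n", B[1]);          // expect A[order] = 4
  return 0;
}
// --- end sketch ---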
# +TBBFLAG=-L/usr/lib/$$(uname -m)-linux-gnu -ltbb BOOSTFLAG= #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include +#RANGEFLAG=-DUSE_GCC_RANGES PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} KOKKOSDIR=../deps/kokkos-cuda PRK_KOKKOS_BACKEND=Cuda From a41050ccd55141f7567905ee65e07585cabe0550 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 12:29:14 +0200 Subject: [PATCH 201/325] oneAPI ranges update --- common/make.defs.oneapi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index 16980b7d4..ec6421d24 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -76,8 +76,8 @@ TBBFLAG=-tbb # BOOSTFLAG= #RANGEFLAG=-DUSE_GCC_RANGES -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -I./pstl/stdlib -I./pstl/include KOKKOSDIR=/opt/kokkos/intel KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl From f64372f3b24fec381b264886f07fa71f29dd2039 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 18:16:08 +0200 Subject: [PATCH 202/325] mitigate the nonportability of omp requires etc --- C1z/nstream-alloc-target.c | 11 ++++++----- C1z/nstream-memcpy-target.c | 7 ++++--- C1z/nstream-ua-target.c | 8 ++++---- C1z/nstream-usm-target.c | 11 ++++++----- C1z/prk_openmp.h | 7 +++++++ 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/C1z/nstream-alloc-target.c b/C1z/nstream-alloc-target.c index 82bd89c47..d18aaa246 100644 --- a/C1z/nstream-alloc-target.c +++ b/C1z/nstream-alloc-target.c @@ -1,5 +1,6 @@ /// /// Copyright (c) 2019, Intel Corporation +/// Copyright (c) 2021, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -63,11 +64,11 @@ /// ////////////////////////////////////////////////////////////////////// -#pragma omp requires unified_address - #include "prk_util.h" #include "prk_openmp.h" +OMP_REQUIRES(unified_address) + int main(int argc, char * argv[]) { printf("Parallel Research Kernels version %d\n", PRKVERSION ); @@ -118,7 +119,7 @@ int main(int argc, char * argv[]) double scalar = 3.0; - #pragma omp target teams distribute parallel for simd schedule(static) device(device) is_device_ptr(A,B,C) + OMP_TARGET( teams distribute parallel for simd schedule(static) device(device) is_device_ptr(A,B,C) ) for (size_t i=0; i= 201811) +# define OMP_REQUIRES(x) PRAGMA(omp requires x) +# else +# define OMP_REQUIRES(x) +# endif #else # define OMP(x) # define OMP_PARALLEL(x) @@ -80,6 +86,7 @@ # define OMP_TASKWAIT # define OMP_ORDERED(x) # define OMP_TARGET(x) +# define OMP_REQUIRES(x) #endif #endif /* PRK_OPENMP_H */ From a4c997103fb3ed60f2463af5608aa45d03ed28f1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 29 Dec 2021 18:44:24 +0200 Subject: [PATCH 203/325] warn on No OMP 5 --- C1z/prk_openmp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/C1z/prk_openmp.h b/C1z/prk_openmp.h index 48e4941f6..5b3cb850e 100644 --- a/C1z/prk_openmp.h +++ b/C1z/prk_openmp.h @@ -69,6 +69,7 @@ # if (_OPENMP >= 201811) # define OMP_REQUIRES(x) PRAGMA(omp requires x) # else +# warning No OpenMP 5+ features! 
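Aside on [PATCH 202/325] above: OMP_REQUIRES only emits "omp requires unified_address" when the compiler reports OpenMP 5.0 (_OPENMP >= 201811), and OMP_TARGET hides the offload pragma entirely when OpenMP is off, so the same source still builds on older toolchains. A minimal sketch of the expansion, again assuming the usual _Pragma wrapper for PRAGMA; the kernel is illustrative and the device clause of the real code is omitted:

// --- illustrative sketch, not part of the patch ---
#include <cstddef>

#define PRAGMA(x) _Pragma(#x)

#if defined(_OPENMP) && (_OPENMP >= 201811)
# define OMP_REQUIRES(x) PRAGMA(omp requires x)
#else
# define OMP_REQUIRES(x)   /* no-op on pre-5.0 compilers */
#endif

#ifdef _OPENMP
# define OMP_TARGET(x) PRAGMA(omp target x)
#else
# define OMP_TARGET(x)
#endif

OMP_REQUIRES(unified_address)

void nstream(size_t n, double scalar, double * A, const double * B, const double * C)
{
  OMP_TARGET( teams distribute parallel for simd is_device_ptr(A,B,C) )
  for (size_t i = 0; i < n; ++i) {
    A[i] += B[i] + scalar * C[i];
  }
}
// --- end sketch ---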
# define OMP_REQUIRES(x) # endif #else From b5b422f1f8f1987d8f8720f412fee512b2ac0721 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 31 Dec 2021 17:53:42 +0200 Subject: [PATCH 204/325] fix make.defs* --- common/make.defs.gcc | 39 -------------------------------------- common/make.defs.intel | 29 +--------------------------- common/make.defs.llvm | 23 ---------------------- common/make.defs.upcxx-hpx | 9 +++++++++ 4 files changed, 10 insertions(+), 90 deletions(-) create mode 100644 common/make.defs.upcxx-hpx diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 7a179c356..7c6d9b188 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -56,16 +56,6 @@ METALFLAG=-framework MetalPerformanceShaders # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -<<<<<<< HEAD -#SYCLDIR=/opt/isycl -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-std=c++17 -O3 -# CodePlay ComputeCpp -#SYCLDIR=/opt/sycl/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -======= # #SYCLDIR=/opt/isycl #SYCLCXX=${SYCLDIR}/bin/clang++ @@ -87,7 +77,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLDIR=/opt/sycl/latest #SYCLCXX=${SYCLDIR}/bin/compute++ #SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp ->>>>>>> default #SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop @@ -107,8 +96,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # -<<<<<<< HEAD -======= # hipSYCL # SYCLDIR=/opt/hipSYCL @@ -124,7 +111,6 @@ CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime MPIINC=-I/usr/include/mpich-3.2-x86_64 MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi ->>>>>>> default # # OCCA # @@ -137,11 +123,7 @@ MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -<<<<<<< HEAD -TBBDIR=/usr/local/Cellar/tbb/2020_U0 -======= TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 ->>>>>>> default TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb @@ -149,19 +131,11 @@ TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. 
# #BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include -<<<<<<< HEAD -BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include -BOOSTFLAG+=-I${BOOSTROOT} -BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -======= #BOOSTFLAG=-I/usr/include/boost169 BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include #RANGEFLAG=-DUSE_GCC_RANGES ->>>>>>> default PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} #PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/gcc @@ -170,11 +144,8 @@ RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -<<<<<<< HEAD -======= EXECUTORSDIR=./libunifex EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include ->>>>>>> default # HPX is more complicated... HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx @@ -209,15 +180,6 @@ CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # # Halide # -<<<<<<< HEAD -HALIDECXX=c++ -HALIDEDIR=/opt/halide -HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide -#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -g3 -======= HALIDECXX=${CXX} HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux HALIDEFLAG=-I${HALIDEDIR}/include @@ -225,7 +187,6 @@ HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide #HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 HALIDEFLAG+=${DEFAULT_OPT_FLAGS} HALIDEFLAG+=-std=c++17 ->>>>>>> default # # ISPC # diff --git a/common/make.defs.intel b/common/make.defs.intel index 047c363ad..1abbb0c75 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -43,22 +43,10 @@ OFFLOADFLAG+=-DGPU_SCHEDULE="" # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -#OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -<<<<<<< HEAD -#SYCLDIR=/opt/isycl -#SYCLCXX=${SYCLDIR}/bin/clang++ -#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -#SYCLFLAG+=-std=c++17 -O3 -# CodePlay ComputeCpp -#SYCLDIR=/opt/sycl/latest -#SYCLCXX=${SYCLDIR}/bin/compute++ -#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -======= # #SYCLDIR=/opt/isycl #SYCLCXX=${SYCLDIR}/bin/clang++ @@ -81,7 +69,6 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLDIR=/opt/codeplay/latest #SYCLCXX=${SYCLDIR}/bin/compute++ #SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp ->>>>>>> default #SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop @@ -93,19 +80,11 @@ METALFLAG=-framework MetalPerformanceShaders #SYCLFLAG+=${OPENCLFLAG} # NVIDIA target #SYCLFLAG+=-sycl-target ptx64 -<<<<<<< HEAD -======= #SYCLFLAG+=-DPRK_NO_OPENCL_GPU ->>>>>>> default # # triSYCL # # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-<<<<<<< HEAD -#SYCLDIR=./triSYCL -#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL -======= SYCLDIR=./triSYCL SYCLCXX=${CXX} ${OPENMPFLAG} SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include @@ -123,7 +102,6 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include #CELERITYDIR=${SYCLDIR} #CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor #CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime ->>>>>>> default # # OCCA # @@ -131,7 +109,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # Cilk # -#CILKFLAG=-intel-extensions # default +CILKFLAG=-intel-extensions # default # # TBB # @@ -150,11 +128,6 @@ RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -# HPX is more complicated... -HWLOCFLAG=-I/usr/local/include -HPXDIR=./hpx -HPXCXX=${HPXDIR}/bin/hpxcxx -HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 502331404..730e1fa08 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -128,11 +128,6 @@ SYCLFLAG+=${OPENCLFLAG} # triSYCL # # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... -<<<<<<< HEAD -SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL -======= #SYCLDIR=./triSYCL #SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) #SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL @@ -155,7 +150,6 @@ SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL CELERITYDIR=${SYCLDIR} CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime ->>>>>>> default # # OCCA # @@ -164,26 +158,13 @@ CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -<<<<<<< HEAD -TBBDIR=/usr/local/Cellar/tbb/2020_U0 -======= TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 ->>>>>>> default TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. 
# -<<<<<<< HEAD -#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include -BOOSTROOT=/usr/local/Cellar/boost/1.72.0/include -BOOSTFLAG+=-I${BOOSTROOT} -BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 -#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -Wno-\#pragma-messages -DUSE_INTEL_PSTL -I./pstl/include -======= #BOOSTFLAG=-I/usr/local/Cellar/boost/1.72.0/include # old Homebrew #BOOSTFLAG=-I/usr/include/boost169 # Linux BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_2/include # new Homebrew @@ -196,7 +177,6 @@ SYCLFLAG+=${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} PSTLFLAG+=-I./llvm-pstl/include -DLLVM_PSTL ->>>>>>> default KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl KOKKOSFLAG+=${OPENMPFLAG} @@ -205,11 +185,8 @@ RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/opt/nvidia/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} -<<<<<<< HEAD -======= EXECUTORSDIR=./libunifex EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include ->>>>>>> default # HPX is more complicated... HWLOCFLAG=-I/usr/local/include HPXDIR=./hpx diff --git a/common/make.defs.upcxx-hpx b/common/make.defs.upcxx-hpx new file mode 100644 index 000000000..a30623ad5 --- /dev/null +++ b/common/make.defs.upcxx-hpx @@ -0,0 +1,9 @@ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math + +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} From 15e798548b8aa9f07faadb2b9df2002f1de1f898 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Jan 2022 11:30:44 +0200 Subject: [PATCH 205/325] fix error in OpenMP computation of norm thanks to @mbycklin for this contribution. --- FORTRAN/stencil-taskloop-openmp.F90 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/FORTRAN/stencil-taskloop-openmp.F90 b/FORTRAN/stencil-taskloop-openmp.F90 index 2410c5555..797c22363 100644 --- a/FORTRAN/stencil-taskloop-openmp.F90 +++ b/FORTRAN/stencil-taskloop-openmp.F90 @@ -1,5 +1,5 @@ ! -! Copyright (c) 2013, Intel Corporation +! Copyright (c) 2022, Intel Corporation ! Copyright (c) 2021, NVIDIA ! ! Redistribution and use in source and binary forms, with or without @@ -216,7 +216,7 @@ program main call initialize_w(is_star,r,W) !$omp parallel default(none) & - !$omp& shared(n,A,B,W,stencil_time,norm,iterations,tiling,tile_size,is_star) & + !$omp& shared(n,A,B,W,stencil_time,iterations,tiling,tile_size,is_star) & !$omp& private(i,j,k,t0,t1) !$omp master @@ -260,17 +260,17 @@ program main stencil_time = t1 - t0 !$omp end master + !$omp end parallel ! 
compute L1 norm in parallel norm = 0.0d0 - !$omp do reduction(+:norm) + !$omp parallel do reduction(+:norm) do j=r,n-r do i=r,n-r norm = norm + abs(B(i,j)) enddo enddo - !$omp end do - !$omp end parallel + !$omp end parallel do active_points = int(n-2*r,INT64)**2 norm = norm / real(active_points,REAL64) From 285677d201e1d04ccb68eda8df07eb86b52d549c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Jan 2022 12:15:37 +0200 Subject: [PATCH 206/325] factor out PRK MPI module utility --- FORTRAN/Makefile | 11 +++++-- FORTRAN/prk_mpi.F90 | 60 +++++++++++++++++++++++++++++++++++ FORTRAN/transpose-a2a-mpi.F90 | 29 ----------------- FORTRAN/transpose-acc-mpi.F90 | 29 ----------------- FORTRAN/transpose-get-mpi.F90 | 29 ----------------- FORTRAN/transpose-p2p-mpi.F90 | 29 ----------------- 6 files changed, 68 insertions(+), 119 deletions(-) create mode 100644 FORTRAN/prk_mpi.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4a1315edb..54e0b11ae 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -96,6 +96,9 @@ blas: dgemm-blas prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o +prk_mpi.mod prk_mpi_mod.o: prk_mpi.F90 + $(FC) $(FCFLAGS) -c $< -o prk_mpi_mod.o + stencil: stencil.F90 prk.mod $(FC) $(FCFLAGS) -c stencil_serial.F90 $(FC) $(FCFLAGS) stencil.F90 stencil_serial.o prk_mod.o -o $@ @@ -119,10 +122,10 @@ dgemm-blas: dgemm-blas.F90 prk.mod $(MPIFORT) $(FCFLAGS) $< prk_mod.o $(GAFLAG) -o $@ %-mpi-openmp: %-mpi.F90 prk.mod - $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o -o $@ + $(MPIFORT) $(FCFLAGS) $(OPENMPFLAG) $< prk_mod.o prk_mpi_mod.o -o $@ -%-mpi: %-mpi.F90 prk.mod - $(MPIFORT) $(FCFLAGS) $< prk_mod.o -o $@ +%-mpi: %-mpi.F90 prk.mod prk_mpi.mod + $(MPIFORT) $(FCFLAGS) $< prk_mod.o prk_mpi_mod.o -o $@ %-coarray: %-coarray.F90 prk.mod $(CAFC) $(FCFLAGS) $< prk_mod.o $(COARRAYFLAG) -o $@ @@ -142,6 +145,8 @@ dgemm-blas: dgemm-blas.F90 prk.mod clean: -rm -f prk.mod -rm -f prk.f18.mod + -rm -f prk_mpi.mod + -rm -f prk_mpi.f18.mod -rm -f *.o -rm -f *.i90 -rm -f *.dbg diff --git a/FORTRAN/prk_mpi.F90 b/FORTRAN/prk_mpi.F90 new file mode 100644 index 000000000..f1508f450 --- /dev/null +++ b/FORTRAN/prk_mpi.F90 @@ -0,0 +1,60 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! 
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +module prk_mpi + contains + subroutine mpi_print_matrix(mat,clabel) + use, intrinsic :: iso_fortran_env + use mpi_f08 + use prk + implicit none + real(kind=REAL64), intent(in) :: mat(:,:) + character(*), intent(in), optional :: clabel + integer(kind=INT32) :: r, me, np + flush(6) + call MPI_Comm_rank(MPI_COMM_WORLD, me) + call MPI_Comm_size(MPI_COMM_WORLD, np) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + if (me.eq.0) print*,clabel + flush(6) + call MPI_Barrier(MPI_COMM_WORLD) + flush(6) + do r=0,np-1 + if (me.eq.r) then + call print_matrix(mat,me) + endif + call MPI_Barrier(MPI_COMM_WORLD) + enddo + flush(6) + end subroutine +end module prk_mpi diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c121b037a..a57615201 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use mpi_f08 diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 9023a006f..6e2904a47 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use, intrinsic :: iso_c_binding diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index b153117ca..ecd6ed18d 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! 
******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use, intrinsic :: iso_c_binding diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 3d72cb36c..1ae8dbc9a 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -53,35 +53,6 @@ ! MPI by Jeff Hammond, November 2021 ! ******************************************************************* -module prk_mpi - contains - subroutine mpi_print_matrix(mat,clabel) - use, intrinsic :: iso_fortran_env - use mpi_f08 - use prk - implicit none - real(kind=REAL64), intent(in) :: mat(:,:) - character(*), intent(in), optional :: clabel - integer(kind=INT32) :: r, me, np - flush(6) - call MPI_Comm_rank(MPI_COMM_WORLD, me) - call MPI_Comm_size(MPI_COMM_WORLD, np) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - if (me.eq.0) print*,clabel - flush(6) - call MPI_Barrier(MPI_COMM_WORLD) - flush(6) - do r=0,np-1 - if (me.eq.r) then - call print_matrix(mat,me) - endif - call MPI_Barrier(MPI_COMM_WORLD) - enddo - flush(6) - end subroutine -end module prk_mpi - program main use, intrinsic :: iso_fortran_env use mpi_f08 From a640d7add111bbcc05341cd4e162dd7080e4244d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 5 Jan 2022 07:31:12 -0800 Subject: [PATCH 207/325] extra targets --- FORTRAN/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 3cd82fe25..bcc68182a 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -38,7 +38,7 @@ ifeq ($(findstring ifx,$(FC)),ifx) endif # GCC (also matches pgfortran so PGI must come after) ifeq ($(findstring gfortran,$(FC)),gfortran) - EXTRA = target coarray taskloop openacc + EXTRA = target coarray taskloop openacc blas endif # PGI and LLVM Flang ifeq ($(findstring flang,$(FC)),flang) @@ -50,7 +50,7 @@ ifeq ($(findstring pgf,$(FC)),pgf) FCFLAGS += -DPGI endif ifeq ($(findstring nvf,$(FC)),nvf) - EXTRA = target openacc cufortran + EXTRA = target openacc cufortran stdpar blas FCFLAGS += -DNVHPC endif ifeq ($(findstring xlf,$(FC)),xlf) From fedfc5252c9f63ee27e5d42589e07c2555c1252b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 6 Jan 2022 00:53:35 -0800 Subject: [PATCH 208/325] updates --- common/make.defs.nvhpc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 0eb9330d7..ea843e9af 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,8 +1,7 @@ # # This file shows the NVHPC toolchain options. 
-#NVHPC_PATH=/proj/nv/Linux_x86_64/21.3 -#NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_x86_64/2021 -#NVHPC_CBIN=${NVHPC_PATH}/compilers/bin +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11 +NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/ # # Base compilers and language options # @@ -29,7 +28,7 @@ OFFLOADFLAG+=-DGPU_SCHEDULE="schedule(static,1)" OPENACCFLAG=-acc -target=gpu OPENACCFLAG+=-Mlarge_arrays OPENACCFLAG+=-Minfo=accel -STDPARFLAG=-stdpar -gpu=managed +STDPARFLAG=-stdpar=gpu -gpu=managed STDPARFLAG+=-Minfo=accel STDPARFLAG+=-cudalib=cublas,cutensor CUFORTFLAG=-cuda -gpu=managed -acc # ACC required for CUF+managed @@ -37,7 +36,7 @@ CUFORTFLAG+=-Minfo=accel # # OpenCL flags # -OPENCLDIR=/usr/local/cuda-11.2/targets/x86_64-linux +OPENCLDIR=/usr/local/cuda/targets/$$(uname -m)-linux OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces @@ -55,7 +54,7 @@ KOKKOSCXX=${KOKKOSDIR}/bin/nvcc_wrapper KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkoscore RAJADIR= RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} -THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_x86_64/21.3/compilers/include-stdpar +THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.3/compilers/include-stdpar THRUSTFLAG=-I${THRUSTDIR} # # CBLAS for C++ DGEMM @@ -75,8 +74,8 @@ CUDAFLAGS+=--gpu-architecture=sm_75 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' -CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/11.2/targets/x86_64-linux/include -CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/11.2/targets/x86_64-linux/lib +CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/11.5/targets/$$(uname -m)-linux/include +CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/11.5/targets/$$(uname -m)-linux/lib # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 # heavy hammer: CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED @@ -111,7 +110,7 @@ MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi -#MPIINC=-I/usr/include/mpich-3.2-x86_64 +#MPIINC=-I/usr/include/mpich-3.2-$$(uname -m) #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi # # Global Arrays From 8cea8e843c9c0f7a5ab8918748e94886ae1dd875 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 2 Mar 2022 14:35:44 -0600 Subject: [PATCH 209/325] Conditionally use deprecated feature vs. 
SYCL 2020 --- Cxx11/prk_sycl.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h index f8dc0f024..8d37e489d 100644 --- a/Cxx11/prk_sycl.h +++ b/Cxx11/prk_sycl.h @@ -8,6 +8,15 @@ namespace sycl = cl::sycl; +#if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) && defined(__LIBSYCL_PATCH_VERSION) +# define __LIBSYCL_VERSION \ + (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 + __LIBSYCL_PATCH_VERSION) +#else +# define __LIBSYCL_VERSION 0 +#endif + +#define _PRK_SYCL2020_FEATURES (__LIBSYCL_VERSION >= 50300) + //#ifdef __COMPUTECPP //#include //namespace syclx = cl::sycl::experimental; @@ -64,7 +73,11 @@ namespace prk { return true; #else auto device = q.get_device(); +#if _PRK_SYCL2020_FEATURES + return device.has(sycl::aspect::fp64); +#else return device.has_extension(sycl::string_class("cl_khr_fp64")); +#endif #endif } From a697425d03c373da0f0e9b754878c8eba20a89d7 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 2 Mar 2022 14:47:19 -0600 Subject: [PATCH 210/325] Remove deprecated global offset parameter for stencils (SYCL2020) --- Cxx11/stencil_sycl.hpp | 81 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index de3cde61b..024e796c4 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,3 +1,4 @@ + // declare the kernel name used in SYCL parallel_for template class star1_1d; @@ -7,9 +8,9 @@ void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) { + const auto i = it[0] + 1; + const auto j = it[1] + 1; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.5) +in[i*n+(j-1)] * static_cast(-0.5) +in[(i+1)*n+j] * static_cast(0.5) @@ -29,8 +30,8 @@ void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf auto out = d_out.template get_access(h); sycl::id<2> dx1(sycl::range<2> {1,0}); sycl::id<2> dy1(sycl::range<2> {0,1}); - h.parallel_for>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) { - sycl::id<2> xy = it.get_id(); + h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id() + sycl::id<2> {1,1}; out[xy] += +in[xy+dx1] * static_cast(0.5) +in[xy-dx1] * static_cast(-0.5) +in[xy+dy1] * static_cast(0.5) @@ -46,9 +47,9 @@ template void star1(sycl::queue & q, const size_t n, const T * in, T * out) { q.submit([&](sycl::handler& h) { - h.parallel_for>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) { + const auto i = it[0] + 1; + const auto j = it[1] + 1; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.5) +in[i*n+(j-1)] * static_cast(-0.5) +in[(i+1)*n+j] * static_cast(0.5) @@ -66,9 +67,9 @@ void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-2,n-2}, [=] 
(sycl::item<2> it) { + const auto i = it[0] + 2; + const auto j = it[1] + 2; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.25) +in[i*n+(j-1)] * static_cast(-0.25) +in[(i+1)*n+j] * static_cast(0.25) @@ -94,8 +95,8 @@ void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy1(sycl::range<2> {0,1}); sycl::id<2> dx2(sycl::range<2> {2,0}); sycl::id<2> dy2(sycl::range<2> {0,2}); - h.parallel_for>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) { - sycl::id<2> xy = it.get_id(); + h.parallel_for>(sycl::range<2> {n-2,n-2}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id() + sycl::id<2> {2,2}; out[xy] += +in[xy+dx1] * static_cast(0.25) +in[xy-dx1] * static_cast(-0.25) +in[xy+dy1] * static_cast(0.25) @@ -115,9 +116,9 @@ template void star2(sycl::queue & q, const size_t n, const T * in, T * out) { q.submit([&](sycl::handler& h) { - h.parallel_for>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-2,n-2}, [=] (sycl::item<2> it) { + const auto i = it[0] + 2; + const auto j = it[1] + 2; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.25) +in[i*n+(j-1)] * static_cast(-0.25) +in[(i+1)*n+j] * static_cast(0.25) @@ -139,9 +140,9 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { + const auto i = it[0] + 3; + const auto j = it[1] + 3; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) +in[i*n+(j-1)] * static_cast(-0.166666666667) +in[(i+1)*n+j] * static_cast(0.166666666667) @@ -173,8 +174,8 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy2(sycl::range<2> {0,2}); sycl::id<2> dx3(sycl::range<2> {3,0}); sycl::id<2> dy3(sycl::range<2> {0,3}); - h.parallel_for>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) { - sycl::id<2> xy = it.get_id(); + h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id() + sycl::id<2> {3,3}; out[xy] += +in[xy+dx1] * static_cast(0.166666666667) +in[xy-dx1] * static_cast(-0.166666666667) +in[xy+dy1] * static_cast(0.166666666667) @@ -198,9 +199,9 @@ template void star3(sycl::queue & q, const size_t n, const T * in, T * out) { q.submit([&](sycl::handler& h) { - h.parallel_for>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { + const auto i = it[0] + 3; + const auto j = it[1] + 3; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) +in[i*n+(j-1)] * static_cast(-0.166666666667) +in[(i+1)*n+j] * static_cast(0.166666666667) @@ -226,9 +227,9 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-4,n-4}, [=] (sycl::item<2> it) { + const auto i = it[0] + 4; + const auto j = it[1] + 4; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.125) 
+in[i*n+(j-1)] * static_cast(-0.125) +in[(i+1)*n+j] * static_cast(0.125) @@ -266,8 +267,8 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy3(sycl::range<2> {0,3}); sycl::id<2> dx4(sycl::range<2> {4,0}); sycl::id<2> dy4(sycl::range<2> {0,4}); - h.parallel_for>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) { - sycl::id<2> xy = it.get_id(); + h.parallel_for>(sycl::range<2> {n-4,n-4}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id() + sycl::id<2> {4,4}; out[xy] += +in[xy+dx1] * static_cast(0.125) +in[xy-dx1] * static_cast(-0.125) +in[xy+dy1] * static_cast(0.125) @@ -295,9 +296,9 @@ template void star4(sycl::queue & q, const size_t n, const T * in, T * out) { q.submit([&](sycl::handler& h) { - h.parallel_for>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-4,n-4}, [=] (sycl::item<2> it) { + const auto i = it[0] + 4; + const auto j = it[1] + 4; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.125) +in[i*n+(j-1)] * static_cast(-0.125) +in[(i+1)*n+j] * static_cast(0.125) @@ -327,9 +328,9 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-5,n-5}, [=] (sycl::item<2> it) { + const auto i = it[0] + 5; + const auto j = it[1] + 5; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.1) +in[i*n+(j-1)] * static_cast(-0.1) +in[(i+1)*n+j] * static_cast(0.1) @@ -373,8 +374,8 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy4(sycl::range<2> {0,4}); sycl::id<2> dx5(sycl::range<2> {5,0}); sycl::id<2> dy5(sycl::range<2> {0,5}); - h.parallel_for>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) { - sycl::id<2> xy = it.get_id(); + h.parallel_for>(sycl::range<2> {n-5,n-5}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id() + sycl::id<2> {5,5}; out[xy] += +in[xy+dx1] * static_cast(0.1) +in[xy-dx1] * static_cast(-0.1) +in[xy+dy1] * static_cast(0.1) @@ -406,9 +407,9 @@ template void star5(sycl::queue & q, const size_t n, const T * in, T * out) { q.submit([&](sycl::handler& h) { - h.parallel_for>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) { - const auto i = it[0]; - const auto j = it[1]; + h.parallel_for>(sycl::range<2> {n-5,n-5}, [=] (sycl::item<2> it) { + const auto i = it[0] + 5; + const auto j = it[1] + 5; out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.1) +in[i*n+(j-1)] * static_cast(-0.1) +in[(i+1)*n+j] * static_cast(0.1) From cd18db43d2f8e01303ca87e181d008dc91dc3178 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 22 Apr 2022 03:06:06 -0700 Subject: [PATCH 211/325] ARM compiler and BLAS --- common/make.defs.arm | 283 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 common/make.defs.arm diff --git a/common/make.defs.arm b/common/make.defs.arm new file mode 100644 index 000000000..1862969b9 --- /dev/null +++ b/common/make.defs.arm @@ -0,0 +1,283 @@ +# +# This file shows the LLVM toolchain options for PRKs using +# OpenMP, MPI and/or Fortran coarrays only. 
+# +# Base compilers and language options +# +#LLVM_ROOT=/usr/local/Cellar/llvm/9.0.0 +#LLVM_PATH=${LLVM_ROOT}/bin/ +LLVM_PATH=/opt/arm/22.0.1/arm-linux-compiler-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/llvm-bin/ +#CLANG_VERSION=-9 +# C99 is required in some implementations. +CC=${LLVM_PATH}clang${CLANG_VERSION} -std=c11 -pthread +# All of the Fortran code is written for the 2008 standard and requires preprocessing. +#FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Mfreeform -L/opt/llvm/pgi-flang/lib -Wl,-rpath=/opt/llvm/pgi-flang/lib +FC=${LLVM_PATH}flang -Mpreprocess -Mfreeform -DPGI +# C++11 may not be required but does no harm here. +CXX=${LLVM_PATH}clang++${CLANG_VERSION} -std=c++2a -pthread +# +# Compiler flags +# +# -mtune=native is appropriate for most cases. +# -march=native is appropriate if you want portable binaries. +# +DEFAULT_OPT_FLAGS=-g -O3 -ffast-math +#DEFAULT_OPT_FLAGS+=-mllvm -polly -mllvm -polly-vectorizer=stripmine +# +# If you want to be specific, get the architecture options from: +# ${LLVM_PATH}llc --version +# and then get the CPU/ISA options from (e.g. for x86-64): +# ${LLVM_PATH}llc -march=x86-64 -mcpu=help +# +# These are useful to understand why the compiler does not vectorize loops: +# DEFAULT_OPT_FLAGS+=-Rpass-analysis=loop-vectorize +# DEFAULT_OPT_FLAGS+=-Rpass=loop-vectorize +#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed +DEFAULT_OPT_FLAGS+=-Wall #-Werror +#DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations +#DEFAULT_OPT_FLAGS+=-mavx -mfma +# +# OpenMP flags +# +OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd +OFFLOADFLAG=-fopenmp +OFFLOADFLAG+=-DGPU_SCHEDULE="" +OPENACCFLAG=-fopenacc +# Klondike weirdness +# OPENMPFLAG+=-L/opt/intel/compilers_and_libraries_2018.0.082/linux/compiler/lib/intel64_lin -liomp5 +# Mac weirdness +#OPENMPFLAG+=-L${LLVM_ROOT}/lib +# LLVM +# OPENMPFLAG+=-L/opt/llvm/4.0.0/lib -lomp +# +# OpenCL flags +# +# MacOS +#OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... 
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL +# Linux +#OPENCLDIR=/etc/alternatives +#OPENCLDIR=/etc/alternatives/opencl-intel-tools +#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +#OPENCLFLAG+=-Wno-ignored-attributes +#OPENCLFLAG+=-Wno-deprecated-declarations +#OPENCLFLAG+=-Wno-missing-braces +# oneAPI +#OPENCLDIR=/opt/intel/oneapi/compiler/latest/linux +#OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL +OPENCLFLAG+=-DCL_TARGET_OPENCL_VERSION=220 +# +# Metal (MacOS-only, unused) +# +#METALFLAG=-framework MetalPerformanceShaders +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# SYCL flags +# +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +# +#SYCLDIR=/opt/isycl +#SYCLDIR=${HOME}/ISYCL/llvm/build +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-std=c++17 -O3 +#SYCLFLAG+=-fsycl -fsycl-unnamed-lambda +#SYCLFLAG+=-L${SYCLDIR}/lib -lsycl -Wl,-rpath=${SYCLDIR}/lib +# +# Intel oneAPI +# +#SYCLCXX=dpcpp +#SYCLFLAG=-fsycl +#SYCLFLAG+=-std=c++17 -O3 +#SYCLFLAG+=--gcc-toolchain=/opt/rh/devtoolset-7/root/usr +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=1 +#SYCLFLAG+=-stdlib=c++ +# +# CodePlay ComputeCpp +# +#SYCLDIR=/opt/sycl/latest +SYCLDIR=/opt/codeplay/latest +SYCLCXX=${SYCLDIR}/bin/compute++ +SYCLFLAG=-std=c++17 -O3 +SYCLFLAG+=-sycl-driver +SYCLFLAG+=-Wsycl-pedantic +# USM-related +SYCLFLAG+=-DSYCL_LANGUAGE_VERSION=2020 +SYCLFLAG+=-fno-sycl-address-space +# This makes a huge difference in e.g. nstream... +SYCLFLAG+=-no-serial-memop +# SPIR target +SYCLFLAG+=-sycl-target spir64 +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +#SYCLFLAG+=-DPRK_NO_OPENCL_GPU +SYCLFLAG+=-I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +# If not found automatically +SYCLFLAG+=${OPENCLFLAG} +# These are only necessary on systems with very old GCC as the default +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# +# triSYCL +# +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL +# +# hipSYCL +# +#SYCLDIR=/opt/hipsycl/usr/local # if installed with DESTDIR +#SYCLDIR=/opt/hipSYCL +#SYCLDIR=/opt/spack/spack/opt/spack/linux-ubuntu18.04-haswell/gcc-8.3.0/hipsycl-master-appurj662qod4y4z5zxipr2fwthl66k7 +#SYCLCXX=${SYCLDIR}/bin/syclcc-clang +#SYCLFLAG=-std=c++17 -O3 +#SYCLFLAG+=-DHIPSYCL +# CPU platform +#SYCLFLAG+=--hipsycl-platform=cpu +#SYCLFLAG+=--hipsycl-platform=cuda +#SYCLFLAG+=--hipsycl-gpu-arch=sm_60 +#SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib # wrong? +#SYCLFLAG+=-Wl,-rpath=${SYCLDIR}/lib +# +CELERITYDIR=${SYCLDIR} +CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor +CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# TBB +# +#TBBDIR=/usr/lib/x86_64-linux-gnu +TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 +TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +#TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb +#TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb +# +# Parallel STL, Boost, etc. 
+# +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.72.0/include # old Homebrew +#BOOSTFLAG=-I/usr/include/boost169 # Linux +BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_2/include # new Homebrew +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 + +# triSYCL requires Boost +SYCLFLAG+=${BOOSTFLAG} + +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} +PSTLFLAG+=-I./llvm-pstl/include -DLLVM_PSTL +KOKKOSDIR=/opt/kokkos/clang +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl +KOKKOSFLAG+=${OPENMPFLAG} +#KOKKOSFLAG+=-DPRK_KOKKOS_BACKEND=Threads # Mac Clang does not support OpenMP +RAJADIR=/opt/raja/clang +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/opt/nvidia/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +EXECUTORSDIR=./libunifex +EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include +# HPX is more complicated... +HWLOCFLAG=-I/usr/local/include +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math +# +# CBLAS for C++ DGEMM +# +#BLASFLAG=-DACCELERATE -framework Accelerate +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +BLASPATH=/opt/arm/22.0.1/armpl-22.0.1_AArch64_Ubuntu-20.04_arm-linux-compiler_aarch64-linux/lib +BLASFLAG=-L${BLASPATH} -Wl,-rpath=${BLASPATH} -larmpl_ilp64_mp +CBLASFLAG=-L${BLASPATH} -Wl,-rpath=${BLASPATH} -larmpl_lp64_mp +CBLASFLAG+=-I/opt/arm/22.0.1/armpl-22.0.1_AArch64_Ubuntu-20.04_arm-linux-compiler_aarch64-linux/include_lp64 +CBLASFLAG+=-L/opt/arm/22.0.1/arm-linux-compiler-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib -larmflang +CBLASFLAG+=-Wl,-rpath=/opt/arm/22.0.1/arm-linux-compiler-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib +#/opt/arm/22.0.1/arm-linux-compiler-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib/libarmflang.so +# +# CUDA flags +# +# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander +NVCC=/opt/llvm/cocl/bin/cocl +# Linux w/ NVIDIA CUDA +#NVCC=nvcc -arch=sm_50 +CUDAFLAGS=-g -O3 -std=c++11 +CUDAFLAGS+=-arch=sm_50 +# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 +# +# ISPC +# +ISPC=ispc +ISPCFLAG=-O3 --target=host --opt=fast-math +# +# MPI-3 +# +MPIDIR=/usr +MPICC=${MPIDIR}/bin/mpicc +MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort +MPIINC=-I${MPIDIR}/include +MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi +#MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi +#MPIINC=-I/usr/include/mpich-3.2-x86_64 +#MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +# +# Global Arrays +# +GADIR=../deps/ga +GAFLAG=-I${GADIR}/include +GAFLAG+=-L${GADIR}/lib -lga +GAFLAG+=-L${GADIR}/../armci-mpi/lib -larmci # ARMCI-MPI +#GAFLAG+=-L${GADIR}/lib -larmci -lcomex # ARMCI/ComEx +GAFLAG+=${MPIINC} ${MPILIB} +GAFLAG+=-lmpifort -lmpi +GAFLAG+=${BLASFLAG} +GAFLAG+=-fdefault-integer-8 # GA is compiled with 64b integers on 64-bit systems +# +# PETSc +# +PETSCDIR=../deps/petsc +PETSCFLAG=-I${PETSCDIR}/include +PETSCFLAG+=-L${PETSCDIR}/lib -lpetsc +PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib +# +# 
Fortran 2008 coarrays +# +# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details +# single-node +COARRAYFLAG=-fcoarray=single -lcaf_single +# multi-node +#COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +#COARRAYFLAG=-fcoarray=lib -L/usr/lib/x86_64-linux-gnu/open-coarrays/mpich/lib -lcaf_mpi +# +# MEMKIND (used in C1z) +# +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib From 8ae525acad099f3672b35754e0e683d708aa9bda Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 22 Apr 2022 03:10:10 -0700 Subject: [PATCH 212/325] LP64 for Fortran --- common/make.defs.arm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.arm b/common/make.defs.arm index 1862969b9..bfeb82966 100644 --- a/common/make.defs.arm +++ b/common/make.defs.arm @@ -204,7 +204,7 @@ UPCXXFLAG+=-mtune=native -ffast-math #BLASFLAG=-DACCELERATE -framework Accelerate #CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions BLASPATH=/opt/arm/22.0.1/armpl-22.0.1_AArch64_Ubuntu-20.04_arm-linux-compiler_aarch64-linux/lib -BLASFLAG=-L${BLASPATH} -Wl,-rpath=${BLASPATH} -larmpl_ilp64_mp +BLASFLAG=-L${BLASPATH} -Wl,-rpath=${BLASPATH} -larmpl_lp64_mp CBLASFLAG=-L${BLASPATH} -Wl,-rpath=${BLASPATH} -larmpl_lp64_mp CBLASFLAG+=-I/opt/arm/22.0.1/armpl-22.0.1_AArch64_Ubuntu-20.04_arm-linux-compiler_aarch64-linux/include_lp64 CBLASFLAG+=-L/opt/arm/22.0.1/arm-linux-compiler-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib -larmflang From 4ffe012670028752bddc13eebc488c6521f4d848 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 22 Apr 2022 04:08:35 -0700 Subject: [PATCH 213/325] ARM GCC + PL --- common/make.defs.armgcc | 246 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 common/make.defs.armgcc diff --git a/common/make.defs.armgcc b/common/make.defs.armgcc new file mode 100644 index 000000000..ab74cf813 --- /dev/null +++ b/common/make.defs.armgcc @@ -0,0 +1,246 @@ +# +# This file shows the GCC toolchain options for PRKs using +# OpenMP, MPI and/or Fortran coarrays only. +# +# Base compilers and language options +# +GCC_ROOT=/opt/arm/22.0.1/gcc-11.2.0_Generic-AArch64_Ubuntu-20.04_aarch64-linux +GCC_PATH=${GCC_ROOT}/bin/ +#GCC_VERSION=-11 +# C99 is required in some implementations. +CC=${GCC_PATH}gcc${GCC_VERSION} -std=c11 -pthread +# All of the Fortran code is written for the 2008 standard and requires preprocessing. +FC=${GCC_PATH}gfortran${GCC_VERSION} -std=f2018 -cpp -fexternal-blas -fblas-matmul-limit=0 +# C++11 may not be required but does no harm here. +CXX=${GCC_PATH}g++${GCC_VERSION} -std=gnu++20 -pthread -fmax-errors=1 +# +# Compiler flags +# +# -mtune=native is appropriate for most cases. +# -march=native is appropriate if you want portable binaries. +DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math +#DEFAULT_OPT_FLAGS=-O0 +DEFAULT_OPT_FLAGS+=-g3 +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak +#DEFAULT_OPT_FLAGS+=-fsanitize=address +#DEFAULT_OPT_FLAGS+=-fsanitize=thread +# If you are compiling for KNL on a Xeon login node, use the following: +# DEFAULT_OPT_FLAGS=-g -O3 -march=knl +# See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details. 
+# +#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed +DEFAULT_OPT_FLAGS+=-Wall #-Werror +#DEFAULT_OPT_FLAGS+=-mavx -mfma # these should be used on Haswell and later +# +# OpenMP flags +# +OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd +OFFLOADFLAG=-foffload="-O3 -v" +OFFLOADFLAG+=-DGPU_SCHEDULE="" +OPENACCFLAG=-fopenacc + +OPENMPFLAG+=-L${GCC_ROOT}/lib64 -lgomp -Wl,-rpath=${GCC_ROOT}/lib64 +# +# OpenCL flags +# +# MacOS +#OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... +#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL +# Linux +#OPENCLDIR=/etc/alternatives/opencl-intel-tools +#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders +# +# SYCL flags +# +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +# +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-std=c++17 -O3 +#SYCLFLAG+=-fsycl +#SYCLFLAG+=-L${SYCLDIR}/lib -lsycl -Wl,-rpath=${SYCLDIR}/lib +# +# Intel oneAPI +# +#SYCLCXX=dpcpp +#SYCLFLAG=-fsycl +#SYCLFLAG+=-std=c++17 -O3 +#SYCLFLAG+=--gcc-toolchain=/opt/rh/devtoolset-7/root/usr +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=1 +#SYCLFLAG+=-stdlib=c++ +# +# CodePlay ComputeCpp +# +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 +# This makes a huge difference in e.g. nstream... +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 +# +# triSYCL +# +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL +# +# hipSYCL +# +SYCLDIR=/opt/hipSYCL +SYCLCXX=${SYCLDIR}/bin/syclcc-clang +SYCLFLAG=-std=c++17 -O3 +SYCLFLAG+=-DHIPSYCL +# CPU platform +SYCLFLAG+=--hipsycl-platform=cpu +SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib +# +CELERITYDIR=${SYCLDIR} +CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor +CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime +MPIINC=-I/usr/include/mpich-3.2-x86_64 +MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# Cilk +# +#CILKFLAG=-fcilkplus +# +# TBB +# +#TBBDIR=/usr/lib/x86_64-linux-gnu +TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 +TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +#TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb +#TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb +# +# Parallel STL, Boost, etc. 
+# +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +#BOOSTFLAG=-I/usr/include/boost169 +BOOSTFLAG=-I/opt/homebrew/Cellar/boost/1.75.0_1/include # M1 Big Sur +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I../deps/range-v3/include +#RANGEFLAG=-DUSE_GCC_RANGES +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} +#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages +KOKKOSDIR=/opt/kokkos/gcc +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} +RAJADIR=/opt/raja/gcc +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +EXECUTORSDIR=./libunifex +EXECUTORSFLAG=-I${EXECUTORSDIR}/include -I${EXECUTORSDIR}/build/include +# HPX is more complicated... +HWLOCFLAG=-I/usr/local/include +HPXDIR=./hpx +HPXCXX=${HPXDIR}/bin/hpxcxx +HPXFLAG=-Wno-unused-local-typedef ${HWLOCFLAG} +# UPC++ +UPCXXDIR=./upcxx +UPCXX=${UPCXXDIR}/bin/upcxx +UPCXXFLAG=-codemode={O3,debug} +UPCXXFLAG+=-std=c++17 +UPCXXFLAG+=-mtune=native -ffast-math +# +# CBLAS for C++ DGEMM +# +#BLASFLAG=-DACCELERATE -framework Accelerate +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +BLASPATH=/opt/arm/22.0.1/armpl-22.0.1_AArch64_Ubuntu-20.04_gcc_aarch64-linux/lib +BLASFLAG=-L${BLASPATH} -Wl,-rpath=${BLASPATH} -larmpl_lp64_mp +BLASFLAG+=${OPENMPFLAG} +CBLASFLAG=${BLASFLAG} +CBLASFLAG+=-I/opt/arm/22.0.1/armpl-22.0.1_AArch64_Ubuntu-20.04_gcc_aarch64-linux/include_lp64 +CBLASFLAG+=-L/opt/arm/22.0.1/gcc-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib -lgfortran +CBLASFLAG+=-Wl,-rpath=/opt/arm/22.0.1/gcc-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib +#/opt/arm/22.0.1/gcc-22.0.1_Generic-AArch64_Ubuntu-20.04_aarch64-linux/lib/libarmflang.so +# +# CUDA flags +# +# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander +#NVCC=/opt/llvm/cocl/bin/cocl +# Linux w/ NVIDIA CUDA +NVCC=nvcc +CUDAFLAGS=-g -O3 -std=c++11 +CUDAFLAGS+=-arch=sm_50 +# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# Halide +# +HALIDECXX=${CXX} +HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 +# +# ISPC +# +ISPC=ispc +ISPCFLAG=-O3 --target=host --opt=fast-math +# +# MPI-3 +# +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.1_2 +MPICC=${MPIDIR}/bin/mpicc +MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort +MPIINC=-I${MPIDIR}/include +MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi +#MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi +#MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi +#MPIINC=-I/usr/include/mpich-3.2-x86_64 +#MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +# +# Global Arrays +# +GADIR=../deps +GAFLAG=-I${GADIR}/include +GAFLAG+=-L${GADIR}/lib -lga +GAFLAG+=-L${GADIR}/lib -larmci # ARMCI-MPI +#GAFLAG+=-L${GADIR}/lib -larmci -lcomex # ARMCI/ComEx +#GAFLAG+=${MPIINC} ${MPILIB} +#GAFLAG+=-lmpifort -lmpi +GAFLAG+=${BLASFLAG} +GAFLAG+=-fdefault-integer-8 # GA is compiled with 64b integers on 64-bit systems +# +# PETSc +# +PETSCDIR=../deps/petsc +PETSCFLAG=-I${PETSCDIR}/include +PETSCFLAG+=-L${PETSCDIR}/lib -lpetsc +PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib +# +# Fortran 2008 coarrays +# +# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details 
+# single-node +#COARRAYFLAG=-fcoarray=single -lcaf_single +# multi-node +COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +# +# MEMKIND (used in C1z) +# +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib From b40219cbb867aba3bf40baf00fcac697e4111d45 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 28 Apr 2022 12:15:14 +0300 Subject: [PATCH 214/325] transpose with loop --- FORTRAN/Makefile | 7 +- FORTRAN/transpose-openmp-target-loop.F90 | 242 +++++++++++++++++++++++ 2 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 FORTRAN/transpose-openmp-target-loop.F90 diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4a1315edb..95639a9ba 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -78,7 +78,8 @@ mpi-openmp: nstream-mpi-openmp ga: nstream-ga transpose-ga dgemm-ga -target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgemm-openmp-target +target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgemm-openmp-target \ + transpose-openmp-target-loop openacc: p2p-openacc p2p-innerloop-openacc stencil-openacc transpose-openacc nstream-openacc @@ -130,6 +131,9 @@ dgemm-blas: dgemm-blas.F90 prk.mod %-target: %-target.F90 prk.mod $(FC) $(FCFLAGS) $(OPENMPFLAG) $(OFFLOADFLAG) $< prk_mod.o -o $@ +%-target-loop: %-target-loop.F90 prk.mod + $(FC) $(FCFLAGS) $(OPENMPFLAG) $(OFFLOADFLAG) $< prk_mod.o -o $@ + %-openacc: %-openacc.F90 prk.mod $(FC) $(FCFLAGS) $(OPENACCFLAG) $< prk_mod.o -o $@ @@ -161,6 +165,7 @@ clean: -rm -f *-ga -rm -f *-openmp -rm -f *-target + -rm -f *-target-loop -rm -f *-openacc -rm -f *-stdpar -rm -f *-cufortran diff --git a/FORTRAN/transpose-openmp-target-loop.F90 b/FORTRAN/transpose-openmp-target-loop.F90 new file mode 100644 index 000000000..f10d45506 --- /dev/null +++ b/FORTRAN/transpose-openmp-target-loop.F90 @@ -0,0 +1,242 @@ +! +! Copyright (c) 2015, Intel Corporation +! Copyright (c) 2021, NVIDIA +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. 
+ +!******************************************************************* +! +! NAME: transpose +! +! PURPOSE: This program measures the time for the transpose of a +! column-major stored matrix into a row-major stored matrix. +! +! USAGE: Program input is the matrix order and the number of times to +! repeat the operation: +! +! transpose <# iterations> [tile size] +! +! An optional parameter specifies the tile size used to divide the +! individual matrix blocks for improved cache and TLB performance. +! +! The output consists of diagnostics to make sure the +! transpose worked and timing statistics. +! +! HISTORY: Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2015 +! ******************************************************************* + +program main + use, intrinsic :: iso_fortran_env + use omp_lib + implicit none + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations ! number of times to do the transpose + integer(kind=INT32) :: order ! order of a the matrix + real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix + real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix + real(kind=REAL64) :: T(0:32,0:32) ! Tile + integer(kind=INT64) :: bytes ! combined size of matrices + ! runtime variables + integer(kind=INT32) :: i, j, k + integer(kind=INT32) :: it, jt, tile_size + real(kind=REAL64) :: abserr, addit, temp ! squared error + real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters + real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a47)') 'Fortran OpenMP TARGET Matrix transpose: B = A^T' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a62)') 'Usage: ./transpose <# iterations> []' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + if (iterations .lt. 1) then + write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + order = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') order + if (order .lt. 1) then + write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order + stop 1 + endif + + ! same default as the C implementation + tile_size = 32 + if (command_argument_count().gt.2) then + call get_command_argument(3,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') tile_size + endif + if ((tile_size.gt.order).or.(tile_size.lt.1)) then + tile_size = order + endif + + if (tile_size.lt.order) then + if (mod(order,tile_size).ne.0) then + write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' + stop 1 + endif + if (tile_size.gt.32) then + write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' + stop 1 + endif + endif + + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8)') 'Matrix order = ', order + write(*,'(a,i8)') 'Tile size = ', tile_size + + ! ******************************************************************** + ! ** Allocate space for the input and transpose matrix + ! 
******************************************************************** + + allocate( A(order,order), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of A returned ',err + stop 1 + endif + + allocate( B(order,order), stat=err ) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of B returned ',err + stop 1 + endif + + t0 = 0 + + !$omp target data map(to: A) map(tofrom: B) + + !$omp target teams loop collapse(2) + do j=1,order + do i=1,order + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0 + enddo + enddo + !$omp end target teams loop + + do k=0,iterations + + if (k.eq.1) t0 = omp_get_wtime() + + if (tile_size.lt.order) then + !$omp target teams loop collapse(2) private(T,it,jt,i,j) + do jt=1,order,tile_size + do it=1,order,tile_size + !$omp loop collapse(2) + do j=0,tile_size-1 + do i=0,tile_size-1 + T(i,j) = A(it+i,jt+j) + A(it+i,jt+j) = A(it+i,jt+j) + 1 + enddo + enddo + !$omp end loop + !$omp loop collapse(2) + do i=0,tile_size-1 + do j=0,tile_size-1 + B(jt+j,it+i) = B(jt+j,it+i) + T(i,j) + enddo + enddo + !$omp end loop + enddo + enddo + !$omp end target teams loop + else + !$omp target teams loop collapse(2) private(it,jt,i,j) + do j=1,order + do i=1,order + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1 + enddo + enddo + !$omp end target teams loop + endif + + enddo ! iterations + + t1 = omp_get_wtime() + trans_time = t1 - t0 + + !$omp end target data + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + abserr = 0.0 + ! this will overflow if iterations>>1000 + addit = (0.5*iterations) * (iterations+1) + !$omp parallel do collapse(2) & + !$omp& default(none) & + !$omp& shared(B) & + !$omp& firstprivate(order,iterations,addit) & + !$omp& private(i,j,temp) & + !$omp& reduction(+:abserr) + do j=1,order + do i=1,order + temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & + * real(iterations+1,REAL64) + abserr = abserr + abs(B(i,j) - (temp+addit)) + enddo + enddo + !$omp end parallel do + + deallocate( B ) + deallocate( A ) + + if (abserr .lt. 
epsilon) then + write(*,'(a)') 'Solution validates' + avgtime = trans_time/iterations + bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A)/8 + write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & + ' Avg time (s): ', avgtime + else + write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & + 'exceeds threshold ',epsilon + stop 1 + endif + +end program main + From 9f14334c32275b1270784000030d32fbeb7aa96d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 28 Apr 2022 12:15:28 +0300 Subject: [PATCH 215/325] use tile clause instead --- FORTRAN/transpose-openacc.F90 | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 02ab0ab9d..fcb0a3342 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -137,16 +137,11 @@ program main t0 = 0 if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo - enddo + !$acc parallel loop tile(tile_size,tile_size) + do j=1,order + do i=1,order + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0.0 enddo enddo else @@ -166,16 +161,11 @@ program main ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo - enddo + !$acc parallel loop tile(tile_size,tile_size) + do j=1,order + do i=1,order + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1.0 enddo enddo else From a9bf6e2b2cbb8947a9f7b8d55f39db88e730ba26 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 28 Apr 2022 12:15:47 +0300 Subject: [PATCH 216/325] more explicit private --- FORTRAN/transpose-openmp-target.F90 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 4aa431b18..0f5a6cfec 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -162,10 +162,10 @@ program main if (k.eq.1) t0 = omp_get_wtime() if (tile_size.lt.order) then - !$omp target teams distribute collapse(2) private(T) + !$omp target teams distribute collapse(2) private(T,it,jt,i,j) do jt=1,order,tile_size do it=1,order,tile_size - !$omp parallel do simd collapse(2) schedule(static,4) + !$omp parallel do simd collapse(2) schedule(static) do j=0,tile_size-1 do i=0,tile_size-1 T(i,j) = A(it+i,jt+j) @@ -173,7 +173,7 @@ program main enddo enddo !$omp end parallel do simd - !$omp parallel do simd collapse(2) schedule(static,4) + !$omp parallel do simd collapse(2) schedule(static) do i=0,tile_size-1 do j=0,tile_size-1 B(jt+j,it+i) = B(jt+j,it+i) + T(i,j) @@ -184,7 +184,7 @@ program main enddo !$omp end target teams distribute else - !$omp target teams distribute parallel do simd collapse(2) GPU_SCHEDULE + !$omp target teams distribute parallel do simd collapse(2) private(it,jt,i,j) GPU_SCHEDULE do j=1,order do i=1,order B(j,i) = B(j,i) + A(i,j) @@ -208,7 +208,7 @@ program main abserr = 0.0 ! 
this will overflow if iterations>>1000 addit = (0.5*iterations) * (iterations+1) - !$omp parallel do collapse(2) & + !$omp parallel do collapse(2) & !$omp& default(none) & !$omp& shared(B) & !$omp& firstprivate(order,iterations,addit) & From ff9fa2bcd77599e56daa29823372773c2901eddd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 28 Apr 2022 18:21:13 +0300 Subject: [PATCH 217/325] do not create dopevector per thread --- FORTRAN/nstream-cufortran.cuf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.cuf index 51ea47858..a6ea2ea14 100644 --- a/FORTRAN/nstream-cufortran.cuf +++ b/FORTRAN/nstream-cufortran.cuf @@ -66,13 +66,13 @@ module nstream use iso_fortran_env contains - attributes(global) subroutine kernel(scalar, A, B, C) + attributes(global) subroutine kernel(n, scalar, A, B, C) implicit none - real(kind=REAL64), intent(inout) :: A(:) - real(kind=REAL64), intent(in) :: B(:), C(:) + integer(kind=INT64), intent(in), value :: n real(kind=REAL64), intent(in), value :: scalar - integer :: i, n - n = size(A) + real(kind=REAL64), intent(inout) :: A(n) + real(kind=REAL64), intent(in) :: B(n), C(n) + integer :: i i = blockDim%x * (blockIdx%x - 1) + threadIdx%x if (i <= n) then A(i) = A(i) + B(i) + scalar * C(i) @@ -146,7 +146,7 @@ program main t0 = prk_get_wtime() endif - call kernel<<>>(scalar, A, B, C) + call kernel<<>>(length, scalar, A, B, C) enddo ! iterations From 719977ce107152e4355629ca749d2914957ca570 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 11:56:26 +0300 Subject: [PATCH 218/325] implement Pablo's changes in code generator Signed-off-by: Jeff Hammond --- Cxx11/generate-sycl-stencil.py | 26 ++----- Cxx11/stencil_sycl.hpp | 121 ++++++++++++++++----------------- 2 files changed, 64 insertions(+), 83 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 22756399e..c67f2d124 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -33,20 +33,14 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') src.write(' sycl::id<2> dy'+str(r)+'(sycl::range<2> {0,'+str(r)+'});\n') src.write(' h.parallel_for>(') - src.write('sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') - src.write('sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') + src.write('sycl::range<2> {n-'+str(radius)+',n-'+str(radius)+'}, ') src.write('[=] (sycl::item<2> it) {\n') if (dim==2): - src.write(' sycl::id<2> xy = it.get_id();\n') + src.write(' sycl::id<2> xy = it.get_id() + sycl::id<2> {'+str(radius)+','+str(radius)+'};\n') src.write(' out[xy] += ') else: - # 1D indexing the slow way - #src.write(' auto i = it[0];\n') - #src.write(' auto j = it[1];\n') - #src.write(' out[i*n+j] += ') - # 1D indexing the fast way - src.write(' const auto i = it[0];\n') - src.write(' const auto j = it[1];\n') + src.write(' const auto i = it[0] + '+str(radius)+';\n') + src.write(' const auto j = it[1] + '+str(radius)+';\n') src.write(' out[i*n+j] += ') if pattern == 'star': for i in range(1,radius+1): @@ -62,18 +56,6 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('\n'+19*' ') src.write('+in[xy-dy'+str(i)+'] * static_cast('+str(-1./(2.*i*radius))+')') else: - # 1D indexing the slow way - #if i > 1: - # src.write('\n') - # src.write(22*' ') - #src.write('+in[i*n+(j+'+str(i)+')] * static_cast('+str(+1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - 
#src.write('+in[i*n+(j-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[(i+'+str(i)+')*n+j] * static_cast('+str(+1./(2.*i*radius))+')') - #src.write('\n'+22*' ') - #src.write('+in[(i-'+str(i)+')*n+j] * static_cast('+str(-1./(2.*i*radius))+')') - # 1D indexing the fast way if i > 1: src.write('\n') src.write(30*' ') diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 024e796c4..64af40b79 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,4 +1,3 @@ - // declare the kernel name used in SYCL parallel_for template class star1_1d; @@ -143,18 +142,18 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; - out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) - +in[i*n+(j-1)] * static_cast(-0.166666666667) - +in[(i+1)*n+j] * static_cast(0.166666666667) - +in[(i-1)*n+j] * static_cast(-0.166666666667) - +in[i*n+(j+2)] * static_cast(0.0833333333333) - +in[i*n+(j-2)] * static_cast(-0.0833333333333) - +in[(i+2)*n+j] * static_cast(0.0833333333333) - +in[(i-2)*n+j] * static_cast(-0.0833333333333) - +in[i*n+(j+3)] * static_cast(0.0555555555556) - +in[i*n+(j-3)] * static_cast(-0.0555555555556) - +in[(i+3)*n+j] * static_cast(0.0555555555556) - +in[(i-3)*n+j] * static_cast(-0.0555555555556); + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.16666666666666666) + +in[i*n+(j-1)] * static_cast(-0.16666666666666666) + +in[(i+1)*n+j] * static_cast(0.16666666666666666) + +in[(i-1)*n+j] * static_cast(-0.16666666666666666) + +in[i*n+(j+2)] * static_cast(0.08333333333333333) + +in[i*n+(j-2)] * static_cast(-0.08333333333333333) + +in[(i+2)*n+j] * static_cast(0.08333333333333333) + +in[(i-2)*n+j] * static_cast(-0.08333333333333333) + +in[i*n+(j+3)] * static_cast(0.05555555555555555) + +in[i*n+(j-3)] * static_cast(-0.05555555555555555) + +in[(i+3)*n+j] * static_cast(0.05555555555555555) + +in[(i-3)*n+j] * static_cast(-0.05555555555555555); }); }); } @@ -176,18 +175,18 @@ void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf sycl::id<2> dy3(sycl::range<2> {0,3}); h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id() + sycl::id<2> {3,3}; - out[xy] += +in[xy+dx1] * static_cast(0.166666666667) - +in[xy-dx1] * static_cast(-0.166666666667) - +in[xy+dy1] * static_cast(0.166666666667) - +in[xy-dy1] * static_cast(-0.166666666667) - +in[xy+dx2] * static_cast(0.0833333333333) - +in[xy-dx2] * static_cast(-0.0833333333333) - +in[xy+dy2] * static_cast(0.0833333333333) - +in[xy-dy2] * static_cast(-0.0833333333333) - +in[xy+dx3] * static_cast(0.0555555555556) - +in[xy-dx3] * static_cast(-0.0555555555556) - +in[xy+dy3] * static_cast(0.0555555555556) - +in[xy-dy3] * static_cast(-0.0555555555556); + out[xy] += +in[xy+dx1] * static_cast(0.16666666666666666) + +in[xy-dx1] * static_cast(-0.16666666666666666) + +in[xy+dy1] * static_cast(0.16666666666666666) + +in[xy-dy1] * static_cast(-0.16666666666666666) + +in[xy+dx2] * static_cast(0.08333333333333333) + +in[xy-dx2] * static_cast(-0.08333333333333333) + +in[xy+dy2] * static_cast(0.08333333333333333) + +in[xy-dy2] * static_cast(-0.08333333333333333) + +in[xy+dx3] * static_cast(0.05555555555555555) + +in[xy-dx3] * static_cast(-0.05555555555555555) + +in[xy+dy3] * static_cast(0.05555555555555555) + +in[xy-dy3] * static_cast(-0.05555555555555555); }); }); } @@ -202,18 +201,18 @@ void star3(sycl::queue & 
q, const size_t n, const T * in, T * out) h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; - out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) - +in[i*n+(j-1)] * static_cast(-0.166666666667) - +in[(i+1)*n+j] * static_cast(0.166666666667) - +in[(i-1)*n+j] * static_cast(-0.166666666667) - +in[i*n+(j+2)] * static_cast(0.0833333333333) - +in[i*n+(j-2)] * static_cast(-0.0833333333333) - +in[(i+2)*n+j] * static_cast(0.0833333333333) - +in[(i-2)*n+j] * static_cast(-0.0833333333333) - +in[i*n+(j+3)] * static_cast(0.0555555555556) - +in[i*n+(j-3)] * static_cast(-0.0555555555556) - +in[(i+3)*n+j] * static_cast(0.0555555555556) - +in[(i-3)*n+j] * static_cast(-0.0555555555556); + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.16666666666666666) + +in[i*n+(j-1)] * static_cast(-0.16666666666666666) + +in[(i+1)*n+j] * static_cast(0.16666666666666666) + +in[(i-1)*n+j] * static_cast(-0.16666666666666666) + +in[i*n+(j+2)] * static_cast(0.08333333333333333) + +in[i*n+(j-2)] * static_cast(-0.08333333333333333) + +in[(i+2)*n+j] * static_cast(0.08333333333333333) + +in[(i-2)*n+j] * static_cast(-0.08333333333333333) + +in[i*n+(j+3)] * static_cast(0.05555555555555555) + +in[i*n+(j-3)] * static_cast(-0.05555555555555555) + +in[(i+3)*n+j] * static_cast(0.05555555555555555) + +in[(i-3)*n+j] * static_cast(-0.05555555555555555); }); }); } @@ -238,10 +237,10 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer +in[i*n+(j-2)] * static_cast(-0.0625) +in[(i+2)*n+j] * static_cast(0.0625) +in[(i-2)*n+j] * static_cast(-0.0625) - +in[i*n+(j+3)] * static_cast(0.0416666666667) - +in[i*n+(j-3)] * static_cast(-0.0416666666667) - +in[(i+3)*n+j] * static_cast(0.0416666666667) - +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+3)] * static_cast(0.041666666666666664) + +in[i*n+(j-3)] * static_cast(-0.041666666666666664) + +in[(i+3)*n+j] * static_cast(0.041666666666666664) + +in[(i-3)*n+j] * static_cast(-0.041666666666666664) +in[i*n+(j+4)] * static_cast(0.03125) +in[i*n+(j-4)] * static_cast(-0.03125) +in[(i+4)*n+j] * static_cast(0.03125) @@ -277,10 +276,10 @@ void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf +in[xy-dx2] * static_cast(-0.0625) +in[xy+dy2] * static_cast(0.0625) +in[xy-dy2] * static_cast(-0.0625) - +in[xy+dx3] * static_cast(0.0416666666667) - +in[xy-dx3] * static_cast(-0.0416666666667) - +in[xy+dy3] * static_cast(0.0416666666667) - +in[xy-dy3] * static_cast(-0.0416666666667) + +in[xy+dx3] * static_cast(0.041666666666666664) + +in[xy-dx3] * static_cast(-0.041666666666666664) + +in[xy+dy3] * static_cast(0.041666666666666664) + +in[xy-dy3] * static_cast(-0.041666666666666664) +in[xy+dx4] * static_cast(0.03125) +in[xy-dx4] * static_cast(-0.03125) +in[xy+dy4] * static_cast(0.03125) @@ -307,10 +306,10 @@ void star4(sycl::queue & q, const size_t n, const T * in, T * out) +in[i*n+(j-2)] * static_cast(-0.0625) +in[(i+2)*n+j] * static_cast(0.0625) +in[(i-2)*n+j] * static_cast(-0.0625) - +in[i*n+(j+3)] * static_cast(0.0416666666667) - +in[i*n+(j-3)] * static_cast(-0.0416666666667) - +in[(i+3)*n+j] * static_cast(0.0416666666667) - +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+3)] * static_cast(0.041666666666666664) + +in[i*n+(j-3)] * static_cast(-0.041666666666666664) + +in[(i+3)*n+j] * static_cast(0.041666666666666664) + +in[(i-3)*n+j] * static_cast(-0.041666666666666664) +in[i*n+(j+4)] * static_cast(0.03125) +in[i*n+(j-4)] * static_cast(-0.03125) +in[(i+4)*n+j] * 
static_cast(0.03125) @@ -339,10 +338,10 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer +in[i*n+(j-2)] * static_cast(-0.05) +in[(i+2)*n+j] * static_cast(0.05) +in[(i-2)*n+j] * static_cast(-0.05) - +in[i*n+(j+3)] * static_cast(0.0333333333333) - +in[i*n+(j-3)] * static_cast(-0.0333333333333) - +in[(i+3)*n+j] * static_cast(0.0333333333333) - +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+3)] * static_cast(0.03333333333333333) + +in[i*n+(j-3)] * static_cast(-0.03333333333333333) + +in[(i+3)*n+j] * static_cast(0.03333333333333333) + +in[(i-3)*n+j] * static_cast(-0.03333333333333333) +in[i*n+(j+4)] * static_cast(0.025) +in[i*n+(j-4)] * static_cast(-0.025) +in[(i+4)*n+j] * static_cast(0.025) @@ -384,10 +383,10 @@ void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buf +in[xy-dx2] * static_cast(-0.05) +in[xy+dy2] * static_cast(0.05) +in[xy-dy2] * static_cast(-0.05) - +in[xy+dx3] * static_cast(0.0333333333333) - +in[xy-dx3] * static_cast(-0.0333333333333) - +in[xy+dy3] * static_cast(0.0333333333333) - +in[xy-dy3] * static_cast(-0.0333333333333) + +in[xy+dx3] * static_cast(0.03333333333333333) + +in[xy-dx3] * static_cast(-0.03333333333333333) + +in[xy+dy3] * static_cast(0.03333333333333333) + +in[xy-dy3] * static_cast(-0.03333333333333333) +in[xy+dx4] * static_cast(0.025) +in[xy-dx4] * static_cast(-0.025) +in[xy+dy4] * static_cast(0.025) @@ -418,10 +417,10 @@ void star5(sycl::queue & q, const size_t n, const T * in, T * out) +in[i*n+(j-2)] * static_cast(-0.05) +in[(i+2)*n+j] * static_cast(0.05) +in[(i-2)*n+j] * static_cast(-0.05) - +in[i*n+(j+3)] * static_cast(0.0333333333333) - +in[i*n+(j-3)] * static_cast(-0.0333333333333) - +in[(i+3)*n+j] * static_cast(0.0333333333333) - +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+3)] * static_cast(0.03333333333333333) + +in[i*n+(j-3)] * static_cast(-0.03333333333333333) + +in[(i+3)*n+j] * static_cast(0.03333333333333333) + +in[(i-3)*n+j] * static_cast(-0.03333333333333333) +in[i*n+(j+4)] * static_cast(0.025) +in[i*n+(j-4)] * static_cast(-0.025) +in[(i+4)*n+j] * static_cast(0.025) From 527e58c517e52c16f8d8b9e41016878fa5c6b9bf Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:02:04 +0300 Subject: [PATCH 219/325] remove unnecessary deprecated offset Signed-off-by: Jeff Hammond --- Cxx11/stencil-2d-sycl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index ee42e2da0..541273634 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -144,7 +144,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.submit([&](sycl::handler& h) { auto in = d_in.template get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for>(sycl::range<2> {n, n}, sycl::id<2> {0, 0}, [=] (sycl::item<2> it) { + h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); in[xy] += static_cast(1); }); From 643b7965d2f4e47453cf5eaf4ff19db33a50313e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:12:31 +0300 Subject: [PATCH 220/325] add a workaround for FP64 problems with DPC++ on TGL --- common/make.defs.oneapi | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index ec6421d24..246af6200 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -19,7 +19,7 @@ CXX=icpx -std=c++20 -pthread 
#--gcc-toolchain=/opt/gcc/11.2.0 # Compiler flags # # -xHOST is appropriate for most cases. -DEFAULT_OPT_FLAGS=-g -O3 -xHOST +DEFAULT_OPT_FLAGS=-g3 -O3 -xHOST # # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512 @@ -32,6 +32,7 @@ OPENMPFLAG=-qopenmp OPENMPSIMDFLAG=-qopenmp-simd OFFLOADFLAG=-fopenmp-targets=spir64 OFFLOADFLAG+=-DGPU_SCHEDULE="" +STDPARFLAG=-parallel -qmkl # # OpenCL flags # @@ -59,8 +60,12 @@ OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL # SYCLCXX=dpcpp SYCLFLAG=-fsycl -SYCLFLAG+=-std=c++17 -O3 +SYCLFLAG+=-std=c++17 -O3 -g3 SYCLFLAG+=-DDPCPP +# this is because the DPC++ compiler will fail to compile run on Tiger Lake +# even though the code explicitly checks for FP64 support and only instantiates the +# template when the device query says FP64 is supported. +SYCLFLAG+=-DDPCPP_NO_DOUBLE # # # OCCA From 2d2898f1a5dd1b9c711fc8b6b65324ae3fae57c6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:15:37 +0300 Subject: [PATCH 221/325] add a workaround for FP64 problems with DPC++ on TGL --- Cxx11/transpose-2d-sycl.cc | 11 ++++++++++- Cxx11/transpose-sycl-usm.cc | 8 +++++++- Cxx11/transpose-sycl.cc | 8 +++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/Cxx11/transpose-2d-sycl.cc b/Cxx11/transpose-2d-sycl.cc index 83092891e..2fbe8938b 100644 --- a/Cxx11/transpose-2d-sycl.cc +++ b/Cxx11/transpose-2d-sycl.cc @@ -217,7 +217,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -234,7 +236,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -250,13 +254,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE + bool has_fp64 = prk::SYCL::has_fp64(q); + if (has_fp64) { + if (prk::SYCL::print_gen12lp_helper(q)) return 1; + } if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc index c1d9a4fec..1ec5c1470 100644 --- a/Cxx11/transpose-sycl-usm.cc +++ b/Cxx11/transpose-sycl-usm.cc @@ -200,7 +200,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -217,7 +219,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -233,16 +237,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, order, block_size); if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index d3bcc0215..da0d596c0 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -216,7 +216,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -233,7 +235,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, order, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -249,16 +253,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, order, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, order, block_size); if (has_fp64) { run(q, iterations, order, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From 7f5b6fa0d40665a297b0e66c8dc94b3a9948a321 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 7 May 2022 12:18:16 +0300 Subject: [PATCH 222/325] add a workaround for FP64 problems with DPC++ on TGL --- Cxx11/nstream-sycl-explicit-usm.cc | 8 +++++++- Cxx11/nstream-sycl-explicit.cc | 8 +++++++- Cxx11/nstream-sycl-usm.cc | 8 +++++++- Cxx11/nstream-sycl.cc | 8 +++++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Cxx11/nstream-sycl-explicit-usm.cc b/Cxx11/nstream-sycl-explicit-usm.cc index 22325b565..aa5c5c690 100644 --- a/Cxx11/nstream-sycl-explicit-usm.cc +++ b/Cxx11/nstream-sycl-explicit-usm.cc @@ -278,7 +278,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -295,7 +297,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -311,16 +315,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index a3083a244..adf045d32 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -271,7 +271,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -288,7 +290,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -304,16 +308,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index f119746ff..e872a5130 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -256,7 +256,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -273,7 +275,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -289,16 +293,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index a95a163aa..140125f9d 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -253,7 +253,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -270,7 +272,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, length, block_size); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -286,16 +290,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, length, block_size); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, length, block_size); if (has_fp64) { run(q, iterations, length, block_size); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From c9a68ab4d39c718f83383fb52a3d894fe444cd89 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 9 May 2022 10:18:22 -0500 Subject: [PATCH 223/325] Update script for code generation --- Cxx11/generate-sycl-stencil.py | 9 ++++----- Cxx11/stencil_sycl.hpp | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 22756399e..0361d6666 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -33,11 +33,10 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') src.write(' sycl::id<2> dy'+str(r)+'(sycl::range<2> {0,'+str(r)+'});\n') src.write(' h.parallel_for>(') - src.write('sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') - src.write('sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') + src.write('sycl::range<2> {n-'+str(radius)+',n-'+str(radius)+'}, ') src.write('[=] (sycl::item<2> it) {\n') if (dim==2): - src.write(' sycl::id<2> xy = it.get_id();\n') + src.write(' sycl::id<2> xy = it.get_id() + sycl::id<2> {'+str(radius)+','+str(radius)+'};\n') src.write(' out[xy] += ') else: # 1D indexing the slow way @@ -45,8 +44,8 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): #src.write(' auto j = it[1];\n') #src.write(' out[i*n+j] += ') # 1D indexing the fast way - src.write(' const auto i = it[0];\n') - src.write(' const auto j = it[1];\n') + src.write(' const auto i = it[0] + '+str(radius)+';\n') + src.write(' const auto j = it[1] + '+str(radius)+';\n') src.write(' out[i*n+j] += ') if pattern == 'star': for i in range(1,radius+1): diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 024e796c4..89aa0368c 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,4 +1,3 @@ - // declare the kernel name used in SYCL parallel_for template class star1_1d; From b7b5ed6417dfd315d91509861bef3ec0a71f492f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:04:12 +0300 Subject: [PATCH 224/325] add cuf target link --- FORTRAN/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 95639a9ba..96daca662 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -86,6 +86,7 @@ openacc: p2p-openacc p2p-innerloop-openacc stencil-openacc transpose-openacc nst stdpar: nstream-stdpar stencil-stdpar transpose-stdpar dgemm-stdpar +cuda: cufortran cuf: cufortran cufortran: nstream-cufortran transpose-cufortran From 377aafa7132cd001aff33179a9f60a4ba0ee33bd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:04:50 +0300 Subject: [PATCH 225/325] add cache --- FORTRAN/transpose-openacc.F90 | 1 + 1 file changed, 1 insertion(+) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index fcb0a3342..a94c00230 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -140,6 +140,7 @@ program main !$acc parallel loop tile(tile_size,tile_size) do j=1,order do i=1,order + !$acc cache(A,B) A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo From 86b6acd2a6c9760d9b6d4bb39e41578561d5f0b3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:09:29 +0300 Subject: [PATCH 226/325] no double stuff --- Cxx11/stencil-2d-sycl.cc | 11 ++++++++++- Cxx11/stencil-sycl-usm.cc | 8 +++++++- Cxx11/stencil-sycl.cc | 8 +++++++- 3 files changed, 24 
insertions(+), 3 deletions(-) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index 541273634..b6eeb09bc 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -281,7 +281,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -298,7 +300,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -314,13 +318,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE + bool has_fp64 = prk::SYCL::has_fp64(q); + if (has_fp64) { + if (prk::SYCL::print_gen12lp_helper(q)) return 1; + } if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc index 8b7adfac5..b219b24f1 100644 --- a/Cxx11/stencil-sycl-usm.cc +++ b/Cxx11/stencil-sycl-usm.cc @@ -270,7 +270,9 @@ int main(int argc, char * argv[]) sycl::queue q(sycl::host_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -287,7 +289,9 @@ int main(int argc, char * argv[]) sycl::queue q(sycl::cpu_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -303,16 +307,18 @@ int main(int argc, char * argv[]) try { sycl::queue q(sycl::gpu_selector{}, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); + run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, n, block_size, star, radius); if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index b78706df2..8947c8dee 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -279,7 +279,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::host_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -296,7 +298,9 @@ int main(int argc, char * argv[]) sycl::queue q{sycl::cpu_selector{}}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE run(q, iterations, n, block_size, star, radius); +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; @@ -312,16 +316,18 @@ int main(int argc, char * argv[]) try { sycl::queue q{sycl::gpu_selector{}}; prk::SYCL::print_device_platform(q); + run(q, iterations, n, block_size, star, radius); +#ifndef DPCPP_NO_DOUBLE bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { if (prk::SYCL::print_gen12lp_helper(q)) return 1; } - run(q, iterations, n, block_size, star, radius); if (has_fp64) { run(q, iterations, n, block_size, star, radius); } else { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } +#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; From bfc6bb94d033852e6b303a253e334fa733a6aee4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 10:50:08 +0300 Subject: [PATCH 227/325] nstream C OpenACC --- C1z/nstream-openacc.c | 173 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 C1z/nstream-openacc.c diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c new file mode 100644 index 000000000..051342f45 --- /dev/null +++ b/C1z/nstream-openacc.c @@ -0,0 +1,173 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors and +/// the length of the vectors. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019. 
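The NOTES above fix the bandwidth accounting for this kernel: the triad A += B + scalar*C reads B and C and reads-and-writes A, so each element costs four double-precision accesses per iteration, i.e. 4*N*sizeof(double) bytes. A minimal stand-alone sketch of that arithmetic follows; the vector length, iteration count, and timing value are made up purely for illustration and are not taken from the patch.

    #include <stdio.h>

    int main(void)
    {
        size_t length     = 100000000;   /* example vector length (illustrative)  */
        int    iterations = 10;          /* timed iterations (illustrative)       */
        double total_time = 1.25;        /* hypothetical measured seconds         */

        double avgtime = total_time / iterations;
        /* 4 accesses per element: read B, read C, read A, write A */
        double nbytes  = 4.0 * length * sizeof(double);
        printf("Rate (MB/s): %lf Avg time (s): %lf\n",
               1.e-6 * nbytes / avgtime, avgtime);
        return 0;
    }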
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/OpenACC STREAM triad: A = B + scalar * C\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the vector + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Vector length must be greater than 0\n"); + return 1; + } + + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + double * restrict A = acc_malloc(bytes); + double * restrict B = acc_malloc(bytes); + double * restrict C = acc_malloc(bytes); + + double scalar = 3.0; + + { + #pragma acc parallel loop deviceptr(A,B,C) + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + return 0; +} + + From e15f271a9153a24ae04d6911bb11dc9148d51929 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:07:08 +0300 Subject: [PATCH 228/325] transpose OpenACC --- C1z/transpose-openacc.c | 167 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 C1z/transpose-openacc.c diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c new file mode 100644 index 000000000..679afb1d8 --- /dev/null +++ b/C1z/transpose-openacc.c @@ -0,0 +1,167 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// C11-ification by Jeff Hammond, June 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/OpenACC Matrix transpose: B = A^T\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> [tile size]\n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // order of a the matrix + int order = atoi(argv[2]); + if (order <= 0) { + printf("ERROR: Matrix Order must be greater than 0\n"); + return 1; + } + + // default tile size for tiling of local transpose + int tile_size = (argc>3) ? 
atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + + printf("Number of iterations = %d\n", iterations); + printf("Matrix order = %d\n", order); + printf("Tile size = %d\n", tile_size); + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time = 0.0; + + size_t bytes = order*order*sizeof(double); + double * restrict A = acc_malloc(bytes); + double * restrict B = acc_malloc(bytes); + + { + #pragma acc parallel loop deviceptr(A,B) + for (int i=0;i Date: Wed, 18 May 2022 11:07:13 +0300 Subject: [PATCH 229/325] transpose OpenACC --- C1z/Makefile | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index f8927c191..5e01f0894 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -22,34 +22,22 @@ endif ASMFLAGS = -fverbose-asm $(CFLAGS) OMPFLAGS = $(OPENMPFLAG) +ACCFLAGS = $(OPENACCFLAG) TARGETFLAGS = $(OFFLOADFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) -.PHONY: all clean serial thread openmp target taskloop ispc # cilk +.PHONY: all clean serial thread openmp tasks target taskloop ispc EXTRA= -ifeq ($(shell uname -s),Darwin) - ifneq ($(findstring icc,$(CC)),icc) - EXTRA += target - endif -else - ifneq ($(findstring icx,$(CC)),icx) - EXTRA += target - endif -endif ifdef ($(ISPC)) EXTRA += ispc endif ifneq ($(CILKFLAG),) EXTRA += cilk endif -ifeq ($(findstring xlc,$(CC)),xlc) - EXTRA = target - CFLAGS += -DXLC -endif -ifneq ($(findstring icx,$(CC)),icx) - EXTRA += tasks +ifneq ($(OPENACCFLAG),) + EXTRA += openacc endif all: serial thread openmp $(EXTRA) @@ -83,6 +71,8 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr taskloop: nstream-taskloop stencil-taskloop transpose-taskloop +openacc: nstream-openacc transpose-openacc + cilk: stencil-cilk transpose-cilk ispc: transpose-ispc @@ -132,6 +122,9 @@ p2p-2d: p2p-2d.c prk_util.h %-openmp: %-openmp.c prk_util.h prk_openmp.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ +%-openacc: %-openacc.c prk_util.h + $(CC) $(CFLAGS) $< $(ACCFLAGS) $(EXTRA_CLIBS) -o $@ + %-cilk: %-cilk.c prk_util.h $(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@ @@ -161,6 +154,7 @@ clean: -rm -f p2p-sse p2p-avx p2p-avx3 p2p-avx-tasks-openmp -rm -f *-2d -rm -f *-openmp + -rm -f *-openacc -rm -f *-mpi -rm -f *-petsc -rm -f *-target From dc4f0554be4ae24cb3f2d64977529fa8eac797f6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:07:44 +0300 Subject: [PATCH 230/325] ignore and build --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 7843446ba..cb7046a77 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ C1z/nstream-mmap C1z/nstream-mmap-openmp C1z/nstream-mpi C1z/nstream-openmp +C1z/nstream-openacc C1z/nstream-petsc C1z/nstream-target C1z/nstream-taskloop @@ -96,6 +97,7 @@ C1z/transpose-2d-openmp C1z/transpose-cilk C1z/transpose-ispc C1z/transpose-openmp +C1z/transpose-openacc C1z/transpose-petsc C1z/transpose-target C1z/transpose-taskloop From a7bb31096312af10dbbb09c9cb3e7b894afcfcc3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 11:16:35 +0300 Subject: [PATCH 231/325] OpenACC C stencil --- .gitignore | 1 + C1z/Makefile | 2 +- C1z/stencil-openacc.c | 230 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 232 insertions(+), 1 
deletion(-) create mode 100644 C1z/stencil-openacc.c diff --git a/.gitignore b/.gitignore index cb7046a77..df8aeaa8f 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ C1z/stencil-2d C1z/stencil-2d-openmp C1z/stencil-cilk C1z/stencil-openmp +C1z/stencil-openacc C1z/stencil-target C1z/stencil-taskloop C1z/transpose diff --git a/C1z/Makefile b/C1z/Makefile index 5e01f0894..c8c61ed10 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -71,7 +71,7 @@ target: nstream-target stencil-target transpose-target nstream-alloc-target nstr taskloop: nstream-taskloop stencil-taskloop transpose-taskloop -openacc: nstream-openacc transpose-openacc +openacc: nstream-openacc stencil-openacc transpose-openacc cilk: stencil-cilk transpose-cilk diff --git a/C1z/stencil-openacc.c b/C1z/stencil-openacc.c new file mode 100644 index 000000000..edc7e994b --- /dev/null +++ b/C1z/stencil-openacc.c @@ -0,0 +1,230 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - C99-ification by Jeff Hammond, February 2016. +/// - C11-ification by Jeff Hammond, June 2017. 
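The stencil kernels themselves are not written by hand in this file: they are generated into stencil_openacc.h (see the generator patch later in this series) and selected at runtime through the stencil_t function pointer. As a rough orientation, a radius-1 star kernel in the style the generator emits looks like the sketch below; the function name and the uniform 0.25 weights are placeholders of mine, not the coefficients the generator actually produces, but the loop bounds and the tile/deviceptr clauses mirror the generated code.

    /* Illustrative radius-1 star stencil in the style of the generated kernels.
     * The tile(32,32) and deviceptr(in,out) clauses mirror what the generator
     * emits; the 0.25 weights are placeholders, not the generated coefficients. */
    void star1_sketch(const int n, const double * restrict in, double * restrict out)
    {
        #pragma acc parallel loop tile(32,32) deviceptr(in,out)
        for (int i = 1; i < n-1; i++) {
            for (int j = 1; j < n-1; j++) {
                out[i*n+j] += in[(i-1)*n+j] * 0.25
                            + in[(i+1)*n+j] * 0.25
                            + in[i*n+(j-1)] * 0.25
                            + in[i*n+(j+1)] * 0.25;
            }
        }
    }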
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +typedef void (*stencil_t)(const int, const double * restrict, double * restrict); + +void nothing(const int n, const double * restrict in, double * restrict out) +{ + printf("You are trying to use a stencil that does not exist.\n"); + printf("Please generate the new stencil using the code generator.\n"); + // n will never be zero - this is to silence compiler warnings. + if (n==0) printf("%p %p\n", in, out); + abort(); +} + +#include "stencil_openacc.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION); + printf("C11/OpenACC Stencil execution on 2D grid\n"); + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3){ + printf("Usage: <# iterations> [ ]\n"); + return 1; + } + + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + int n = atoi(argv[2]); + if (n < 1) { + printf("ERROR: grid dimension must be positive\n"); + return 1; + } else if (n > floor(sqrt(INT_MAX))) { + printf("ERROR: grid dimension too large - overflow risk\n"); + return 1; + } + + // stencil pattern + bool star = true; + if (argc > 3) { + char* pattern = argv[3]; + star = (0==strncmp(pattern,"star",4)) ? true : false; + } + + // stencil radius + int radius = 2; + if (argc > 4) { + radius = atoi(argv[4]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + printf("ERROR: Stencil radius negative or too large\n"); + return 1; + } + + printf("Number of iterations = %d\n", iterations); + printf("Grid sizes = %d\n", n); + printf("Type of stencil = %s\n", (star ? "star" : "grid") ); + printf("Radius of stencil = %d\n", radius ); + + stencil_t stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + case 6: stencil = star6; break; + case 7: stencil = star7; break; + case 8: stencil = star8; break; + case 9: stencil = star9; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + case 6: stencil = grid6; break; + case 7: stencil = grid7; break; + case 8: stencil = grid8; break; + case 9: stencil = grid9; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time = 0.0; + + // interior of grid with respect to stencil + size_t active_points = (n-2*radius)*(n-2*radius); + size_t bytes = n*n*sizeof(double); + + double * restrict in = acc_malloc(bytes); + double * restrict out = acc_malloc(bytes); + + { + #pragma acc parallel loop collapse(2) deviceptr(in,out) + for (int i=0; i epsilon) { + printf("ERROR: L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); + return 1; + } else { + printf("Solution validates\n"); +#ifdef VERBOSE + printf("L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); +#endif + const int stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2*stencil_size+1) * active_points; + double avgtime = stencil_time/iterations; + printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0e-6 * (double)flops/avgtime, avgtime ); + } + + return 0; +} From c8fe3a913230cfd8a55739d0fef0cf08b39ffc7c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:00 +0300 Subject: [PATCH 232/325] stencil for OpenACC --- C1z/generate-c-stencil.py | 6 +- C1z/stencil_openacc.h | 3126 +++++++++++++++++++++++++++++++++++++ 2 files changed, 3130 insertions(+), 2 deletions(-) create mode 100644 C1z/stencil_openacc.h diff --git a/C1z/generate-c-stencil.py b/C1z/generate-c-stencil.py index f6dc86032..20a2c9455 100755 --- a/C1z/generate-c-stencil.py +++ b/C1z/generate-c-stencil.py @@ -21,9 +21,11 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim): if (model=='openmp'): outer += 'OMP_FOR()\n ' elif (model=='target'): - outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) )\n ' + outer += 'OMP_TARGET( teams distribute parallel for simd collapse(2) )\n ' elif (model=='taskloop'): outer += 'OMP_TASKLOOP( firstprivate(n) shared(in,out) grainsize(gs) )\n ' + elif (model=='openacc'): + outer += 'PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) )\n ' elif (model=='cilk'): outer += '_Cilk_' @@ -82,7 +84,7 @@ def instance(src,model,pattern,r,dim): codegen(src,pattern,stencil_size,r,W,model,dim) def main(): - for model in ['seq','openmp','target','cilk','taskloop']: + for model in ['seq','openmp','target','cilk','taskloop','openacc']: src = open('stencil_'+model+'.h','w') for pattern in ['star','grid']: for r in range(1,10): diff --git a/C1z/stencil_openacc.h b/C1z/stencil_openacc.h new file mode 100644 index 000000000..09652a00b --- /dev/null +++ b/C1z/stencil_openacc.h @@ -0,0 +1,3126 @@ +void star1(const int n, const double * restrict in, double * restrict out) { + PRAGMA( acc parallel loop tile(32,32) deviceptr(in,out) ) + for (int i=1; i Date: Wed, 18 May 2022 12:03:20 +0300 Subject: [PATCH 233/325] remove schedule --- C1z/stencil_target.h | 72 ++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/C1z/stencil_target.h b/C1z/stencil_target.h index b50d70636..28d1a5fcf 100644 --- a/C1z/stencil_target.h +++ b/C1z/stencil_target.h @@ -1,5 +1,5 @@ void star1(const int n, const double * restrict in, double * restrict out) { - OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) ) + OMP_TARGET( teams distribute parallel for simd collapse(2) ) for (int i=1; i Date: Wed, 18 May 2022 12:03:28 +0300 Subject: [PATCH 234/325] OpenACC --- C1z/Makefile | 11 ++++++----- C1z/transpose-openacc.c | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index c8c61ed10..f719a9096 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -3,13 +3,14 @@ include ../common/PRKVERSION CPPFLAGS = -DPRKVERSION=$(PRKVERSION) -CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) - # debugging ifdef VERBOSE - CFLAGS += -DVERBOSE + CPPFLAGS += -DVERBOSE endif +CFLAGS = $(DEFAULT_OPT_FLAGS) $(CPPFLAGS) + + ifdef PRK_USE_MMAP CFLAGS += -DPRK_USE_MMAP endif @@ -22,10 +23,10 @@ endif ASMFLAGS = -fverbose-asm $(CFLAGS) OMPFLAGS = $(OPENMPFLAG) -ACCFLAGS = $(OPENACCFLAG) TARGETFLAGS = $(OFFLOADFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) +OPENACCFLAGS = $(OPENACCFLAG) .PHONY: all clean serial thread openmp tasks target taskloop ispc @@ -123,7 +124,7 @@ p2p-2d: p2p-2d.c prk_util.h 
$(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ %-openacc: %-openacc.c prk_util.h - $(CC) $(CFLAGS) $< $(ACCFLAGS) $(EXTRA_CLIBS) -o $@ + $(CC) $(CFLAGS) $< $(OPENACCFLAGS) $(EXTRA_CLIBS) -o $@ %-cilk: %-cilk.c prk_util.h $(CC) $(CFLAGS) $< $(CILKFLAGS) $(EXTRA_CLIBS) -o $@ diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c index 679afb1d8..0ffc76c8e 100644 --- a/C1z/transpose-openacc.c +++ b/C1z/transpose-openacc.c @@ -71,14 +71,12 @@ int main(int argc, char * argv[]) return 1; } - // number of times to do the transpose int iterations = atoi(argv[1]); if (iterations < 1) { printf("ERROR: iterations must be >= 1\n"); return 1; } - // order of a the matrix int order = atoi(argv[2]); if (order <= 0) { printf("ERROR: Matrix Order must be greater than 0\n"); From 6f8e9d1c8564abbf206d95a6d3da7be601ce8993 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:45 +0300 Subject: [PATCH 235/325] cleanup --- Cxx11/Makefile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a96805be9..365e92363 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -1,7 +1,7 @@ -include ../common/Cxx11.defs +include ../common/make.defs include ../common/PRKVERSION -CPPFLAGS = -DPRKVERSION=$(PRKVERSION) +CPPFLAGS = -DPRKVERSION=$(PRKVERSION) # debugging ifdef VERBOSE @@ -31,7 +31,7 @@ endif #ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm ASMFLAGS = -fverbose-asm -OMPFLAGS = $(OPENMPFLAG) -DUSE_OPENMP +OMPFLAGS = $(OPENMPFLAG) TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) -DCL_HPP_MINIMUM_OPENCL_VERSION=120 -DCL_HPP_TARGET_OPENCL_VERSION=120 -DCL_HPP_ENABLE_EXCEPTIONS # We do not yet handle all possible exceptions... @@ -62,18 +62,17 @@ OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib boost-compute thrust executor oneapi onemkl EXTRA= -ifeq ($(shell uname -s),Darwin) - ifneq ($(findstring icpc,$(CXX)),icpc) - EXTRA += target - endif -else - EXTRA += target +ifneq ($(findstring nvc++,$(CXX)),nvc++) + EXTRA += ranges stl pstl +endif +ifneq ($(OPENACCFLAG),) + EXTRA += openacc endif -ifneq ($(findstring pgc++,$(CXX)),pgc++) - EXTRA += pstl +ifneq ($(SYCLCC),) + EXTRA += sycl endif -all: sequential vector valarray openmp taskloop stl ranges opencl sycl $(EXTRA) +all: sequential vector valarray openmp taskloop opencl $(EXTRA) sequential: p2p stencil transpose nstream dgemm sparse @@ -137,7 +136,7 @@ oneapi: onemkl dpcpp sycl onedpl occa: transpose-occa nstream-occa -openacc: p2p-hyperplane-openacc +openacc: nstream-openacc stencil-openacc transpose-openacc p2p-hyperplane-openacc stdpar: nstream-stdpar transpose-stdpar #stencil-stdpar p2p-stdpar From 706bdaeb0e0c7deab21b56949a0078636eeb8026 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:03:55 +0300 Subject: [PATCH 236/325] remove USE_OPENMP --- Cxx11/prk_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index e5314fd81..93a037f78 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -81,7 +81,7 @@ #endif // omp_get_wtime() -#if defined(USE_OPENMP) && defined(_OPENMP) +#if defined(_OPENMP) #include #endif @@ -301,7 +301,7 @@ namespace prk { static inline double wtime(void) { -#if defined(USE_OPENMP) && defined(_OPENMP) +#if defined(_OPENMP) return omp_get_wtime(); #else using t = std::chrono::high_resolution_clock; From 59b06de28bd0fcf5bf699dbb058ef4f149d2ee38 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 
2022 12:04:10 +0300 Subject: [PATCH 237/325] remove unnecessary indirection --- common/Cxx11.defs | 1 - 1 file changed, 1 deletion(-) delete mode 100644 common/Cxx11.defs diff --git a/common/Cxx11.defs b/common/Cxx11.defs deleted file mode 100644 index d146ce6f7..000000000 --- a/common/Cxx11.defs +++ /dev/null @@ -1 +0,0 @@ -include ../common/make.defs From 9a20e2ec7e90fc82bdce894a261de41105eb935f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:27:53 +0300 Subject: [PATCH 238/325] OpenACC --- .gitignore | 5 + C1z/nstream-openacc.c | 1 - C1z/stencil-openacc.c | 5 +- Cxx11/generate-cxx-stencil.py | 13 +- Cxx11/nstream-openacc.cc | 177 +++++++++++++++ Cxx11/stencil-openacc.cc | 233 ++++++++++++++++++++ Cxx11/stencil_openacc.hpp | 397 ++++++++++++++++++++++++++++++++++ Cxx11/transpose-openacc.cc | 173 +++++++++++++++ 8 files changed, 998 insertions(+), 6 deletions(-) create mode 100644 Cxx11/nstream-openacc.cc create mode 100644 Cxx11/stencil-openacc.cc create mode 100644 Cxx11/stencil_openacc.hpp create mode 100644 Cxx11/transpose-openacc.cc diff --git a/.gitignore b/.gitignore index df8aeaa8f..2dab1847b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,7 @@ Cxx11/nstream-cublas Cxx11/nstream-cuda Cxx11/nstream-cuda-managed Cxx11/nstream-dpcpp +Cxx11/nstream-onedpl Cxx11/nstream-executors Cxx11/nstream-hip Cxx11/nstream-hipblas @@ -154,6 +155,7 @@ Cxx11/nstream-multigpu-dpcpp Cxx11/nstream-onemkl Cxx11/nstream-opencl Cxx11/nstream-openmp +Cxx11/nstream-openacc Cxx11/nstream-openmp-target Cxx11/nstream-pstl Cxx11/nstream-raja @@ -174,6 +176,7 @@ Cxx11/nstream-vector-raja Cxx11/p2p Cxx11/p2p-doacross-openmp Cxx11/p2p-hyperplane-openmp +Cxx11/p2p-hyperplane-openacc Cxx11/p2p-hyperplane-pstl Cxx11/p2p-hyperplane-stl Cxx11/p2p-hyperplane-sycl @@ -212,6 +215,7 @@ Cxx11/stencil-kokkos Cxx11/stencil-mpi Cxx11/stencil-opencl Cxx11/stencil-openmp +Cxx11/stencil-openacc Cxx11/stencil-openmp-target Cxx11/stencil-pstl Cxx11/stencil-raja @@ -243,6 +247,7 @@ Cxx11/transpose-kokkos Cxx11/transpose-mpi Cxx11/transpose-opencl Cxx11/transpose-openmp +Cxx11/transpose-openacc Cxx11/transpose-openmp-target Cxx11/transpose-pstl Cxx11/transpose-raja diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c index 051342f45..94985da56 100644 --- a/C1z/nstream-openacc.c +++ b/C1z/nstream-openacc.c @@ -53,7 +53,6 @@ /// by the execution time. For a vector length of N, the total /// number of words read and written is 4*N*sizeof(double). /// -/// /// HISTORY: This code is loosely based on the Stream benchmark by John /// McCalpin, but does not follow all the Stream rules. Hence, /// reported results should not be associated with Stream in diff --git a/C1z/stencil-openacc.c b/C1z/stencil-openacc.c index edc7e994b..6f79c40f3 100644 --- a/C1z/stencil-openacc.c +++ b/C1z/stencil-openacc.c @@ -160,10 +160,7 @@ int main(int argc, char * argv[]) double stencil_time = 0.0; - // interior of grid with respect to stencil - size_t active_points = (n-2*radius)*(n-2*radius); size_t bytes = n*n*sizeof(double); - double * restrict in = acc_malloc(bytes); double * restrict out = acc_malloc(bytes); @@ -196,6 +193,8 @@ int main(int argc, char * argv[]) // Analyze and output results. 
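    // (Editorial aside, written as a comment so it sits inside this C hunk; it is
    // not part of the patch.)  This hunk relocates the active_points computation
    // down next to the L1-norm validation that consumes it.  The device-side
    // reduction it feeds, sketched with the file's own names:
    //
    //   size_t active_points = (n-2*radius)*(n-2*radius);   // interior points
    //   double norm = 0.0;
    //   #pragma acc parallel loop reduction( +:norm ) deviceptr(out)
    //   for (int i=radius; i<n-radius; i++) {
    //     for (int j=radius; j<n-radius; j++) {
    //       norm += fabs(out[i*n+j]);                       // L1 contribution
    //     }
    //   }
    //   norm /= active_points;                              // per-point average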
////////////////////////////////////////////////////////////////////// + // interior of grid with respect to stencil + size_t active_points = (n-2*radius)*(n-2*radius); // compute L1 norm in parallel double norm = 0.0; #pragma acc parallel loop reduction( +:norm ) deviceptr(out) diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 00095484e..67cf61894 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -67,6 +67,15 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') src.write('}\n\n') + elif (model=='openacc'): + src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n') + src.write(' PRAGMA( acc parallel loop collapse(2) deviceptr(in,out) )\n') + src.write(' for (int i='+str(radius)+'; i <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenACC STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time{0}; + + size_t bytes = length*sizeof(double); + double * RESTRICT A = (double *) acc_malloc(bytes); + double * RESTRICT B = (double *) acc_malloc(bytes); + double * RESTRICT C = (double *) acc_malloc(bytes); + + double scalar = 3.0; + + { + #pragma acc parallel loop deviceptr(A,B,C) + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << std::setprecision(16) + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 
1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/stencil-openacc.cc b/Cxx11/stencil-openacc.cc new file mode 100644 index 000000000..18a1e212f --- /dev/null +++ b/Cxx11/stencil-openacc.cc @@ -0,0 +1,233 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2022, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. 
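Both the C and C++ OpenACC ports added in these patches share one memory-management pattern: buffers are allocated directly in device memory with acc_malloc, every compute and validation region names them in a deviceptr() clause, and results are reduced on the device rather than copied back to the host. A minimal self-contained sketch of that pattern (an illustrative program of mine, not code taken from the patch):

    #include <stdio.h>
    #include <openacc.h>

    int main(void)
    {
        size_t n = 1000;
        /* device-resident allocation: the pointer is only dereferenceable on the device */
        double * restrict x = acc_malloc(n * sizeof(double));

        #pragma acc parallel loop deviceptr(x)
        for (size_t i = 0; i < n; i++) x[i] = 1.0;

        /* validate on the device with a reduction instead of copying data back */
        double sum = 0.0;
        #pragma acc parallel loop reduction(+:sum) deviceptr(x)
        for (size_t i = 0; i < n; i++) sum += x[i];

        acc_free(x);
        printf("sum = %lf (expected %lf)\n", sum, (double) n);
        return 0;
    }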
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" +#include "stencil_openacc.hpp" + +void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) +{ + // use arguments to silence compiler warnings + out[0] = in[0] + n + t; +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenMP TARGET Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > prk::get_max_matrix_size()) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? 
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time{0}; + + size_t bytes = n*n*sizeof(double); + double * RESTRICT in = (double *)acc_malloc(bytes); + double * RESTRICT out = (double *)acc_malloc(bytes); + + { + #pragma acc parallel loop collapse(2) deviceptr(in,out) + for (int i=0; i(i+j); + out[i*n+j] = 0.0; + } + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + + stencil(n, tile_size, in, out); + + #pragma acc parallel loop collapse(2) deviceptr(in) + for (int i=0; i(n-2*radius)*static_cast(n-2*radius); + // compute L1 norm in parallel + double norm = 0.0; + #pragma acc parallel loop reduction( +:norm ) deviceptr(out) + for (int i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/stencil_openacc.hpp b/Cxx11/stencil_openacc.hpp new file mode 100644 index 000000000..523cda771 --- /dev/null +++ b/Cxx11/stencil_openacc.hpp @@ -0,0 +1,397 @@ +#define RESTRICT __restrict__ + +void star1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { + PRAGMA( acc parallel loop collapse(2) deviceptr(in,out) ) + for (int i=1; i <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OpenMP TARGET Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + try { + if (argc < 3) { + throw "Usage: <# iterations> [tile size]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? std::atoi(argv[3]) : order; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double trans_time{0}; + + size_t bytes = order*order*sizeof(double); + double * restrict A = (double *)acc_malloc(bytes); + double * restrict B = (double *)acc_malloc(bytes); + + { + #pragma acc parallel loop deviceptr(A,B) + for (int i=0;i(i*order+j); + B[i*order+j] = 0.0; + } + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) + for (int i=0;i(ij)*(1.+iterations)+addit; + abserr += prk::abs(B[ji] - reference); + } + } + + acc_free(A); + acc_free(B); + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + From 5af2232d7a831564832c81951d96904d3a93c159 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 12:36:49 +0300 Subject: [PATCH 239/325] cleanup --- C1z/nstream-openacc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C1z/nstream-openacc.c b/C1z/nstream-openacc.c index 94985da56..ba4e587af 100644 --- a/C1z/nstream-openacc.c +++ b/C1z/nstream-openacc.c @@ -143,7 +143,7 @@ int main(int argc, char * argv[]) ar *= length; double asum = 0.0; - #pragma acc parallel loop reduction( +:asum ) deviceptr(A,B,C) + #pragma acc parallel loop reduction( +:asum ) deviceptr(A) for (size_t i=0; i Date: Wed, 18 May 2022 03:45:59 -0700 Subject: [PATCH 240/325] fix validation --- Cxx11/dgemm-multigpu-cublas.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/Cxx11/dgemm-multigpu-cublas.cu b/Cxx11/dgemm-multigpu-cublas.cu index 160a9d12c..439f33a9b 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -153,7 +153,7 @@ int main(int argc, char * argv[]) std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; prk::CUDA::info info; - info.print(); + //info.print(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -306,18 +306,18 @@ int main(int argc, char * argv[]) double residuum(0); for (int i=0; i Date: Wed, 18 May 2022 03:47:03 -0700 Subject: [PATCH 241/325] print --- Cxx11/dgemm-multigpu-cublas.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/dgemm-multigpu-cublas.cu b/Cxx11/dgemm-multigpu-cublas.cu index 439f33a9b..18a039425 100644 --- a/Cxx11/dgemm-multigpu-cublas.cu +++ b/Cxx11/dgemm-multigpu-cublas.cu @@ -153,7 +153,7 @@ int main(int argc, char * argv[]) std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; prk::CUDA::info info; - //info.print(); + info.print(); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters From b002f2829a2149b7989011e04339954b510485f3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 03:47:32 -0700 Subject: [PATCH 242/325] update --- common/make.defs.nvhpc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index 38438888b..fa4b59e8b 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -1,6 +1,6 @@ # # This file shows the NVHPC toolchain options. -NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11 +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/22.2 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/ @@ -74,7 +74,7 @@ CBLASFLAG=${BLASFLAG} NVCC=${NVHPC_CBIN}nvcc CUDAFLAGS=-g -O3 -std=c++17 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_75 +CUDAFLAGS+=--gpu-architecture=sm_80 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' @@ -106,8 +106,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED # MPI-3 # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
-MPIDIR=${NVHPC_PATH}/comm_libs/openmpi/openmpi-3.1.5 -#MPIDIR=${NVHPC_PATH}/comm_libs/openmpi4/openmpi-4.0.5 +MPIDIR=${NVHPC_PATH}/comm_libs/hpcx/latest/ompi MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From 06f16806267f74daa87d74b2d7e07b4372218e43 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 17:50:36 +0300 Subject: [PATCH 243/325] transpose cleanup --- FORTRAN/nstream-mpi.F90 | 3 +- FORTRAN/transpose-openacc.F90 | 119 ++++++-------------------- FORTRAN/transpose-openmp-target.F90 | 69 +++------------ FORTRAN/transpose-openmp.F90 | 63 +++----------- FORTRAN/transpose-pointer.F90 | 52 ++--------- FORTRAN/transpose-pretty.F90 | 51 +++-------- FORTRAN/transpose-stdpar.F90 | 71 +++------------ FORTRAN/transpose-taskloop-openmp.F90 | 65 +++----------- FORTRAN/transpose-tasks-openmp.F90 | 66 +++----------- FORTRAN/transpose.F90 | 10 +-- 10 files changed, 118 insertions(+), 451 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 2f4e58937..66ba8d30c 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -139,7 +139,8 @@ program main !$omp parallel default(none) & !$omp& shared(A,B,C,nstream_time) & !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k,t0,t1) + !$omp& private(i,k,t0,t1) & + !$omp& shared(MPI_COMM_WORLD) #endif #if defined(_OPENMP) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 02ab0ab9d..1a0a69fe9 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -50,16 +50,14 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -80,120 +78,50 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a41)') 'Fortran OpenACC Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + write(*,'(a22,i8)') 'Tile size = ', tile_size ! ******************************************************************** ! 
** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif - - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 - if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo - enddo - enddo - enddo - else - !$acc parallel loop collapse(2) - do j=1,order - do i=1,order - A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) - B(i,j) = 0.0 - enddo + !$acc data create(A,B) + + !$acc parallel loop collapse(2) + do j=1,order + do i=1,order + A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) + B(i,j) = 0 enddo - endif + enddo - !$acc data pcopyin(A) pcopy(B) do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() - ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix - if (tile_size.lt.order) then - !$acc parallel loop gang collapse(2) - do jt=1,order,tile_size - do it=1,order,tile_size - !$acc loop vector collapse(2) - do j=jt,min(order,jt+tile_size-1) - do i=it,min(order,it+tile_size-1) - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo - enddo - enddo - enddo - else - !$acc parallel loop collapse(2) - do j=1,order - do i=1,order - B(j,i) = B(j,i) + A(i,j) - A(i,j) = A(i,j) + 1.0 - enddo + !$acc parallel loop tile(tile_size,tile_size) + do j=1,order + do i=1,order + B(j,i) = B(j,i) + A(i,j) + A(i,j) = A(i,j) + 1.0 enddo - endif + enddo enddo ! iterations t1 = prk_get_wtime() - !$acc end data - trans_time = t1 - t0 ! ******************************************************************** @@ -212,8 +140,9 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + !$acc end data + + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-openmp-target.F90 b/FORTRAN/transpose-openmp-target.F90 index 4aa431b18..a8b75a245 100644 --- a/FORTRAN/transpose-openmp-target.F90 +++ b/FORTRAN/transpose-openmp-target.F90 @@ -50,16 +50,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -81,66 +80,23 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a47)') 'Fortran OpenMP TARGET Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size.gt.order).or.(tile_size.lt.1)) then - tile_size = order - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - if (tile_size.lt.order) then - if (mod(order,tile_size).ne.0) then - write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' - stop 1 - endif - if (tile_size.gt.32) then - write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' - stop 1 - endif + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif @@ -223,8 +179,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-openmp.F90 b/FORTRAN/transpose-openmp.F90 index 93dab50a8..d88d470ff 100644 --- a/FORTRAN/transpose-openmp.F90 +++ b/FORTRAN/transpose-openmp.F90 @@ -50,16 +50,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -80,63 +79,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran OpenMP Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - t0 = 0 - !$omp parallel default(none) & !$omp& shared(A,B,t0,t1) & !$omp& firstprivate(order,iterations,tile_size) & @@ -172,6 +135,8 @@ program main !$omp end do endif + t0 = 0 + ! need this because otherwise no barrier between initialization ! and iteration 0 (warmup), which will lead to incorrectness. !$omp barrier diff --git a/FORTRAN/transpose-pointer.F90 b/FORTRAN/transpose-pointer.F90 index 87c3eaac1..b576d5e36 100644 --- a/FORTRAN/transpose-pointer.F90 +++ b/FORTRAN/transpose-pointer.F90 @@ -57,10 +57,7 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! 
order of a the matrix @@ -83,38 +80,14 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Serial Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** @@ -130,10 +103,6 @@ program main A(1:order,1:order) => TA B(1:order,1:order) => TB - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 if (tile_size.lt.order) then @@ -158,9 +127,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + if (k.eq.1) t0 = prk_get_wtime() ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then @@ -204,8 +171,7 @@ program main enddo enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-pretty.F90 b/FORTRAN/transpose-pretty.F90 index 885c4ac3d..6eff0820d 100644 --- a/FORTRAN/transpose-pretty.F90 +++ b/FORTRAN/transpose-pretty.F90 @@ -53,13 +53,11 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix + integer(kind=INT32) :: tile_size real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix integer(kind=INT64) :: bytes ! combined size of matrices @@ -77,57 +75,37 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran Pretty Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 
1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + t0 = 0 ! Fill the original matrix o2 = int(order,INT64)**2 A = reshape((/ (j2, j2 = 0,o2) /),(/order, order/)) B = 0 - t0 = 0 - do k=0,iterations - ! start timer after a warmup iteration + if (k.eq.1) t0 = prk_get_wtime() + B = B + transpose(A) A = A + 1 enddo ! iterations @@ -155,8 +133,7 @@ program main abserr = norm2(A-B) #endif - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-stdpar.F90 b/FORTRAN/transpose-stdpar.F90 index 26c0e87f5..7faf89646 100644 --- a/FORTRAN/transpose-stdpar.F90 +++ b/FORTRAN/transpose-stdpar.F90 @@ -50,16 +50,14 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -81,76 +79,33 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a40)') 'Fortran stdpar Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a33,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a28,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - ! same default as the C implementation - tile_size = 16 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 
1).or.(tile_size.gt.order)) then - write(*,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - if ((tile_size.gt.0).and.(mod(order,tile_size).ne.0)) then - write(*,'(a50)') 'ERROR: order must be evenly divisible by tile_size' - stop 1 - endif - if ((tile_size.ne.order) .and. (tile_size.gt.32)) then - write(*,'(a50)') 'ERROR: tile_size must be less than 32 to use temp space' - stop 1 + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - - allocate( A(order,order), stat=err) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - allocate( B(order,order), stat=err ) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err - stop 1 - endif + t0 = 0 do concurrent (j=1:order, i=1:order) A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo - t0 = 0 - do k=0,iterations if (k.eq.1) t0 = prk_get_wtime() @@ -180,7 +135,6 @@ program main enddo ! iterations t1 = prk_get_wtime() - trans_time = t1 - t0 ! ******************************************************************** @@ -196,8 +150,7 @@ program main abserr = abserr + abs(B(i,j) - (temp+addit)) enddo - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-taskloop-openmp.F90 b/FORTRAN/transpose-taskloop-openmp.F90 index 3cc0fbc78..fccef232f 100644 --- a/FORTRAN/transpose-taskloop-openmp.F90 +++ b/FORTRAN/transpose-taskloop-openmp.F90 @@ -49,16 +49,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -79,61 +78,26 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a50)') 'Fortran OpenMP TASKLOOP Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a,i1)') 'argument count = ', command_argument_count() - write(*,'(a)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 !$omp parallel default(none) & @@ -160,9 +124,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = omp_get_wtime() - endif + if (k.eq.1) t0 = omp_get_wtime() !$omp taskloop firstprivate(order,tile_size) shared(A,B) private(i,j,it,jt) do jt=1,order,tile_size @@ -211,8 +173,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose-tasks-openmp.F90 b/FORTRAN/transpose-tasks-openmp.F90 index a0ac9afb9..7cce694ba 100644 --- a/FORTRAN/transpose-tasks-openmp.F90 +++ b/FORTRAN/transpose-tasks-openmp.F90 @@ -49,16 +49,15 @@ ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 +! ! ******************************************************************* program main use, intrinsic :: iso_fortran_env use omp_lib + use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix @@ -79,61 +78,27 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a46)') 'Fortran OpenMP TASKS Matrix transpose: B = A^T' - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> []' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling + write(*,'(a22,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + if (tile_size.ne.order) then + write(*,'(a22,i8)') 'Tile size = ', tile_size + else + write(*,'(a10)') 'Tiling off' endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** - allocate( A(order,order), stat=err) - if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of A returned ',err - stop 1 - endif - - allocate( B(order,order), stat=err ) + allocate( A(order,order), B(order,order), stat=err) if (err .ne. 0) then - write(*,'(a,i3)') 'allocation of B returned ',err + write(*,'(a,i3)') 'allocation returned ',err stop 1 endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - write(*,'(a,i8)') 'Tile size = ', tile_size - t0 = 0 !$omp parallel default(none) & @@ -160,9 +125,7 @@ program main do k=0,iterations - if (k.eq.1) then - t0 = omp_get_wtime() - endif + if (k.eq.1) t0 = omp_get_wtime() do jt=1,order,tile_size !$omp task firstprivate(order,tile_size,jt) shared(A,B) private(i,j,it) @@ -211,8 +174,7 @@ program main enddo !$omp end parallel do - deallocate( B ) - deallocate( A ) + deallocate( A,B ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' diff --git a/FORTRAN/transpose.F90 b/FORTRAN/transpose.F90 index 4e398a1bf..56fb6ab26 100644 --- a/FORTRAN/transpose.F90 +++ b/FORTRAN/transpose.F90 @@ -98,6 +98,8 @@ program main stop 1 endif + t0 = 0 + if (tile_size.lt.order) then do jt=1,order,tile_size do it=1,order,tile_size @@ -118,12 +120,9 @@ program main enddo endif - t0 = 0 - do k=0,iterations - if (k.eq.1) then - t0 = prk_get_wtime() - endif + + if (k.eq.1) t0 = prk_get_wtime() ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then @@ -149,7 +148,6 @@ program main enddo ! iterations t1 = prk_get_wtime() - trans_time = t1 - t0 ! 
******************************************************************** From 359da8eabe2fd842ef9d1611d63a73bfe085c902 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 18:05:13 +0300 Subject: [PATCH 244/325] fix arg parse in MPI --- FORTRAN/transpose-a2a-mpi.F90 | 35 ++++------------------ FORTRAN/transpose-acc-mpi.F90 | 34 ++++------------------ FORTRAN/transpose-ga.F90 | 55 +++++++++++------------------------ FORTRAN/transpose-get-mpi.F90 | 34 ++++------------------ FORTRAN/transpose-p2p-mpi.F90 | 34 ++++------------------ 5 files changed, 37 insertions(+), 155 deletions(-) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c121b037a..c38158397 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -90,8 +90,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -101,8 +99,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, r, lo, hi - !integer(kind=INT32) :: it, jt, tile_size + integer(kind=INT32) :: i, j, k, r, lo, hi, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -118,38 +115,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 9023a006f..2b49bb0bd 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -91,8 +91,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -104,7 +102,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! 
runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi, tile_size !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -123,38 +121,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-ga.F90 b/FORTRAN/transpose-ga.F90 index 8d81c038d..5e2fde45f 100644 --- a/FORTRAN/transpose-ga.F90 +++ b/FORTRAN/transpose-ga.F90 @@ -55,14 +55,12 @@ program main use, intrinsic :: iso_fortran_env use mpi_f08 + use prk implicit none #include "global.fh" #include "mafdecls.fh" !#include 'ga-mpi.fh' ! unused - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! MPI - should always use 32-bit INTEGER integer(kind=INT32), parameter :: requested = MPI_THREAD_SERIALIZED integer(kind=INT32) :: provided @@ -86,33 +84,7 @@ program main real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 - ! ******************************************************************** - ! read and test input parameters - ! ******************************************************************** - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - stop 1 - endif - - call mpi_init_thread(requested,provided) + call MPI_Init_thread(requested,provided) !call ga_initialize() ! 
ask GA to allocate enough memory for 4 matrices, just to be safe @@ -124,6 +96,21 @@ program main !if (me.eq.0) print*,'max_mem=',max_mem + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a47)') 'Fortran Global Arrays Matrix transpose: B = A^T' + write(*,'(a22,i8)') 'Number of GA procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order + endif + call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) + #if PRK_CHECK_GA_MPI ! We do not use MPI anywhere, but if we did, we would need to avoid MPI collectives ! on the world communicator, because it is possible for that to be larger than @@ -140,14 +127,6 @@ program main endif #endif - if (me.eq.0) then - write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a47)') 'Fortran Global Arrays Matrix transpose: B = A^T' - write(*,'(a22,i12)') 'Number of GA procs = ', np - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix order = ', order - endif - call ga_sync() ! ******************************************************************** diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index b153117ca..ebab0c406 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -91,8 +91,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -104,7 +102,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, q, r, lo, hi + integer(kind=INT32) :: i, j, k, q, r, lo, hi, tile_size !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime @@ -123,38 +121,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 3d72cb36c..b18c3b64f 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -90,8 +90,6 @@ program main implicit none ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations integer(kind=INT32) :: order, block_order @@ -101,7 +99,7 @@ program main real(kind=REAL64), parameter :: one=1.0d0 ! runtime variables integer(kind=INT64) :: bytes - integer(kind=INT32) :: i, j, k, lo, hi, q + integer(kind=INT32) :: i, j, k, lo, hi, q, tile_size real(kind=REAL64) :: abserr, addit, temp real(kind=REAL64) :: t0, t1, trans_time, avgtime real(kind=REAL64), parameter :: epsilon=1.d-8 @@ -118,38 +116,16 @@ program main ! ******************************************************************** if (me.eq.0) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' - - if (command_argument_count().lt.2) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./transpose <# iterations> ' - call MPI_Abort(MPI_COMM_WORLD, command_argument_count()) - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations - call MPI_Abort(MPI_COMM_WORLD, 2) - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 
1) then - write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order - call MPI_Abort(MPI_COMM_WORLD, 3) - endif + write(*,'(a22,i8)') 'Number of MPI procs = ', np + write(*,'(a22,i8)') 'Number of iterations = ', iterations + write(*,'(a22,i8)') 'Matrix order = ', order if (mod(order,np).ne.0) then write(*,'(a,2i5)') 'ERROR: order must an integer multiple of np : ', order,np call MPI_Abort(MPI_COMM_WORLD, 4) endif - - write(*,'(a23,i8)') 'Number of MPI procs = ', np - write(*,'(a23,i8)') 'Number of iterations = ', iterations - write(*,'(a23,i8)') 'Matrix order = ', order endif call MPI_Bcast(iterations, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) From a73e3cec55ffe4203002e639df8fabc8454fc09e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 19:06:18 +0300 Subject: [PATCH 245/325] print fix --- FORTRAN/transpose-a2a-mpi.F90 | 2 +- FORTRAN/transpose-acc-mpi.F90 | 2 +- FORTRAN/transpose-get-mpi.F90 | 2 +- FORTRAN/transpose-p2p-mpi.F90 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/FORTRAN/transpose-a2a-mpi.F90 b/FORTRAN/transpose-a2a-mpi.F90 index c38158397..72a55c797 100644 --- a/FORTRAN/transpose-a2a-mpi.F90 +++ b/FORTRAN/transpose-a2a-mpi.F90 @@ -117,7 +117,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-acc-mpi.F90 b/FORTRAN/transpose-acc-mpi.F90 index 2b49bb0bd..6ac96b7cf 100644 --- a/FORTRAN/transpose-acc-mpi.F90 +++ b/FORTRAN/transpose-acc-mpi.F90 @@ -123,7 +123,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-get-mpi.F90 b/FORTRAN/transpose-get-mpi.F90 index ebab0c406..96a5470d5 100644 --- a/FORTRAN/transpose-get-mpi.F90 +++ b/FORTRAN/transpose-get-mpi.F90 @@ -123,7 +123,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index b18c3b64f..b7fc14605 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -118,7 +118,7 @@ program main if (me.eq.0) then call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(*,'(a25)') 'Parallel Research Kernels' - write(*,'(a36)') 'Fortran MPI Matrix transpose: B = A^T' + write(*,'(a37)') 'Fortran MPI Matrix transpose: B = A^T' write(*,'(a22,i8)') 'Number of MPI procs = ', np write(*,'(a22,i8)') 'Number of iterations = 
', iterations write(*,'(a22,i8)') 'Matrix order = ', order From 908e3a304f5e70d346f7f0c1c6100245ded5439c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 18 May 2022 19:12:56 +0300 Subject: [PATCH 246/325] args --- FORTRAN/transpose-coarray.F90 | 57 +++++------------------------------ 1 file changed, 8 insertions(+), 49 deletions(-) diff --git a/FORTRAN/transpose-coarray.F90 b/FORTRAN/transpose-coarray.F90 index bc15f1238..08526a1bb 100644 --- a/FORTRAN/transpose-coarray.F90 +++ b/FORTRAN/transpose-coarray.F90 @@ -58,10 +58,7 @@ program main use, intrinsic :: iso_fortran_env use prk implicit none - ! for argument parsing integer :: err - integer :: arglen - character(len=32) :: argtmp integer :: me, np logical :: printer ! problem definition @@ -90,37 +87,18 @@ program main ! ******************************************************************** if (printer) then + call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) write(6,'(a25)') 'Parallel Research Kernels' write(6,'(a41)') 'Fortran coarray Matrix transpose: B = A^T' + write(6,'(a23,i8)') 'Number of images = ', np + write(6,'(a23,i8)') 'Number of iterations = ', iterations + write(6,'(a23,i8)') 'Matrix order = ', order + write(6,'(a23,i8)') 'Tile size = ', tile_size endif + call co_broadcast(iterations,1) + call co_broadcast(order,1) + call co_broadcast(tile_size,1) - if (command_argument_count().lt.2) then - if (printer) then - write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(6,'(a62)') 'Usage: ./transpose <# iterations> []' - endif - stop 1 - endif - - iterations = 1 - call get_command_argument(1,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') iterations - if (iterations .lt. 1) then - if (printer) then - write(6,'(a35,i5)') 'ERROR: iterations must be >= 1 : ', iterations - endif - stop 1 - endif - - order = 1 - call get_command_argument(2,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') order - if (order .lt. 1) then - if (printer) then - write(6,'(a30,i5)') 'ERROR: order must be >= 1 : ', order - endif - stop 1 - endif if (modulo(order,np).gt.0) then if (printer) then write(6,'(a20,i5,a35,i5)') 'ERROR: matrix order ',order,& @@ -130,18 +108,6 @@ program main endif block_order = order/np - ! same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(6,'(a20,i5,a22,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif - ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** @@ -152,13 +118,6 @@ program main stop 1 endif - if (printer) then - write(6,'(a23,i8)') 'Number of images = ', np - write(6,'(a23,i8)') 'Number of iterations = ', iterations - write(6,'(a23,i8)') 'Matrix order = ', order - write(6,'(a23,i8)') 'Tile size = ', tile_size - endif - ! initialization ! 
local column index j corresponds to global column index block_order*me+j if ((tile_size.gt.1).and.(tile_size.lt.order)) then From f465bed8fa7f3d5ef319f0102f9b24f17582e688 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Thu, 2 Jun 2022 09:47:05 -0600 Subject: [PATCH 247/325] Fix typos in README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 14a059365..4fbb49dd8 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ executed on many computing systems. These programs should not be used as benchmarks. They are operations to explore features of a hardware platform, but they do not define fixed problems that can be used to rank systems. Furthermore -they have not been optimimzed for the features of any particular system. +they have not been optimized for the features of any particular system. # Build Instructions @@ -51,7 +51,7 @@ If you are looking for the simplest option, try `make.defs.gcc`. | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | -Some of the C++ implementations require you to install Boost, RAJA, KOKKOS, Parallel STL, respectively, +Some of the C++ implementations require you to install Boost, RAJA, Kokkos, Parallel STL, respectively, and then modify `make.defs` appropriately. Please see the documentation in the [documentation](https://github.com/ParRes/Kernels/tree/default/doc) (`doc`) subdirectory. @@ -215,7 +215,7 @@ be used unless a `make veryclean` has been issued. ## Individual make -Descend into the desired sub-tree and cd to the kernel(s) of interest. +Descend into the desired sub-tree and `cd` to the kernel(s) of interest. Each kernel has its own Makefile. There are a number of parameters that determine the behavior of the kernel that need to be known at compile time. These are explained succinctly in the Makefile itself. 
Edit From 00e68f8ddb482149255fc3bf77d878e8b2ff1bc5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:11:01 +0300 Subject: [PATCH 248/325] mpifort required for prk_mpi_mod --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4faa6796a..6d3b0c1f1 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -99,7 +99,7 @@ prk.mod prk_mod.o: prk_mod.F90 $(FC) $(FCFLAGS) -c $< -o prk_mod.o prk_mpi.mod prk_mpi_mod.o: prk_mpi.F90 - $(FC) $(FCFLAGS) -c $< -o prk_mpi_mod.o + $(MPIFORT) $(FCFLAGS) -c $< -o prk_mpi_mod.o stencil: stencil.F90 prk.mod $(FC) $(FCFLAGS) -c stencil_serial.F90 From c89329dd810f315f96c3d6ba15a13d6e922e7ee8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:13:36 +0300 Subject: [PATCH 249/325] default(none) and MPI_COMM_WORLD cannot coexist --- FORTRAN/nstream-mpi.F90 | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/FORTRAN/nstream-mpi.F90 b/FORTRAN/nstream-mpi.F90 index 66ba8d30c..aa6c6b408 100644 --- a/FORTRAN/nstream-mpi.F90 +++ b/FORTRAN/nstream-mpi.F90 @@ -136,11 +136,10 @@ program main scalar = 3 #ifdef _OPENMP - !$omp parallel default(none) & + !$omp parallel & !$omp& shared(A,B,C,nstream_time) & !$omp& firstprivate(length,iterations,scalar) & - !$omp& private(i,k,t0,t1) & - !$omp& shared(MPI_COMM_WORLD) + !$omp& private(i,k,t0,t1) #endif #if defined(_OPENMP) From 83bad194530f9f3778cff68c9fcfd68d15e64146 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:14:08 +0300 Subject: [PATCH 250/325] update for homebrew --- common/make.defs.gcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 05a06c0ee..afcf1a6ae 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -205,7 +205,7 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.1_2 +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.4 MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From eeabb8c15c97797ec1840e9b8adc15c74e53ea9a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 16 Jun 2022 12:38:22 +0300 Subject: [PATCH 251/325] replace non-constant tiling with automatic tiling non-constant tiling was supported by NVHPC but not GCC, and was not standard anyway. --- FORTRAN/stencil-openacc.F90 | 98 +++++++++-------------------------- FORTRAN/transpose-openacc.F90 | 7 ++- 2 files changed, 28 insertions(+), 77 deletions(-) diff --git a/FORTRAN/stencil-openacc.F90 b/FORTRAN/stencil-openacc.F90 index a5543e5f3..da660dd22 100644 --- a/FORTRAN/stencil-openacc.F90 +++ b/FORTRAN/stencil-openacc.F90 @@ -61,82 +61,42 @@ ! ! 
******************************************************************* -subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) +subroutine apply_stencil(is_star,r,n,W,A,B) use, intrinsic :: iso_fortran_env implicit none - logical, intent(in) :: is_star, tiling - integer(kind=INT32), intent(in) :: tile_size, r, n + logical, intent(in) :: is_star + integer(kind=INT32), intent(in) :: r, n real(kind=REAL64), intent(in) :: W(-r:r,-r:r) real(kind=REAL64), intent(in) :: A(n,n) real(kind=REAL64), intent(inout) :: B(n,n) - integer(kind=INT32) :: i, j, ii, jj, it, jt + integer(kind=INT32) :: i, j, ii, jj !$acc data pcopyin(W,A) pcopy(B) if (is_star) then - if (.not.tiling) then - !$acc parallel loop collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo + !$acc parallel loop tile(*,*) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) enddo - enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(2) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(0,jj) * A(i+1,j+jj+1) - enddo - do ii=-r,-1 - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - do ii=1,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) - enddo - enddo - enddo + do ii=-r,-1 + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo - enddo - endif ! tiling - else ! grid - if (.not.tiling) then - !$acc parallel loop collapse(2) - do j=r,n-r-1 - do i=r,n-r-1 - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo + do ii=1,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,0) * A(i+ii+1,j+1) enddo enddo - else ! tiling - !$acc parallel loop gang collapse(2) - do jt=r,n-r-1,tile_size - do it=r,n-r-1,tile_size - !$acc loop vector collapse(2) - do j=jt,min(n-r-1,jt+tile_size-1) - do i=it,min(n-r-1,it+tile_size-1) - do jj=-r,r - do ii=-r,r - B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) - enddo - enddo - enddo + enddo + else ! grid + !$acc parallel loop tile(*,*) + do j=r,n-r-1 + do i=r,n-r-1 + do jj=-r,r + do ii=-r,r + B(i+1,j+1) = B(i+1,j+1) + W(ii,jj) * A(i+ii+1,j+jj+1) enddo enddo enddo - endif ! tiling + enddo endif ! star !$acc end data end subroutine apply_stencil @@ -150,8 +110,6 @@ program main integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm integer(kind=INT32) :: n ! linear grid dimension integer(kind=INT32) :: stencil_size ! number of points in stencil - integer(kind=INT32) :: tile_size ! loop nest block factor - logical :: tiling ! boolean indication loop nest blocking logical :: is_star ! true = star, false = grid integer(kind=INT32), parameter :: r=RADIUS ! radius of stencil real(kind=REAL64) :: W(-r:r,-r:r) ! weights of points in the stencil @@ -172,7 +130,7 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a44)') 'Fortran OpenACC Stencil execution on 2D grid' - call prk_get_arguments('stencil',iterations=iterations,order=n,tile_size=tile_size) + call prk_get_arguments('stencil',iterations=iterations,order=n) ! TODO: parse runtime input for star/grid #ifdef STAR @@ -181,8 +139,6 @@ program main is_star = .false. 
#endif - tiling = (tile_size.ne.n) - write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Grid size = ', n write(*,'(a22,i8)') 'Radius of stencil = ', r @@ -193,11 +149,7 @@ program main write(*,'(a22,a8)') 'Type of stencil = ','grid' stencil_size = (2*r+1)**2 endif - if (tiling) then - write(*,'(a22,i8)') 'Tile size = ', tile_size - else - write(*,'(a10)') 'Tiling off' - endif + write(*,'(a32)') 'Tile size = automatic' ! ******************************************************************** ! ** Allocate space for the input and perform the computation @@ -228,7 +180,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() ! Apply the stencil operator - call apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) + call apply_stencil(is_star,r,n,W,A,B) ! add constant to solution to force refresh of neighbor data, if any !$acc parallel loop collapse(2) diff --git a/FORTRAN/transpose-openacc.F90 b/FORTRAN/transpose-openacc.F90 index 1a0a69fe9..ad242cbfb 100644 --- a/FORTRAN/transpose-openacc.F90 +++ b/FORTRAN/transpose-openacc.F90 @@ -66,7 +66,6 @@ program main integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k - integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -78,11 +77,11 @@ program main write(*,'(a25)') 'Parallel Research Kernels' write(*,'(a41)') 'Fortran OpenACC Matrix transpose: B = A^T' - call prk_get_arguments('transpose',iterations=iterations,order=order,tile_size=tile_size) + call prk_get_arguments('transpose',iterations=iterations,order=order) write(*,'(a22,i8)') 'Number of iterations = ', iterations write(*,'(a22,i8)') 'Matrix order = ', order - write(*,'(a22,i8)') 'Tile size = ', tile_size + write(*,'(a32)') 'Tile size = automatic' ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -110,7 +109,7 @@ program main if (k.eq.1) t0 = prk_get_wtime() - !$acc parallel loop tile(tile_size,tile_size) + !$acc parallel loop tile(*,*) do j=1,order do i=1,order B(j,i) = B(j,i) + A(i,j) From dd8308e00648a3b402ccdd613cfd2ee5bbae5c09 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 23 Jun 2022 22:06:52 +0300 Subject: [PATCH 252/325] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4fbb49dd8..994639a7e 100644 --- a/README.md +++ b/README.md @@ -38,15 +38,16 @@ If you are looking for the simplest option, try `make.defs.gcc`. | File (in `./common/`) | Environment | |----------------------|-------------------------| -| `make.defs.cray` | Cray compilers on Cray XC systems. | +| `make.defs.cray` | Cray toolchain (rarely tested). | | `make.defs.cuda` | GCC with the CUDA compiler (only used in C++/CUDA implementation). | -| `make.defs.gcc` | GCC compiler tool chain, which supports essentially all implementations. | +| `make.defs.gcc` | GCC compiler toolchain, which supports essentially all implementations (tested often). | | `make.defs.freebsd` | FreeBSD (rarely tested). | | `make.defs.ibmbg` | IBM Blue Gene/Q compiler toolchain (deprecated). | -| `make.defs.ibmp9nv` | IBM compilers for POWER9 and NVIDIA Volta platforms. | -| `make.defs.intel` | Intel compiler tool chain, which supports most implementations. 
| -| `make.defs.llvm` | LLVM compiler tool chain, which supports most implementations. | -| `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which is required to use C11 threads. | +| `make.defs.ibmp9nv` | IBM compilers for POWER9 and NVIDIA Volta platforms (rarely tested). | +| `make.defs.intel` | Intel Parallel Studio toolchain, which supports most implementations (tested often). | +| `make.defs.llvm` | LLVM compiler toolchain, which supports most implementations (tested often). | +| `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which was required to use C11 threads. | +| `make.defs.nvhpc` | NVIDIA HPC compiler tool chain, which supports most implementations (tested often). | | `make.defs.oneapi` | Intel oneAPI (https://software.intel.com/oneapi/hpc-kit). | | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | From 029c003c20a5ffaf522708685e673200f7c5de9e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 23 Jun 2022 22:31:00 +0300 Subject: [PATCH 253/325] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 994639a7e..4c0852d4d 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,8 @@ If you are looking for the simplest option, try `make.defs.gcc`. | `make.defs.intel` | Intel Parallel Studio toolchain, which supports most implementations (tested often). | | `make.defs.llvm` | LLVM compiler toolchain, which supports most implementations (tested often). | | `make.defs.musl` | GCC compiler toolchain with MUSL as the C standard library, which was required to use C11 threads. | -| `make.defs.nvhpc` | NVIDIA HPC compiler tool chain, which supports most implementations (tested often). | -| `make.defs.oneapi` | Intel oneAPI (https://software.intel.com/oneapi/hpc-kit). | +| `make.defs.nvhpc` | [NVIDIA HPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-downloads), which supports most implementations (tested often). | +| `make.defs.oneapi` | Intel [oneAPI](https://software.intel.com/oneapi/hpc-kit). | | `make.defs.pgi` | PGI compiler toolchain (infrequently tested). | | `make.defs.hip` | HIP compiler toolchain (infrequently tested). | From b1699023fb8d491c8a17d485dc57814b2e799005 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 4 Oct 2022 13:45:38 +0300 Subject: [PATCH 254/325] update OpenCL C++ header this ancient one does not compile with ICPX or Clang when C++20 is enabled --- Cxx11/{cl2.hpp => opencl.hpp} | 706 +++++++++++++++++++++------------- Cxx11/prk_opencl.h | 2 +- 2 files changed, 429 insertions(+), 279 deletions(-) rename Cxx11/{cl2.hpp => opencl.hpp} (94%) diff --git a/Cxx11/cl2.hpp b/Cxx11/opencl.hpp similarity index 94% rename from Cxx11/cl2.hpp rename to Cxx11/opencl.hpp index 09e295ec5..1e61d7890 100644 --- a/Cxx11/cl2.hpp +++ b/Cxx11/opencl.hpp @@ -1,36 +1,23 @@ -/******************************************************************************* - * Copyright (c) 2008-2016 The Khronos Group Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS - * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS - * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT - * https://www.khronos.org/registry/ - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. - ******************************************************************************/ +// +// Copyright (c) 2008-2020 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// /*! \file * - * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), - * OpenCL 1.2 (rev 15), OpenCL 2.0 (rev 29), OpenCL 2.1 (rev 17), - * and OpenCL 2.2 (V2.2-11). + * \brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2, + * OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0. * \author Lee Howes and Bruce Merry * * Derived from the OpenCL 1.x C++ bindings written by @@ -73,10 +60,10 @@ * For many large applications C++ is the language of choice and so it seems * reasonable to define C++ bindings for OpenCL. * - * The interface is contained with a single C++ header file \em cl2.hpp and all + * The interface is contained with a single C++ header file \em opencl.hpp and all * definitions are contained within the namespace \em cl. There is no additional * requirement to include \em cl.h and to use either the C++ or original C - * bindings; it is enough to simply include \em cl2.hpp. + * bindings; it is enough to simply include \em opencl.hpp. * * The bindings themselves are lightweight and correspond closely to the * underlying C API. Using the C++ bindings introduces no additional execution @@ -85,7 +72,7 @@ * There are numerous compatibility, portability and memory management * fixes in the new header as well as additional OpenCL 2.0 features. * As a result the header is not directly backward compatible and for this - * reason we release it as cl2.hpp rather than a new version of cl.hpp. 
+ * reason we release it as opencl.hpp rather than a new version of cl.hpp. * * * \section compatibility Compatibility @@ -157,30 +144,26 @@ * - CL_HPP_NO_STD_STRING * * Do not use the standard library string class. cl::string is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_VECTOR * * Do not use the standard library vector class. cl::vector is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_ARRAY * * Do not use the standard library array class. cl::array is not - * defined and may be defined by the user before cl2.hpp is + * defined and may be defined by the user before opencl.hpp is * included. * * - CL_HPP_NO_STD_UNIQUE_PTR * * Do not use the standard library unique_ptr class. cl::pointer and * the cl::allocate_pointer functions are not defined and may be - * defined by the user before cl2.hpp is included. - * - * - CL_HPP_ENABLE_DEVICE_FISSION - * - * Enables device fission for OpenCL 1.2 platforms. + * defined by the user before opencl.hpp is included. * * - CL_HPP_ENABLE_EXCEPTIONS * @@ -207,10 +190,22 @@ * applies to use of cl::Program construction and other program * build variants. * + * - CL_HPP_USE_CL_DEVICE_FISSION + * + * Enable the cl_ext_device_fission extension. + * + * - CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR + * + * Enable the cl_khr_image2d_from_buffer extension. + * * - CL_HPP_USE_CL_SUB_GROUPS_KHR * * Enable the cl_khr_subgroups extension. * + * - CL_HPP_USE_DX_INTEROP + * + * Enable the cl_khr_d3d10_sharing extension. + * * - CL_HPP_USE_IL_KHR * * Enable the cl_khr_il_program extension. @@ -222,12 +217,16 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. + * + * Note: the C++ bindings use std::call_once and therefore may need to be + * compiled using special command-line options (such as "-pthread") on some + * platforms! * * \code #define CL_HPP_ENABLE_EXCEPTIONS #define CL_HPP_TARGET_OPENCL_VERSION 200 - #include + #include #include #include #include @@ -237,28 +236,30 @@ int main(void) { - // Filter for a 2.0 platform and set it as the default + // Filter for a 2.0 or newer platform and set it as the default std::vector platforms; cl::Platform::get(&platforms); cl::Platform plat; for (auto &p : platforms) { std::string platver = p.getInfo(); - if (platver.find("OpenCL 2.") != std::string::npos) { + if (platver.find("OpenCL 2.") != std::string::npos || + platver.find("OpenCL 3.") != std::string::npos) { + // Note: an OpenCL 3.x platform may not support all required features! 
plat = p; } } - if (plat() == 0) { - std::cout << "No OpenCL 2.0 platform found."; + if (plat() == 0) { + std::cout << "No OpenCL 2.0 or newer platform found.\n"; return -1; } cl::Platform newP = cl::Platform::setDefault(plat); if (newP != plat) { - std::cout << "Error setting default platform."; + std::cout << "Error setting default platform.\n"; return -1; } - // Use C++11 raw string literals for kernel source code + // C++11 raw string literal for the first kernel std::string kernel1{R"CLC( global int globalA; kernel void updateGlobal() @@ -266,6 +267,8 @@ globalA = 75; } )CLC"}; + + // Raw string literal for the second kernel std::string kernel2{R"CLC( typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, @@ -292,8 +295,9 @@ } )CLC"}; - // New simpler string interface style - std::vector programStrings {kernel1, kernel2}; + std::vector programStrings; + programStrings.push_back(kernel1); + programStrings.push_back(kernel2); cl::Program vectorAddProgram(programStrings); try { @@ -332,10 +336,9 @@ std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); - // ////////////// - // Traditional cl_mem allocations + std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); @@ -359,14 +362,8 @@ // This one was not passed as a parameter vectorAddKernel.setSVMPointers(anSVMInt); - // Hand control of coarse allocations to runtime - cl::enqueueUnmapSVM(anSVMInt); - cl::enqueueUnmapSVM(fooPointer); - cl::unmapSVM(inputB); - cl::unmapSVM(output2); - - cl_int error; - vectorAddKernel( + cl_int error; + vectorAddKernel( cl::EnqueueArgs( cl::NDRange(numElements/2), cl::NDRange(numElements/2)), @@ -377,12 +374,10 @@ 3, aPipe, defaultDeviceQueue, - error + error ); cl::copy(outputBuffer, begin(output), end(output)); - // Grab the SVM output vector using a map - cl::mapSVM(output2); cl::Device d = cl::Device::getDefault(); @@ -406,59 +401,60 @@ * both and hence work with either version of the bindings. */ #if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP) -# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") +# pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") # define CL_HPP_USE_DX_INTEROP #endif #if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION) -# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") +# pragma message("opencl.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") # define CL_HPP_USE_CL_DEVICE_FISSION #endif #if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) -# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") +# pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") # define CL_HPP_ENABLE_EXCEPTIONS #endif #if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR) -# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") +# pragma message("opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead") # define CL_HPP_NO_STD_VECTOR #endif #if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING) -# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. 
Define CL_HPP_NO_STD_STRING instead") +# pragma message("opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead") # define CL_HPP_NO_STD_STRING #endif #if defined(VECTOR_CLASS) -# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") +# pragma message("opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") #endif #if defined(STRING_CLASS) -# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") +# pragma message("opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") #endif #if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") +# pragma message("opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") # define CL_HPP_USER_OVERRIDE_ERROR_STRINGS #endif /* Warn about features that are no longer supported */ #if defined(__USE_DEV_VECTOR) -# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") +# pragma message("opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") #endif #if defined(__USE_DEV_STRING) -# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") +# pragma message("opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") #endif /* Detect which version to target */ #if !defined(CL_HPP_TARGET_OPENCL_VERSION) -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 220 (OpenCL 2.2)") -# define CL_HPP_TARGET_OPENCL_VERSION 220 +# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 300 (OpenCL 3.0)") +# define CL_HPP_TARGET_OPENCL_VERSION 300 #endif #if CL_HPP_TARGET_OPENCL_VERSION != 100 && \ CL_HPP_TARGET_OPENCL_VERSION != 110 && \ CL_HPP_TARGET_OPENCL_VERSION != 120 && \ CL_HPP_TARGET_OPENCL_VERSION != 200 && \ CL_HPP_TARGET_OPENCL_VERSION != 210 && \ - CL_HPP_TARGET_OPENCL_VERSION != 220 -# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 220") + CL_HPP_TARGET_OPENCL_VERSION != 220 && \ + CL_HPP_TARGET_OPENCL_VERSION != 300 +# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).") # undef CL_HPP_TARGET_OPENCL_VERSION -# define CL_HPP_TARGET_OPENCL_VERSION 220 +# define CL_HPP_TARGET_OPENCL_VERSION 300 #endif /* Forward target OpenCL version to C headers if necessary */ @@ -480,8 +476,9 @@ CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \ CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \ CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \ - CL_HPP_MINIMUM_OPENCL_VERSION != 220 -# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210 or 220). It will be set to 100") + CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \ + CL_HPP_MINIMUM_OPENCL_VERSION != 300 +# pragma message("opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). 
It will be set to 100") # undef CL_HPP_MINIMUM_OPENCL_VERSION # define CL_HPP_MINIMUM_OPENCL_VERSION 100 #endif @@ -541,13 +538,15 @@ #include #endif // !__APPLE__ -#if (__cplusplus >= 201103L) +#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L ) #define CL_HPP_NOEXCEPT_ noexcept #else #define CL_HPP_NOEXCEPT_ #endif -#if defined(_MSC_VER) +#if __cplusplus >= 201703L +# define CL_HPP_DEFINE_STATIC_MEMBER_ inline +#elif defined(_MSC_VER) # define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany) #elif defined(__MINGW32__) # define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany)) @@ -557,19 +556,26 @@ // Define deprecated prefixes and suffixes to ensure compilation // in case they are not pre-defined -#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) - -#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) -#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) -#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED) -#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED -#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_API_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED) + +#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#define CL_API_PREFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED) +#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) +#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED) #if !defined(CL_CALLBACK) #define CL_CALLBACK @@ -1326,13 +1332,20 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \ \ + F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \ + \ + F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \ + F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \ F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \ + F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \ F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector) \ F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector) \ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, size_type) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \ F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \ + F(cl_device_info, 
CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \ \ F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \ F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \ @@ -1352,6 +1365,14 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \ F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \ F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \ + F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \ + F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \ + F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \ + F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \ F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \ F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \ F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \ @@ -1367,17 +1388,17 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector) #define CL_HPP_PARAM_NAME_INFO_2_1_(F) \ - F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, size_type) \ + F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \ F(cl_program_info, CL_PROGRAM_IL, cl::vector) \ - F(cl_kernel_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ - F(cl_kernel_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) \ F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \ F(cl_device_info, CL_DEVICE_IL_VERSION, string) \ F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \ F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \ F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \ F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \ - F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) + F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \ + F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \ + F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type) #define CL_HPP_PARAM_NAME_INFO_2_2_(F) \ F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \ @@ -1390,6 +1411,43 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector) +#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \ + F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ + \ + F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector) + +#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr) + +#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \ + 
F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector) \ + \ + F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector) \ + F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \ + F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \ + F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector) \ + F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \ + F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector) \ + F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \ + F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \ + \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector) \ + F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector) \ + F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector) \ + F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector) + template struct param_traits {}; @@ -1418,12 +1476,15 @@ CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_) #if CL_HPP_TARGET_OPENCL_VERSION >= 220 CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 220 +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 +CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300 #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 -#if defined(CL_HPP_USE_IL_KHR) +#if defined(CL_HPP_USE_IL_KHR) && CL_HPP_TARGET_OPENCL_VERSION < 210 CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_) #endif // #if defined(CL_HPP_USE_IL_KHR) @@ -1454,6 +1515,35 @@ CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_) CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_); #endif // CL_HPP_USE_CL_DEVICE_FISSION +#if defined(cl_khr_extended_versioning) +#if CL_HPP_TARGET_OPENCL_VERSION < 300 +CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // CL_HPP_TARGET_OPENCL_VERSION < 300 +CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_) +#endif // cl_khr_extended_versioning + +#if defined(cl_khr_device_uuid) +using uuid_array = array; +using luid_array = array; +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint) +#endif + +#if defined(cl_khr_pci_bus_info) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr) +#endif + +#if defined(cl_khr_integer_dot_product) 
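A hedged sketch of how the new OpenCL 3.0 entries surface through the typed getInfo wrappers (this compiles only when CL_HPP_TARGET_OPENCL_VERSION >= 300; the default device is assumed):

    cl::Device dev = cl::Device::getDefault();
    cl_version ver = dev.getInfo<CL_DEVICE_NUMERIC_VERSION>();
    cl_bool pipes  = dev.getInfo<CL_DEVICE_PIPE_SUPPORT>();
    auto features  = dev.getInfo<CL_DEVICE_OPENCL_C_FEATURES>(); // a cl::vector of name/version records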
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr) +#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr) +#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR) +#endif // defined(cl_khr_integer_dot_product) + #ifdef CL_PLATFORM_ICD_SUFFIX_KHR CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string) #endif @@ -1461,7 +1551,6 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, strin #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) #endif - #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector) #endif @@ -1492,6 +1581,9 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUT #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) #endif +#ifdef CL_DEVICE_BOARD_NAME_AMD +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string) +#endif #ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong) @@ -1499,6 +1591,30 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_AR #ifdef CL_DEVICE_JOB_SLOTS_ARM CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint) #endif +#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield) +#endif +#ifdef CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector) +#endif +#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int) +#endif +#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint) +#endif +#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM +CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint) +#endif #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) @@ -1862,6 +1978,7 @@ class Wrapper retVal = true; #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + (void)device; return retVal; } @@ -1982,51 +2099,7 @@ inline bool 
operator!=(const Wrapper &lhs, const Wrapper &rhs) //! \endcond -using BuildLogType = vector::param_type>>; -#if defined(CL_HPP_ENABLE_EXCEPTIONS) -/** -* Exception class for build errors to carry build info -*/ -class BuildError : public Error -{ -private: - BuildLogType buildLogs; -public: - BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) - { - } - BuildLogType getBuildLog() const - { - return buildLogs; - } -}; -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - if (err != CL_SUCCESS) { - throw BuildError(err, errStr, buildLogs); - } - return err; - } -} // namespace detail - -#else -namespace detail { - static inline cl_int buildErrHandler( - cl_int err, - const char * errStr, - const BuildLogType &buildLogs) - { - (void)buildLogs; // suppress unused variable warning - (void)errStr; - return err; - } -} // namespace detail -#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) /*! \stuct ImageFormat @@ -2046,6 +2119,9 @@ struct ImageFormat : public cl_image_format image_channel_data_type = type; } + //! \brief Copy constructor. + ImageFormat(const ImageFormat &other) { *this = other; } + //! \brief Assignment operator. ImageFormat& operator = (const ImageFormat& rhs) { @@ -2187,7 +2263,7 @@ class Device : public detail::Wrapper } //! \brief Wrapper for clGetDeviceInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -2299,7 +2375,7 @@ class Device : public detail::Wrapper const cl_device_partition_property_ext * /* properties */, cl_uint /*num_entries*/, cl_device_id * /*out_devices*/, - cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + cl_uint * /*num_devices*/ ) CL_API_SUFFIX__VERSION_1_1; static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT); @@ -2333,6 +2409,52 @@ class Device : public detail::Wrapper #endif // defined(CL_HPP_USE_CL_DEVICE_FISSION) }; +using BuildLogType = vector::param_type>>; +#if defined(CL_HPP_ENABLE_EXCEPTIONS) +/** +* Exception class for build errors to carry build info +*/ +class BuildError : public Error +{ +private: + BuildLogType buildLogs; +public: + BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec) + { + } + + BuildLogType getBuildLog() const + { + return buildLogs; + } +}; +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + if (err != CL_SUCCESS) { + throw BuildError(err, errStr, buildLogs); + } + return err; + } +} // namespace detail + +#else +namespace detail { + static inline cl_int buildErrHandler( + cl_int err, + const char * errStr, + const BuildLogType &buildLogs) + { + (void)buildLogs; // suppress unused variable warning + (void)errStr; + return err; + } +} // namespace detail +#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) + CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_; CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_; CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS; @@ -2465,7 +2587,8 @@ class Platform : public detail::Wrapper } //! \brief Wrapper for clGetPlatformInfo(). 
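With CL_HPP_ENABLE_EXCEPTIONS defined, the relocated BuildError type still carries the per-device build logs; a minimal hedged sketch of the usual handling (program is assumed to be an existing cl::Program, and <iostream> is assumed to be included):

    try {
        program.build();                          // or the new program.build(device, options) overload
    } catch (const cl::BuildError &err) {
        for (const auto &log : err.getBuildLog()) {
            std::cerr << log.second << std::endl; // log.first is the corresponding cl::Device
        }
    }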
- cl_int getInfo(cl_platform_info name, string* param) const + template + cl_int getInfo(cl_platform_info name, T* param) const { return detail::errHandler( detail::getInfo(&::clGetPlatformInfo, object_, name, param), @@ -2473,7 +2596,7 @@ class Platform : public detail::Wrapper } //! \brief Wrapper for clGetPlatformInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -2708,8 +2831,8 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS; * Unload the OpenCL compiler. * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. */ -inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int -UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED; inline cl_int UnloadCompiler() { @@ -2799,7 +2922,7 @@ class Context */ Context( const vector& devices, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -2828,9 +2951,13 @@ class Context } } + /*! \brief Constructs a context including a specific device. + * + * Wraps clCreateContext(). + */ Context( const Device& device, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -2860,7 +2987,7 @@ class Context */ Context( cl_device_type type, - cl_context_properties* properties = NULL, + const cl_context_properties* properties = NULL, void (CL_CALLBACK * notifyFptr)( const char *, const void *, @@ -3030,7 +3157,7 @@ class Context } //! \brief Wrapper for clGetContextInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3172,7 +3299,7 @@ class Event : public detail::Wrapper } //! \brief Wrapper for clGetEventInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3195,7 +3322,7 @@ class Event : public detail::Wrapper } //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getProfilingInfo(cl_int* err = NULL) const { @@ -3226,7 +3353,7 @@ class Event : public detail::Wrapper */ cl_int setCallback( cl_int type, - void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), void * user_data = NULL) { return detail::errHandler( @@ -3387,7 +3514,7 @@ class Memory : public detail::Wrapper } //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -3415,7 +3542,7 @@ class Memory : public detail::Wrapper * value - not the Memory class instance. 
*/ cl_int setDestructorCallback( - void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), void * user_data = NULL) { return detail::errHandler( @@ -3758,7 +3885,7 @@ cl::pointer> allocate_pointer(const Alloc &alloc_, Arg return cl::pointer>(tmp, detail::Deleter{alloc, copies}); } - catch (std::bad_alloc& b) + catch (std::bad_alloc&) { std::allocator_traits::deallocate(alloc, tmp, copies); throw; @@ -3893,7 +4020,7 @@ class Buffer : public Memory Context context = Context::getDefault(err); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } @@ -4006,7 +4133,7 @@ class Buffer : public Memory } return result; - } + } #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 }; @@ -4385,7 +4512,7 @@ class Image : public Memory } //! \brief Wrapper for clGetImageInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getImageInfo(cl_int* err = NULL) const { @@ -4422,12 +4549,11 @@ class Image1D : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D, - width, - 0, 0, 0, 0, 0, 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + object_ = ::clCreateImage( context(), flags, @@ -4510,13 +4636,12 @@ class Image1DBuffer : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - width, - 0, 0, 0, 0, 0, 0, 0, - buffer() - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.buffer = buffer(); + object_ = ::clCreateImage( context(), flags, @@ -4596,15 +4721,13 @@ class Image1DArray : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_ARRAY, - width, - 0, 0, // height, depth (unused) - arraySize, - rowPitch, - 0, 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_width = width; + desc.image_array_size = arraySize; + desc.image_row_pitch = rowPitch; + object_ = ::clCreateImage( context(), flags, @@ -4711,15 +4834,12 @@ class Image2D : public Image #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - width, - height, - 0, 0, // depth, array size (unused) - row_pitch, - 0, 0, 0, 0 - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + object_ = ::clCreateImage( context(), flags, @@ -4765,17 +4885,13 @@ class Image2D : public Image { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - width, - height, - 0, 0, // depth, array size (unused) - row_pitch, - 0, 0, 0, - // Use buffer as input to image - sourceBuffer() - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.buffer = sourceBuffer(); + object_ = ::clCreateImage( context(), 0, // flags inherited from buffer @@ -4829,19 +4945,16 @@ class Image2D : public Image // Update only the channel order. // Channel format inherited from source. 
sourceFormat.image_channel_order = order; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D, - sourceWidth, - sourceHeight, - 0, 0, // depth (unused), array size (unused) - sourceRowPitch, - 0, // slice pitch (unused) - sourceNumMIPLevels, - sourceNumSamples, - // Use buffer as input to image - sourceImage() - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = sourceWidth; + desc.image_height = sourceHeight; + desc.image_row_pitch = sourceRowPitch; + desc.num_mip_levels = sourceNumMIPLevels; + desc.num_samples = sourceNumSamples; + desc.buffer = sourceImage(); + object_ = ::clCreateImage( context(), 0, // flags should be inherited from mem_object @@ -4921,7 +5034,7 @@ class Image2D : public Image * \see Memory * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. */ -class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D +class CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D { public: /*! \brief Constructs an Image2DGL in a specified context, from a given @@ -5004,7 +5117,7 @@ class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D return *this; } -} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +} CL_API_SUFFIX__VERSION_1_1_DEPRECATED; #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS #if CL_HPP_TARGET_OPENCL_VERSION >= 120 @@ -5027,17 +5140,15 @@ class Image2DArray : public Image cl_int* err = NULL) { cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D_ARRAY, - width, - height, - 0, // depth (unused) - arraySize, - rowPitch, - slicePitch, - 0, 0, 0 - }; + + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_width = width; + desc.image_height = height; + desc.image_array_size = arraySize; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + object_ = ::clCreateImage( context(), flags, @@ -5142,17 +5253,14 @@ class Image3D : public Image #if CL_HPP_TARGET_OPENCL_VERSION >= 120 if (useCreateImage) { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE3D, - width, - height, - depth, - 0, // array size (unused) - row_pitch, - slice_pitch, - 0, 0, 0 - }; + cl_image_desc desc = {0}; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + object_ = ::clCreateImage( context(), flags, @@ -5534,7 +5642,7 @@ class Pipe : public Memory } //! \brief Wrapper for clGetMemObjectInfo() that returns by value. - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5667,7 +5775,7 @@ class Sampler : public detail::Wrapper } //! \brief Wrapper for clGetSamplerInfo() that returns by value. 
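The switch to zero-initialized cl_image_desc structures does not change the public image constructors; a hedged sketch of typical host-side use (context, width and height are assumed to exist):

    cl::ImageFormat fmt(CL_R, CL_FLOAT);
    cl::Image2D img(context, CL_MEM_READ_ONLY, fmt, width, height); // row pitch defaults to 0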
- template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5890,7 +5998,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -5912,7 +6020,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_ARG_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getArgInfo(cl_uint argIndex, cl_int* err = NULL) const { @@ -5936,7 +6044,7 @@ class Kernel : public detail::Wrapper __GET_KERNEL_WORK_GROUP_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getWorkGroupInfo(const Device& device, cl_int* err = NULL) const { @@ -5971,7 +6079,7 @@ class Kernel : public detail::Wrapper #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 } - template + template size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const { size_type param; @@ -6134,6 +6242,23 @@ class Kernel : public detail::Wrapper sizeof(void*)*(1 + sizeof...(Ts)), pointerList.data())); } + + template + cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val) + { + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + param_name, + sizeof(T), + &val)); + } + + template + cl_int setExecInfo(typename detail::param_traits::param_type& val) + { + return setExecInfo(name, val); + } #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6339,8 +6464,7 @@ class Program : public detail::Wrapper static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6393,8 +6517,7 @@ class Program : public detail::Wrapper static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = NULL; CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR); - return detail::errHandler( - pfn_clCreateProgramWithILKHR( + object_ = pfn_clCreateProgramWithILKHR( context(), static_cast(IL.data()), IL.size(), &error); #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210 @@ -6538,7 +6661,7 @@ class Program : public detail::Wrapper Program() { } - /*! \brief Constructor from cl_mem - takes ownership. + /*! \brief Constructor from cl_program - takes ownership. * * \param retainObject will cause the constructor to retain its cl object. 
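The new setExecInfo helpers wrap clSetKernelExecInfo; a hedged sketch for a kernel that dereferences system-allocated fine-grain SVM (kernel is assumed to be an existing cl::Kernel on a device that reports the matching SVM capability):

    cl_bool useFineGrainSystem = CL_TRUE;
    kernel.setExecInfo(CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, useFineGrainSystem);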
* Defaults to false to maintain compatibility with @@ -6606,6 +6729,27 @@ class Program : public detail::Wrapper return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); } + cl_int build( + const Device& device, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + cl_device_id deviceID = device(); + + cl_int buildError = ::clBuildProgram( + object_, + 1, + &deviceID, + options, + notifyFptr, + data); + + BuildLogType buildLog(0); + buildLog.push_back(std::make_pair(device, getBuildInfo(device))); + return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog); + } + cl_int build( const char* options = NULL, void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, @@ -6619,7 +6763,6 @@ class Program : public detail::Wrapper notifyFptr, data); - return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); } @@ -6651,7 +6794,7 @@ class Program : public detail::Wrapper __GET_PROGRAM_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -6674,7 +6817,7 @@ class Program : public detail::Wrapper __GET_PROGRAM_BUILD_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getBuildInfo(const Device& device, cl_int* err = NULL) const { @@ -6692,7 +6835,7 @@ class Program : public detail::Wrapper * info type and for all devices in the program. * On an error reading the info for any device, an empty vector of info will be returned. */ - template + template vector::param_type>> getBuildInfo(cl_int *err = NULL) const { @@ -6762,6 +6905,7 @@ class Program : public detail::Wrapper } #if CL_HPP_TARGET_OPENCL_VERSION >= 220 +#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) /*! \brief Registers a callback function to be called when destructors for * program scope global variables are complete and before the * program is released. @@ -6772,9 +6916,9 @@ class Program : public detail::Wrapper * on a callback stack associated with program. The registered user callback * functions are called in the reverse order in which they were registered. */ - cl_int setReleaseCallback( + CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback( void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), - void * user_data = NULL) + void * user_data = NULL) CL_API_SUFFIX__VERSION_2_2_DEPRECATED { return detail::errHandler( ::clSetProgramReleaseCallback( @@ -6783,6 +6927,7 @@ class Program : public detail::Wrapper user_data), __SET_PROGRAM_RELEASE_CALLBACK_ERR); } +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) /*! \brief Sets a SPIR-V specialization constant. * @@ -6978,6 +7123,11 @@ inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs) return static_cast(static_cast(lhs) | static_cast(rhs)); } +inline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs) +{ + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + /*! \class CommandQueue * \brief CommandQueue interface for cl_command_queue. */ @@ -7434,7 +7584,7 @@ class CommandQueue : public detail::Wrapper CommandQueue() { } - /*! \brief Constructor from cl_mem - takes ownership. + /*! \brief Constructor from cl_command_queue - takes ownership. * * \param retainObject will cause the constructor to retain its cl object. 
* Defaults to false to maintain compatibility with @@ -7486,7 +7636,7 @@ class CommandQueue : public detail::Wrapper __GET_COMMAND_QUEUE_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -8119,7 +8269,7 @@ class CommandQueue : public detail::Wrapper { cl_event tmp; cl_int err = detail::errHandler(::clEnqueueSVMMap( - object_, blocking, flags, static_cast(container.data()), container.size(), + object_, blocking, flags, static_cast(container.data()), container.size()*sizeof(T), (events != NULL) ? (cl_uint)events->size() : 0, (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, (event != NULL) ? &tmp : NULL), @@ -8478,10 +8628,10 @@ class CommandQueue : public detail::Wrapper } #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) - CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( + CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, const vector* events = NULL, - Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + Event* event = NULL) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8538,8 +8688,8 @@ class CommandQueue : public detail::Wrapper * Deprecated APIs for 1.2 */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8554,8 +8704,8 @@ class CommandQueue : public detail::Wrapper return err; } - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueWaitForEvents(const vector& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const vector& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { return detail::errHandler( ::clEnqueueWaitForEvents( @@ -8691,8 +8841,8 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( * Deprecated APIs for 1.2 */ #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + CL_API_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED { return detail::errHandler( ::clEnqueueBarrier(object_), @@ -8866,7 +9016,7 @@ class DeviceCommandQueue : public detail::Wrapper __GET_COMMAND_QUEUE_INFO_ERR); } - template typename + template typename detail::param_traits::param_type getInfo(cl_int* err = NULL) const { @@ -9038,7 +9188,7 @@ Buffer::Buffer( size_type size = sizeof(DataType)*(endIterator - startIterator); if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); } @@ -9091,7 +9241,7 @@ Buffer::Buffer( Context context = queue.getInfo(); if (useHostPtr) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + object_ = ::clCreateBuffer(context(), flags, size, const_cast(&*startIterator), &error); } else { object_ = ::clCreateBuffer(context(), flags, size, 0, &error); @@ -9213,7 +9363,7 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::pointer ptr, + 
cl::pointer &ptr, cl_bool blocking, cl_map_flags flags, size_type size, @@ -9237,7 +9387,7 @@ inline cl_int enqueueMapSVM( */ template inline cl_int enqueueMapSVM( - cl::vector container, + cl::vector &container, cl_bool blocking, cl_map_flags flags, const vector* events = NULL, @@ -10063,7 +10213,7 @@ class KernelFunctor namespace compatibility { /** - * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp. + * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp. * Please use KernelFunctor directly. */ template diff --git a/Cxx11/prk_opencl.h b/Cxx11/prk_opencl.h index f8f0ade9c..b8d783438 100644 --- a/Cxx11/prk_opencl.h +++ b/Cxx11/prk_opencl.h @@ -19,7 +19,7 @@ #include -#include "cl2.hpp" +#include "opencl.hpp" namespace prk { From dee4ba9d96b9c7b0eeba6c28dc79d23776f31530 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Sat, 5 Nov 2022 14:36:36 -0500 Subject: [PATCH 255/325] RUST: nstream with rayon! --- .gitignore | 2 + RUST/Makefile | 1 + RUST/nstream-rayon/Cargo.toml | 9 ++ RUST/nstream-rayon/src/main.rs | 184 +++++++++++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 RUST/nstream-rayon/Cargo.toml create mode 100644 RUST/nstream-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index a92a237e3..bd9ee8deb 100644 --- a/.gitignore +++ b/.gitignore @@ -377,6 +377,8 @@ RUST/nstream-unsafe/Cargo.lock RUST/nstream-unsafe/target/ RUST/nstream-iter/Cargo.lock RUST/nstream-iter/target/ +RUST/nstream-rayon/Cargo.lock +RUST/nstream-rayon/target/ RUST/p2p/Cargo.lock RUST/p2p/target/ RUST/stencil/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index d70e5855e..9904e005b 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -16,6 +16,7 @@ all: cd nstream && cargo build $(RCFLAGS) cd nstream-unsafe && cargo build $(RCFLAGS) cd nstream-iter && cargo build $(RCFLAGS) + cd nstream-rayon && cargo build $(RCFLAGS) cd p2p && cargo build $(RCFLAGS) cd stencil && cargo build $(RCFLAGS) cd transpose && cargo build $(RCFLAGS) diff --git a/RUST/nstream-rayon/Cargo.toml b/RUST/nstream-rayon/Cargo.toml new file mode 100644 index 000000000..054caa930 --- /dev/null +++ b/RUST/nstream-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "nstream" +version = "0.1.0" +authors = ["Jeff Hammond ", "Thomas Hayward-Schneider ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/nstream-rayon/src/main.rs b/RUST/nstream-rayon/src/main.rs new file mode 100644 index 000000000..4d02cb145 --- /dev/null +++ b/RUST/nstream-rayon/src/main.rs @@ -0,0 +1,184 @@ +// +// Copyright (c) 2020, Intel Corporation +// Copyright (c) 2020, Thomas Hayward-Schneider +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: nstream +// +// PURPOSE: To compute memory bandwidth when adding a vector of a given +// number of double precision values to the scalar multiple of +// another vector of the same length, and storing the result in +// a third vector. +// +// USAGE: The program takes as input the number +// of iterations to loop over the triad vectors, the length of the +// vectors, and the offset between vectors +// +// <# iterations> +// +// The output consists of diagnostics to make sure the +// algorithm worked, and of timing statistics. +// +// NOTES: Bandwidth is determined as the number of words read, plus the +// number of words written, times the size of the words, divided +// by the execution time. For a vector length of N, the total +// number of words read and written is 4*N*sizeof(double). +// +// HISTORY: This code is loosely based on the Stream benchmark by John +// McCalpin, but does not follow all the Stream rules. Hence, +// reported results should not be associated with Stream in +// external publications +// +// Converted to C++11 by Jeff Hammond, November 2017. +// +/////////////////////////////////////////////// + +use std::env; +use std::mem; +//use std::num; // abs? 
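// Worked instance of the bandwidth formula in the header comment (illustrative numbers,
// not part of the kernel): for length = 100_000_000, each iteration moves
// 4 * length * size_of::<f64>() = 3.2e9 bytes, so an average iteration time of 0.1 s
// is reported as 1.0e-6 * 3.2e9 / 0.1 = 32_000 MB/s.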
+use rayon::prelude::*; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust STREAM triad: A = B + scalar * C"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let length: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + length = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("vector length = {}", length); + + /////////////////////////////////////////////// + // Allocate space and perform the computation + /////////////////////////////////////////////// + + let mut a: Vec = vec![0.0; length]; + let b: Vec = vec![2.0; length]; + let c: Vec = vec![2.0; length]; + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + let scalar: f64 = 3.0; + + for _k in 0..iterations + 1 { + if _k == 1 { + t0 = timer.elapsed(); + } + + (&mut a, &b, &c).into_par_iter().for_each(|(x, y, z)| { + *x += *y + scalar * (*z); + }); + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let nstream_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let mut ar: f64 = 0.0; + let br: f64 = 2.0; + let cr: f64 = 2.0; + for _k in 0..iterations + 1 { + ar += br + scalar * cr; + } + + ar *= length as f64; + + let mut asum = 0.0; + for i in 0..length { + let absa: f64 = a[i].abs(); + asum += absa; + } + + let err: f64 = (ar - asum) / asum; + let abserr: f64 = err.abs(); + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (nstream_time as f64) / (iterations as f64); + let nbytes: usize = 4 * length * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nbytes as f64) / avgtime, + avgtime + ); + } else { + println!("Failed Validation on output array"); + println!(" Expected checksum: {}", ar); + println!(" Observed checksum: {}", asum); + println!("ERROR: solution did not validate"); + } + return; +} From be2972f727c5b322b4cb4f3e4f79f2f4f9329002 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Sun, 6 Nov 2022 21:32:55 -0600 Subject: [PATCH 256/325] RUST: dgemm with iter and rayon! 
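The nstream-rayon kernel in the patch above relies on rayon's tuple (multi-zip) IntoParallelIterator; an equivalent hedged sketch using explicit zips, for cases where the tuple form is less convenient:

    a.par_iter_mut()
        .zip(b.par_iter().zip(c.par_iter()))
        .for_each(|(x, (y, z))| {
            *x += *y + scalar * *z;
        });

Rayon sizes its pool to one worker per logical CPU by default; the RAYON_NUM_THREADS environment variable overrides this without code changes.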
modified: .gitignore modified: RUST/Makefile new file: RUST/dgemm-iter/Cargo.toml new file: RUST/dgemm-iter/src/main.rs new file: RUST/dgemm-rayon/Cargo.toml new file: RUST/dgemm-rayon/src/main.rs modified: RUST/dgemm/Cargo.toml modified: RUST/dgemm/src/main.rs modified: RUST/transpose/Cargo.toml modified: RUST/transpose/src/main.rs --- .gitignore | 6 + RUST/Makefile | 37 +++-- RUST/dgemm-iter/Cargo.toml | 6 + RUST/dgemm-iter/src/main.rs | 202 ++++++++++++++++++++++++ RUST/dgemm-rayon/Cargo.toml | 9 ++ RUST/dgemm-rayon/src/main.rs | 204 ++++++++++++++++++++++++ RUST/dgemm/Cargo.toml | 7 +- RUST/dgemm/src/main.rs | 225 +++++++++++++-------------- RUST/transpose/Cargo.toml | 4 +- RUST/transpose/src/main.rs | 291 +++++++++++++++++++++-------------- 10 files changed, 734 insertions(+), 257 deletions(-) create mode 100644 RUST/dgemm-iter/Cargo.toml create mode 100644 RUST/dgemm-iter/src/main.rs create mode 100644 RUST/dgemm-rayon/Cargo.toml create mode 100644 RUST/dgemm-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index bd9ee8deb..9ba4c2b06 100644 --- a/.gitignore +++ b/.gitignore @@ -379,6 +379,12 @@ RUST/nstream-iter/Cargo.lock RUST/nstream-iter/target/ RUST/nstream-rayon/Cargo.lock RUST/nstream-rayon/target/ +RUST/dgemm/Cargo.lock +RUST/dgemm/target/ +RUST/dgemm-iter/Cargo.lock +RUST/dgemm-iter/target/ +RUST/dgemm-rayon/Cargo.lock +RUST/dgemm-rayon/target/ RUST/p2p/Cargo.lock RUST/p2p/target/ RUST/stencil/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index 9904e005b..cc3fa2d06 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -13,21 +13,24 @@ RCFLAGS += --release .PHONY: all clean all: - cd nstream && cargo build $(RCFLAGS) - cd nstream-unsafe && cargo build $(RCFLAGS) - cd nstream-iter && cargo build $(RCFLAGS) - cd nstream-rayon && cargo build $(RCFLAGS) - cd p2p && cargo build $(RCFLAGS) - cd stencil && cargo build $(RCFLAGS) - cd transpose && cargo build $(RCFLAGS) - cd dgemm && cargo build $(RCFLAGS) - + cd nstream && cargo build $(RCFLAGS) + cd nstream-unsafe && cargo build $(RCFLAGS) + cd nstream-iter && cargo build $(RCFLAGS) + cd nstream-rayon && cargo build $(RCFLAGS) + cd p2p && cargo build $(RCFLAGS) + cd stencil && cargo build $(RCFLAGS) + cd transpose && cargo build $(RCFLAGS) + cd dgemm && cargo build $(RCFLAGS) + cd dgemm-iter && cargo build $(RCFLAGS) + cd dgemm-rayon && cargo build $(RCFLAGS) clean: - cd nstream && cargo clean - cd nstream-unsafe && cargo clean - cd nstream-iter && cargo clean - cd p2p && cargo clean - cd stencil && cargo clean - cd transpose && cargo clean - cd dgemm && cargo clean - + cd nstream && cargo clean + cd nstream-unsafe && cargo clean + cd nstream-iter && cargo clean + cd nstream-rayon && cargo clean + cd p2p && cargo clean + cd stencil && cargo clean + cd transpose && cargo clean + cd dgemm && cargo clean + cd dgemm-iter && cargo clean + cd dgemm-rayon && cargo clean diff --git a/RUST/dgemm-iter/Cargo.toml b/RUST/dgemm-iter/Cargo.toml new file mode 100644 index 000000000..5714a1fa3 --- /dev/null +++ b/RUST/dgemm-iter/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "dgemm" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" diff --git a/RUST/dgemm-iter/src/main.rs b/RUST/dgemm-iter/src/main.rs new file mode 100644 index 000000000..208cc47b5 --- /dev/null +++ b/RUST/dgemm-iter/src/main.rs @@ -0,0 +1,202 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided 
that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+// +/////////////////////////////////////////////// + +use std::env; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // https://www.reidatcheson.com/matrix%20multiplication/rust/iterators/2021/02/26/gemm-iterators.html + c.chunks_exact_mut(order) + .zip(a.chunks_exact(order)) + // ci_mut : mutable ith row of C + // ai : immutable ith row of A + .for_each(|(ci_mut, ai)| { + // iterate over columns of ith row of a, + // zipped with rows of b + ai.iter() + .zip(b.chunks_exact(order)) + // aik : element at row i, column k in matrix A + // bk : immutable kth row of matrix B + .for_each(|(aik, bk)| { + // iterate over columns of ith row of c, + // zipped with columns of kth row of b + ci_mut + .iter_mut() + .zip(bk.iter()) + // cij : element at row i, column j of matrix C + // bkj : element at row k, column j of marrix B + .for_each(|(cij, bkj)| { + *cij += aik * bkj; + }) + }); + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } 
+} diff --git a/RUST/dgemm-rayon/Cargo.toml b/RUST/dgemm-rayon/Cargo.toml new file mode 100644 index 000000000..49886cd96 --- /dev/null +++ b/RUST/dgemm-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "dgemm" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/dgemm-rayon/src/main.rs b/RUST/dgemm-rayon/src/main.rs new file mode 100644 index 000000000..30dc55057 --- /dev/null +++ b/RUST/dgemm-rayon/src/main.rs @@ -0,0 +1,204 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
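The dgemm-rayon kernel that follows keeps the row-oriented iterator structure of the serial iterator version and only swaps the outermost adapters for Rayon's parallel ones, so each worker thread gets exclusive mutable access to one row chunk of C while A and B are shared read-only. A condensed sketch of that core loop nest, with the benchmark scaffolding stripped away (the helper name is illustrative, not part of the kernel source):

    use rayon::prelude::*;

    // C += A * B for square row-major matrices of size `order`.
    // Only the outermost traversal is parallel; the inner loops stay sequential.
    fn gemm_rayon(order: usize, a: &[f64], b: &[f64], c: &mut [f64]) {
        c.par_chunks_exact_mut(order)       // rows of C, one Rayon task each
            .zip(a.par_chunks_exact(order)) // paired with the matching row of A
            .for_each(|(ci, ai)| {
                ai.iter()                       // a[i][k]
                    .zip(b.chunks_exact(order)) // row k of B
                    .for_each(|(aik, bk)| {
                        ci.iter_mut()
                            .zip(bk.iter())
                            .for_each(|(cij, bkj)| *cij += aik * bkj);
                    });
            });
    }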
+// +/////////////////////////////////////////////// + +use std::env; +use std::time::{Duration, Instant}; + +use rayon::prelude::*; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // Outermost loop parallelism applied to dgemm-iter version + c.par_chunks_exact_mut(order) + .zip(a.par_chunks_exact(order)) + // ci_mut : mutable ith row of C + // ai : immutable ith row of A + .for_each(|(ci_mut, ai)| { + // iterate over columns of ith row of a, + // zipped with rows of b + ai.iter() + .zip(b.chunks_exact(order)) + // aik : element at row i, column k in matrix A + // bk : immutable kth row of matrix B + .for_each(|(aik, bk)| { + // iterate over columns of ith row of c, + // zipped with columns of kth row of b + ci_mut + .iter_mut() + .zip(bk.iter()) + // cij : element at row i, column j of matrix C + // bkj : element at row k, column j of marrix B + .for_each(|(cij, bkj)| { + *cij += aik * bkj; + }) + }); + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } +} diff 
--git a/RUST/dgemm/Cargo.toml b/RUST/dgemm/Cargo.toml index 4548f4f12..cd045832a 100644 --- a/RUST/dgemm/Cargo.toml +++ b/RUST/dgemm/Cargo.toml @@ -1,9 +1,6 @@ [package] name = "dgemm" version = "0.1.0" -authors = ["Jeff Hammond "] +authors = ["Jeff Hammond ", "Sajid Ali "] -[dependencies] -blas = "0.20" -cblas = "0.2" -blas-src = { version = "0.7", features = ["blis"] } +edition="2021" diff --git a/RUST/dgemm/src/main.rs b/RUST/dgemm/src/main.rs index b0d03aaa4..930d3a60d 100644 --- a/RUST/dgemm/src/main.rs +++ b/RUST/dgemm/src/main.rs @@ -1,5 +1,6 @@ // // Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -52,136 +53,130 @@ // /////////////////////////////////////////////// -extern crate blas; -extern crate cblas; -extern crate blas_src; - use std::env; -use std::time::{Instant,Duration}; - -//use blas::*; -use cblas::*; - -fn prk_dgemm(order : usize, a : &mut Vec, b : &mut Vec, c : &mut Vec) -{ - for i in 0..order { - for k in 0..order { - for j in 0..order { - c[i*order+j] += a[i*order+k] * b[k*order+j]; - } - } - } -} +use std::time::{Duration, Instant}; fn help() { - println!("Usage: <# iterations> "); + println!("Usage: <# iterations> "); } -fn main() -{ - println!("Parallel Research Kernels"); - println!("Rust Dense matrix-matrix multiplication: C += A x B"); - - /////////////////////////////////////////////// - // Read and test input parameters - /////////////////////////////////////////////// - - let args : Vec = env::args().collect(); - - let iterations : u32; - let order : usize; - - match args.len() { - 3 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - }, - _ => { - help(); - return; +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } } - } - if iterations < 1 { - println!("ERROR: iterations must be >= 1"); - } + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } - println!("Number of iterations = {}", iterations); - println!("Matrix order = {}", order); + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); - /////////////////////////////////////////////// - // Allocate space for the input and transpose matrix - /////////////////////////////////////////////// + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// - let nelems : usize = order*order; - let mut a : Vec = vec![0.0; nelems]; - let mut b : Vec = vec![0.0; nelems]; - let mut c : Vec = vec![0.0; nelems]; + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; - for i in 0..order { - for j in 0..order { - a[i*order+j] = i as 
f64; - b[i*order+j] = i as f64; + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } } - } - - let timer = Instant::now(); - let mut t0 : Duration = timer.elapsed(); - for k in 0..iterations+1 { - - if k == 1 { t0 = timer.elapsed(); } + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + for i in 0..order { + for k in 0..order { + for j in 0..order { + c[i * order + j] += a[i * order + k] * b[k * order + j]; + } + } + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } - //prk_dgemm(order, &mut a, &mut b, &mut c); - let m : i32 = order as i32; - let n : i32 = order as i32; - let k : i32 = order as i32; - unsafe { - dgemm(Layout::RowMajor, Transpose::None, Transpose::None, - m, n, k, 1.0, &a, m, &b, k, 1.0, &mut c, m); + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); } - } - let t1 = timer.elapsed(); - let dt = (t1.checked_sub(t0)).unwrap(); - let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let dgemm_time : f64 = dtt as f64 * 1.0e-9; - - /////////////////////////////////////////////// - // Analyze and output results - /////////////////////////////////////////////// - - let forder : f64 = order as f64; - let reference : f64 = 0.25 * (forder*forder*forder) * (forder-1.0)*(forder-1.0) * (iterations as f64 + 1.0); - let mut checksum : f64 = 0.0; - for i in 0..order { - for j in 0..order { - checksum += c[i*order+j]; + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; } - } - - if cfg!(VERBOSE) { - println!("Sum of absolute differences: {:30.15}", checksum); - } - - let epsilon : f64 = 1.0e-8; - let residuum : f64 = (checksum - reference)/reference; - if residuum < epsilon { - println!("Solution validates"); - let avgtime : f64 = (dgemm_time as f64) / (iterations as f64); - let uorder : usize = order as usize; - let nflops : usize = 2_usize * uorder * uorder * uorder; - println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (nflops as f64) / avgtime, avgtime); - } else { - println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", residuum, epsilon); - return; - } } - - diff --git a/RUST/transpose/Cargo.toml b/RUST/transpose/Cargo.toml index 3f634d3c5..22fe9074e 100644 --- a/RUST/transpose/Cargo.toml +++ b/RUST/transpose/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "transpose" version = "0.1.0" -authors = ["Jeff 
Hammond "] +authors = ["Jeff Hammond ", "Sajid Ali "] -[dependencies] +edition = "2021" diff --git a/RUST/transpose/src/main.rs b/RUST/transpose/src/main.rs index 935addae8..baace9c90 100644 --- a/RUST/transpose/src/main.rs +++ b/RUST/transpose/src/main.rs @@ -1,5 +1,6 @@ // // Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -54,137 +55,191 @@ use std::env; use std::mem; -use std::time::{Instant,Duration}; +use std::time::{Duration, Instant}; fn help() { - println!("Usage: <# iterations> [tile size]"); + println!("Usage: <# iterations> [tile size]"); } -fn main() -{ - println!("Parallel Research Kernels"); - println!("Rust Matrix transpose: B = A^T"); - - /////////////////////////////////////////////// - // Read and test input parameters - /////////////////////////////////////////////// - - let args : Vec = env::args().collect(); - - let iterations : u32; - let order : usize; - let tilesize : usize; - - match args.len() { - 3 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - tilesize = 32; - }, - 4 => { - iterations = match args[1].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - order = match args[2].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - tilesize = match args[3].parse() { - Ok(n) => { n }, - Err(_) => { help(); return; }, - }; - }, - _ => { - help(); - return; +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } } - } - - if iterations < 1 { - println!("ERROR: iterations must be >= 1"); - } - if tilesize > order { - println!("ERROR: tilesize cannot be > order"); - } - - println!("Number of iterations = {}", iterations); - println!("Matrix order = {}", order); - if tilesize < order { - println!("Tile size = {}", tilesize); - } else { - println!("Untiled"); - } - - /////////////////////////////////////////////// - // Allocate space for the input and transpose matrix - /////////////////////////////////////////////// - - let nelems : usize = order*order; - let mut a : Vec = vec![0.0; nelems]; - let mut b : Vec = vec![0.0; nelems]; - - for i in 0..order { - for j in 0..order { - a[i*order+j] = (i*order+j) as f64; + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); } - } - let timer = Instant::now(); - let mut t0 : Duration = timer.elapsed(); + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize 
< order { + println!("Tile size = {}", tilesize); + } else { + println!("Untiled"); + } - for k in 0..iterations+1 { + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// - if k == 1 { t0 = timer.elapsed(); } + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + // Initialize matrices for i in 0..order { - for j in 0..order { - b[j*order+i] += a[i*order+j]; - a[i*order+j] += 1.0; - } + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } } - } - let t1 = timer.elapsed(); - let dt = (t1.checked_sub(t0)).unwrap(); - let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let transpose_time : f64 = dtt as f64 * 1.0e-9; - - /////////////////////////////////////////////// - // Analyze and output results - /////////////////////////////////////////////// - - let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2; - let mut abserr : f64 = 0.0; - for i in 0..order { - for j in 0..order { - let ij = i*order+j; - let ji = j*order+i; - let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64; - abserr += (b[ji] - reference).abs(); + let (num_tiles, boundscheck): (usize, bool) = if order % tilesize == 0 { + (order / tilesize, false) // all tiles have same size + } else { + (order / tilesize + 1, true) // last tile has size < tilesize + }; + + println!("Initialization done, running algorithm"); + if boundscheck { + println!("Warning: Matrix order not divisible by tilesize, will employ bounds checking!") } - } - - if cfg!(VERBOSE) { - println!("Sum of absolute differences: {:30.15}", abserr); - } - - let epsilon : f64 = 1.0e-8; - if abserr < epsilon { - println!("Solution validates"); - let avgtime : f64 = (transpose_time as f64) / (iterations as f64); - let bytes : usize = 2_usize * nelems * mem::size_of::(); - println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (bytes as f64) / avgtime, avgtime); - } else { - println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", abserr, epsilon); - return; - } -} + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // Version with no bounds check + if !boundscheck { + for row_tile in 0..num_tiles { + for col_tile in 0..num_tiles { + for i in 0..tilesize { + for j in 0..tilesize { + let rowidx = row_tile * tilesize + i; + let colidx = col_tile * tilesize + j; + b[colidx * order + rowidx] += a[rowidx * order + colidx]; + a[rowidx * order + colidx] += 1.0; + } + } + } + } + } else { + // Version with bounds check + for row_tile in 0..num_tiles { + for col_tile in 0..num_tiles { + for i in 0..tilesize { + for j in 0..tilesize { + let rowidx = row_tile * tilesize + i; + let colidx = col_tile * tilesize + j; + if rowidx < order && colidx < order { + b[colidx * order + rowidx] += a[rowidx * order + colidx]; + a[rowidx * order + colidx] += 1.0; + } + } + } + } + } + } + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + 
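The reference value checked in the loop below follows directly from the update rule: A[i][j] starts at i*order+j and is incremented by 1 within every transpose pass, so pass k (k = 0..iterations) adds (i*order+j) + k into B[j][i]. Summing over the iterations+1 passes gives B[j][i] = (i*order+j)*(iterations+1) + (0 + 1 + ... + iterations), i.e. reference = ij*(iterations+1) + addit with addit = iterations*(iterations+1)/2.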
for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From e5009d17778e35c547326fa689ff144948bb0fcc Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Mon, 7 Nov 2022 07:35:05 -0600 Subject: [PATCH 257/325] RUST: transpose with iter! modified: .gitignore new file: RUST/transpose-iter/Cargo.toml new file: RUST/transpose-iter/src/main.rs --- .gitignore | 2 + RUST/transpose-iter/Cargo.toml | 6 + RUST/transpose-iter/src/main.rs | 258 ++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+) create mode 100644 RUST/transpose-iter/Cargo.toml create mode 100644 RUST/transpose-iter/src/main.rs diff --git a/.gitignore b/.gitignore index 9ba4c2b06..eef173b49 100644 --- a/.gitignore +++ b/.gitignore @@ -391,6 +391,8 @@ RUST/stencil/Cargo.lock RUST/stencil/target/ RUST/transpose/Cargo.lock RUST/transpose/target/ +RUST/transpose-iter/Cargo.lock +RUST/transpose-iter/target/ SERIAL/AMR/amr SERIAL/Branch/branch SERIAL/DGEMM/dgemm diff --git a/RUST/transpose-iter/Cargo.toml b/RUST/transpose-iter/Cargo.toml new file mode 100644 index 000000000..22fe9074e --- /dev/null +++ b/RUST/transpose-iter/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" diff --git a/RUST/transpose-iter/src/main.rs b/RUST/transpose-iter/src/main.rs new file mode 100644 index 000000000..f50e7dd27 --- /dev/null +++ b/RUST/transpose-iter/src/main.rs @@ -0,0 +1,258 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +use std::env; +use std::mem; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let mut tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + tilesize = order; + println!("Untiled"); + } + + if order % tilesize != 0 && tilesize < order { + panic!("Cannot use the given tilesize!") + }; + + let num_tiles: usize = order / tilesize; // all tiles have same size + + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + + // Initialize matrices + for i in 0..order { + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } + } + + println!("Initialization done, running algorithm"); + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + /* + (0..num_tiles).for_each(|row_tile_idx| { + 
(0..num_tiles).for_each(|col_tile_idx| { + (0..tilesize).for_each(|row_within_tile| { + (0..tilesize).for_each(|col_within_tile| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + b[rowidx * order + colidx] += a[colidx * order + rowidx]; + }) + }) + }) + }); + */ + + b.chunks_exact_mut(tilesize * order) + .enumerate() + // for the current set of row tiles + // and the rows corresponding to this row tile + .for_each(|(row_tile_idx, b_rows)| { + // iterator over all column tiles + (0..num_tiles).for_each(|col_tile_idx| { + // within the tile, iterate over *tilesize* rows of b + // zipped together with rows of b available in the tile + (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + |(row_within_tile, bi)| { + let bi_subset_cols = bi + .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) + .unwrap(); + // within the tile, iterator over *tilesize* columns of b + // zipped together with subset of columns of b + (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( + |(col_within_tile, b_element)| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + *b_element += a[colidx * order + rowidx]; + }, + ) + }, + ) + }) + }); + + // straightforward addition of 1.0 to all elements of A + a.iter_mut().for_each(|a_element| { + *a_element += 1.0; + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From 1c30af0a05e9738a1733f1c2cef51067495a3f6d Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Tue, 8 Nov 2022 09:47:15 -0600 Subject: [PATCH 258/325] RUST: add transpose-rayon modified: .gitignore modified: RUST/Makefile modified: RUST/transpose-iter/src/main.rs new file: RUST/transpose-rayon/Cargo.toml new file: RUST/transpose-rayon/src/main.rs --- .gitignore | 2 + RUST/Makefile | 4 + RUST/transpose-iter/src/main.rs | 19 +-- RUST/transpose-rayon/Cargo.toml | 9 ++ RUST/transpose-rayon/src/main.rs | 247 +++++++++++++++++++++++++++++++ 5 files changed, 265 insertions(+), 16 deletions(-) create mode 100644 RUST/transpose-rayon/Cargo.toml create mode 100644 RUST/transpose-rayon/src/main.rs diff --git a/.gitignore b/.gitignore index eef173b49..1bacbfade 100644 --- a/.gitignore +++ b/.gitignore @@ -393,6 +393,8 @@ RUST/transpose/Cargo.lock RUST/transpose/target/ 
RUST/transpose-iter/Cargo.lock RUST/transpose-iter/target/ +RUST/transpose-rayon/Cargo.lock +RUST/transpose-rayon/target/ SERIAL/AMR/amr SERIAL/Branch/branch SERIAL/DGEMM/dgemm diff --git a/RUST/Makefile b/RUST/Makefile index cc3fa2d06..f72474c64 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -20,6 +20,8 @@ all: cd p2p && cargo build $(RCFLAGS) cd stencil && cargo build $(RCFLAGS) cd transpose && cargo build $(RCFLAGS) + cd transpose-iter && cargo build $(RCFLAGS) + cd transpose-rayon && cargo build $(RCFLAGS) cd dgemm && cargo build $(RCFLAGS) cd dgemm-iter && cargo build $(RCFLAGS) cd dgemm-rayon && cargo build $(RCFLAGS) @@ -31,6 +33,8 @@ clean: cd p2p && cargo clean cd stencil && cargo clean cd transpose && cargo clean + cd transpose-iter && cargo clean + cd transpose-rayon && cargo clean cd dgemm && cargo clean cd dgemm-iter && cargo clean cd dgemm-rayon && cargo clean diff --git a/RUST/transpose-iter/src/main.rs b/RUST/transpose-iter/src/main.rs index f50e7dd27..4aae33be1 100644 --- a/RUST/transpose-iter/src/main.rs +++ b/RUST/transpose-iter/src/main.rs @@ -166,35 +166,22 @@ fn main() { t0 = timer.elapsed(); } - /* - (0..num_tiles).for_each(|row_tile_idx| { - (0..num_tiles).for_each(|col_tile_idx| { - (0..tilesize).for_each(|row_within_tile| { - (0..tilesize).for_each(|col_within_tile| { - let rowidx: usize = row_tile_idx * tilesize + row_within_tile; - let colidx: usize = col_tile_idx * tilesize + col_within_tile; - b[rowidx * order + colidx] += a[colidx * order + rowidx]; - }) - }) - }) - }); - */ - b.chunks_exact_mut(tilesize * order) .enumerate() // for the current set of row tiles // and the rows corresponding to this row tile .for_each(|(row_tile_idx, b_rows)| { - // iterator over all column tiles + // iterate over all column tiles (0..num_tiles).for_each(|col_tile_idx| { // within the tile, iterate over *tilesize* rows of b // zipped together with rows of b available in the tile (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + // bi is the ith row of b |(row_within_tile, bi)| { let bi_subset_cols = bi .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) .unwrap(); - // within the tile, iterator over *tilesize* columns of b + // within the tile, iterate over *tilesize* columns of b // zipped together with subset of columns of b (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( |(col_within_tile, b_element)| { diff --git a/RUST/transpose-rayon/Cargo.toml b/RUST/transpose-rayon/Cargo.toml new file mode 100644 index 000000000..fa75e1f79 --- /dev/null +++ b/RUST/transpose-rayon/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition = "2021" + +[dependencies] +rayon = "1.5" diff --git a/RUST/transpose-rayon/src/main.rs b/RUST/transpose-rayon/src/main.rs new file mode 100644 index 000000000..8cfced9c0 --- /dev/null +++ b/RUST/transpose-rayon/src/main.rs @@ -0,0 +1,247 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. 
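The transpose-rayon kernel added below uses the same tiling as transpose-iter but hands each band of tilesize consecutive rows of B to a separate Rayon task, so every task writes a disjoint slice of B while A is only read. A condensed sketch of one transpose-accumulate pass (assuming order is divisible by tilesize; the helper name is illustrative):

    use rayon::prelude::*;

    // B[r][c] += A[c][r] for square row-major matrices of size `order`.
    // Each Rayon task owns `tilesize` consecutive rows of B, so no locking is needed.
    fn transpose_add_rayon(order: usize, tilesize: usize, a: &[f64], b: &mut [f64]) {
        let num_tiles = order / tilesize;
        b.par_chunks_exact_mut(tilesize * order) // one band of B rows per task
            .enumerate()
            .for_each(|(row_tile, b_rows)| {
                for col_tile in 0..num_tiles {
                    for (i, bi) in b_rows.chunks_exact_mut(order).enumerate() {
                        let r = row_tile * tilesize + i; // global row index in B
                        for j in 0..tilesize {
                            let c = col_tile * tilesize + j; // global column index in B
                            bi[c] += a[c * order + r];
                        }
                    }
                }
            });
    }

In the actual kernel the per-pass increment of A (every element += 1.0) is kept outside this step as a separate a.par_iter_mut() sweep.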
+// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +use rayon::prelude::*; +use std::env; +use std::mem; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + let mut tilesize: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = 32; + } + 4 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + tilesize = match args[3].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if tilesize > order { + println!("Warning: tilesize cannot be > order, will not use tiling!"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + tilesize = order; + println!("Untiled"); + } + + if order % tilesize != 0 && tilesize < order { + panic!("Cannot use the given tilesize!") + }; + + let num_tiles: usize = order / tilesize; // all tiles have same size + + ///////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ///////////////////////////////////////////////////// + + let 
nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + + // Initialize matrices + for i in 0..order { + for j in 0..order { + a[i * order + j] = (i * order + j) as f64; + } + } + + println!("Initialization done, running algorithm"); + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + // parallelisze outermost loop with rayon + b.par_chunks_exact_mut(tilesize * order) + .enumerate() + // for the current set of row tiles + // and the rows corresponding to this row tile + .for_each(|(row_tile_idx, b_rows)| { + // iterate over all column tiles + (0..num_tiles).for_each(|col_tile_idx| { + // within the tile, iterate over *tilesize* rows of b + // zipped together with rows of b available in the tile + (0..tilesize).zip(b_rows.chunks_exact_mut(order)).for_each( + // bi is the ith row of b + |(row_within_tile, bi)| { + let bi_subset_cols = bi + .get_mut((col_tile_idx * tilesize)..((col_tile_idx + 1) * tilesize)) + .unwrap(); + // within the tile, iterate over *tilesize* columns of b + // zipped together with subset of columns of b + (0..tilesize).zip(bi_subset_cols.iter_mut()).for_each( + |(col_within_tile, b_element)| { + let rowidx: usize = row_tile_idx * tilesize + row_within_tile; + let colidx: usize = col_tile_idx * tilesize + col_within_tile; + *b_element += a[colidx * order + rowidx]; + }, + ) + }, + ) + }) + }); + + // straightforward addition of 1.0 to all elements of A + a.par_iter_mut().for_each(|a_element| { + *a_element += 1.0; + }); + } + + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + /////////////////////////////////////////////// + + let addit: usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr: f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i * order + j; + let ji = j * order + i; + let reference: f64 = (ij * (iterations as usize + 1) + addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon: f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime: f64 = (transpose_time as f64) / (iterations as f64); + let bytes: usize = 2_usize * nelems * mem::size_of::(); + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (bytes as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + abserr, epsilon + ); + return; + } +} From 3036da41ec0e783595c0b54d72d62b72321a8163 Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Tue, 8 Nov 2022 17:07:50 -0600 Subject: [PATCH 259/325] Update nstream-kokkos for kokkos-3.7 compatibility modified: Cxx11/nstream-kokkos.cc --- Cxx11/nstream-kokkos.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index 6ec3528de..340ce1819 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -163,7 +163,7 @@ int main(int argc, char * argv[]) double asum(0); Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) { - using Kokkos::Experimental::fabs; + using Kokkos::fabs; inner += fabs(A(i)); }, asum); 
Kokkos::fence(); From a7687472c15942630b42fca50a68f558afc79f4a Mon Sep 17 00:00:00 2001 From: Sajid Ali Date: Wed, 9 Nov 2022 10:22:31 -0600 Subject: [PATCH 260/325] RUST: clarify naming, and add old blis based dgemm as a separate kernel modified: .gitignore new file: RUST/dgemm-blis/Cargo.toml new file: RUST/dgemm-blis/src/main.rs modified: RUST/dgemm-iter/Cargo.toml modified: RUST/dgemm-rayon/Cargo.toml modified: RUST/nstream-iter/Cargo.toml modified: RUST/nstream-rayon/Cargo.toml modified: RUST/transpose-iter/Cargo.toml modified: RUST/transpose-rayon/Cargo.toml --- .gitignore | 2 + RUST/dgemm-blis/Cargo.toml | 10 ++ RUST/dgemm-blis/src/main.rs | 202 ++++++++++++++++++++++++++++++++ RUST/dgemm-iter/Cargo.toml | 2 +- RUST/dgemm-rayon/Cargo.toml | 2 +- RUST/nstream-iter/Cargo.toml | 2 +- RUST/nstream-rayon/Cargo.toml | 2 +- RUST/transpose-iter/Cargo.toml | 2 +- RUST/transpose-rayon/Cargo.toml | 2 +- 9 files changed, 220 insertions(+), 6 deletions(-) create mode 100644 RUST/dgemm-blis/Cargo.toml create mode 100644 RUST/dgemm-blis/src/main.rs diff --git a/.gitignore b/.gitignore index 1bacbfade..73e16e2da 100644 --- a/.gitignore +++ b/.gitignore @@ -381,6 +381,8 @@ RUST/nstream-rayon/Cargo.lock RUST/nstream-rayon/target/ RUST/dgemm/Cargo.lock RUST/dgemm/target/ +RUST/dgemm-blis/Cargo.lock +RUST/dgemm-blis/target/ RUST/dgemm-iter/Cargo.lock RUST/dgemm-iter/target/ RUST/dgemm-rayon/Cargo.lock diff --git a/RUST/dgemm-blis/Cargo.toml b/RUST/dgemm-blis/Cargo.toml new file mode 100644 index 000000000..3ea994400 --- /dev/null +++ b/RUST/dgemm-blis/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "dgemm-blis" +version = "0.1.0" +authors = ["Jeff Hammond ", "Sajid Ali "] + +edition="2021" + +[dependencies] +cblas = "0.4" +blas-src = { version = "0.8", features = ["blis"] } diff --git a/RUST/dgemm-blis/src/main.rs b/RUST/dgemm-blis/src/main.rs new file mode 100644 index 000000000..83ff6d041 --- /dev/null +++ b/RUST/dgemm-blis/src/main.rs @@ -0,0 +1,202 @@ +// +// Copyright (c) 2013, Intel Corporation +// Copyright (c) 2022, Sajid Ali +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products +// derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////// +// +// NAME: transpose +// +// PURPOSE: This program measures the time for the transpose of a +// column-major stored matrix into a row-major stored matrix. +// +// USAGE: Program input is the matrix order and the number of times to +// repeat the operation: +// +// transpose <# iterations> [tile size] +// +// An optional parameter specifies the tile size used to divide the +// individual matrix blocks for improved cache and TLB performance. +// +// The output consists of diagnostics to make sure the +// transpose worked and timing statistics. +// +// HISTORY: Written by Rob Van der Wijngaart, February 2009. +// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +// +/////////////////////////////////////////////// + +// Need the following to prevent linker errors per +// https://github.com/blas-lapack-rs/blas-lapack-rs.github.io/wiki +extern crate blas_src; + +use std::env; +use std::time::{Duration, Instant}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() { + println!("Parallel Research Kernels"); + println!("Rust Dense matrix-matrix multiplication: C += A x B"); + + /////////////////////////////////////////////// + // Read and test input parameters + /////////////////////////////////////////////// + + let args: Vec = env::args().collect(); + + let iterations: u32; + let order: usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + order = match args[2].parse() { + Ok(n) => n, + Err(_) => { + help(); + return; + } + }; + } + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + + println!("Number of iterations = {}", iterations); + println!("Matrix order = {}", order); + + /////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + /////////////////////////////////////////////// + + let nelems: usize = order * order; + let mut a: Vec = vec![0.0; nelems]; + let mut b: Vec = vec![0.0; nelems]; + let mut c: Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i * order + j] = i as f64; + b[i * order + j] = i as f64; + } + } + + let timer = Instant::now(); + let mut t0: Duration = timer.elapsed(); + + for k in 0..iterations + 1 { + if k == 1 { + t0 = timer.elapsed(); + } + + //prk_dgemm(order, &mut a, &mut b, &mut c); + let m: i32 = order as i32; + let n: i32 = order as i32; + let k: i32 = order as i32; + unsafe { + cblas::dgemm( + cblas::Layout::RowMajor, + cblas::Transpose::None, + cblas::Transpose::None, + m, + n, + k, + 1.0, + &a, + m, + &b, + k, + 1.0, + &mut c, + m, + ); + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt: u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let dgemm_time: f64 = dtt as f64 * 1.0e-9; + + /////////////////////////////////////////////// + // Analyze and output results + 
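The cblas::dgemm call in the dgemm-blis kernel above computes C = alpha*op(A)*op(B) + beta*C; with row-major layout the leading dimension of each operand is its number of columns, which is why a single value (the order) can be passed for all three square matrices. A minimal standalone sketch of the same call, assuming the cblas and blas-src crates from the Cargo.toml above (the helper name is illustrative):

    // Pulls in the BLIS backend so the CBLAS symbols resolve at link time.
    extern crate blas_src;

    // C += A * B for square, row-major n x n matrices.
    fn dgemm_square(n: i32, a: &[f64], b: &[f64], c: &mut [f64]) {
        unsafe {
            cblas::dgemm(
                cblas::Layout::RowMajor,
                cblas::Transpose::None, // op(A) = A
                cblas::Transpose::None, // op(B) = B
                n, n, n,                // m, n, k
                1.0, a, n,              // alpha, A, lda = columns of A
                b, n,                   // B, ldb = columns of B
                1.0, c, n,              // beta, C, ldc = columns of C
            );
        }
    }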
/////////////////////////////////////////////// + + let forder: f64 = order as f64; + let reference: f64 = 0.25 + * (forder * forder * forder) + * (forder - 1.0) + * (forder - 1.0) + * (iterations as f64 + 1.0); + let mut checksum: f64 = 0.0; + for i in 0..order { + for j in 0..order { + checksum += c[i * order + j]; + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", checksum); + } + + let epsilon: f64 = 1.0e-8; + let residuum: f64 = (checksum - reference) / reference; + if residuum < epsilon { + println!("Solution validates"); + let avgtime: f64 = (dgemm_time as f64) / (iterations as f64); + let uorder: usize = order as usize; + let nflops: usize = 2_usize * uorder * uorder * uorder; + println!( + "Rate (MB/s): {:10.3} Avg time (s): {:10.3}", + (1.0e-6_f64) * (nflops as f64) / avgtime, + avgtime + ); + } else { + println!( + "ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", + residuum, epsilon + ); + return; + } +} diff --git a/RUST/dgemm-iter/Cargo.toml b/RUST/dgemm-iter/Cargo.toml index 5714a1fa3..af296857c 100644 --- a/RUST/dgemm-iter/Cargo.toml +++ b/RUST/dgemm-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dgemm" +name = "dgemm-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/dgemm-rayon/Cargo.toml b/RUST/dgemm-rayon/Cargo.toml index 49886cd96..905e888df 100644 --- a/RUST/dgemm-rayon/Cargo.toml +++ b/RUST/dgemm-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dgemm" +name = "dgemm-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/nstream-iter/Cargo.toml b/RUST/nstream-iter/Cargo.toml index 479e87e60..b43f54b10 100644 --- a/RUST/nstream-iter/Cargo.toml +++ b/RUST/nstream-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider "] diff --git a/RUST/nstream-rayon/Cargo.toml b/RUST/nstream-rayon/Cargo.toml index 054caa930..af291bdbf 100644 --- a/RUST/nstream-rayon/Cargo.toml +++ b/RUST/nstream-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider ", "Sajid Ali "] diff --git a/RUST/transpose-iter/Cargo.toml b/RUST/transpose-iter/Cargo.toml index 22fe9074e..840edb129 100644 --- a/RUST/transpose-iter/Cargo.toml +++ b/RUST/transpose-iter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transpose" +name = "transpose-iter" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] diff --git a/RUST/transpose-rayon/Cargo.toml b/RUST/transpose-rayon/Cargo.toml index fa75e1f79..540969f59 100644 --- a/RUST/transpose-rayon/Cargo.toml +++ b/RUST/transpose-rayon/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transpose" +name = "transpose-rayon" version = "0.1.0" authors = ["Jeff Hammond ", "Sajid Ali "] From 929548b9fb5508e56f5e63588f5b2845089b808a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 17 Nov 2022 13:24:42 +0200 Subject: [PATCH 261/325] GCC OpenACC does not support runtime tilesizes --- C1z/transpose-openacc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/C1z/transpose-openacc.c b/C1z/transpose-openacc.c index 0ffc76c8e..8bd66d14d 100644 --- a/C1z/transpose-openacc.c +++ b/C1z/transpose-openacc.c @@ -90,7 +90,11 @@ int main(int argc, char * argv[]) printf("Number of iterations = %d\n", iterations); printf("Matrix order = %d\n", order); +#ifdef __GNUC__ + printf("Tile size = %s\n", "automatic (GCC)"); +#else printf("Tile size = %d\n", tile_size); 
+#endif ////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix @@ -115,7 +119,11 @@ int main(int argc, char * argv[]) if (iter==1) trans_time = prk_wtime(); +#ifdef __GNUC__ + #pragma acc parallel loop tile(*,*) deviceptr(A,B) +#else #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) +#endif for (int i=0;i Date: Thu, 17 Nov 2022 13:25:57 +0200 Subject: [PATCH 262/325] GCC OpenACC does not support runtime tilesizes --- Cxx11/transpose-openacc.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Cxx11/transpose-openacc.cc b/Cxx11/transpose-openacc.cc index 130d424d3..c93e0414c 100644 --- a/Cxx11/transpose-openacc.cc +++ b/Cxx11/transpose-openacc.cc @@ -96,7 +96,11 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; +#ifdef __GNUC__ + std::cout << "Tile size = " << "automatic (GCC)" << std::endl; +#else std::cout << "Tile size = " << tile_size << std::endl; +#endif ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -121,7 +125,11 @@ int main(int argc, char * argv[]) if (iter==1) trans_time = prk::wtime(); +#ifdef __GNUC__ + #pragma acc parallel loop tile(*,*) deviceptr(A,B) +#else #pragma acc parallel loop tile(tile_size,tile_size) deviceptr(A,B) +#endif for (int i=0;i Date: Thu, 17 Nov 2022 13:47:34 +0200 Subject: [PATCH 263/325] fix restrict->RESTRICT --- Cxx11/transpose-openacc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-openacc.cc b/Cxx11/transpose-openacc.cc index c93e0414c..258064534 100644 --- a/Cxx11/transpose-openacc.cc +++ b/Cxx11/transpose-openacc.cc @@ -109,8 +109,8 @@ int main(int argc, char * argv[]) double trans_time{0}; size_t bytes = order*order*sizeof(double); - double * restrict A = (double *)acc_malloc(bytes); - double * restrict B = (double *)acc_malloc(bytes); + double * RESTRICT A = (double *)acc_malloc(bytes); + double * RESTRICT B = (double *)acc_malloc(bytes); { #pragma acc parallel loop deviceptr(A,B) From 793667cbd2eb137c73b1675894cd7ad1f21ce2f5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 7 Nov 2022 08:44:06 +0200 Subject: [PATCH 264/325] add SGEMM CBLAS --- Cxx11/Makefile | 2 +- Cxx11/sgemm-cblas.cc | 340 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 Cxx11/sgemm-cblas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3a50f690a..ee69f1a75 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -133,7 +133,7 @@ thrust: nstream-host-thrust nstream-device-thrust \ cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas sgemm-cublas -cblas: transpose-cblas dgemm-cblas +cblas: transpose-cblas dgemm-cblas sgemm-cblas onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl diff --git a/Cxx11/sgemm-cblas.cc b/Cxx11/sgemm-cblas.cc new file mode 100644 index 000000000..625ce693f --- /dev/null +++ b/Cxx11/sgemm-cblas.cc @@ -0,0 +1,340 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// Copyright (c) 2021, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: sgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblas_sgemm() +/// cblas_sgemm_batch() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#if defined(MKL) +#include +#ifdef MKL_ILP64 +#error Use the MKL library for 32-bit integers! +#endif +#elif defined(ACCELERATE) +// The location of cblas.h is not in the system include path when -framework Accelerate is provided. 
+#include +#else +#include +#endif + +#ifdef _OPENMP +#include +#endif + +#ifdef PRK_DEBUG +#include +void prk_sgemm_loops(const int order, + const std::vector & A, + const std::vector & B, + std::vector & C) +{ + for (int i=0; i & A, + const std::vector & B, + std::vector & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, alpha, A.data(), n, B.data(), n, beta, C.data(), n); +} + +void prk_sgemm(const int order, const int batches, + const std::vector> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + + for (int b=0; b> & A, + const std::vector> & B, + std::vector> & C) +{ + const int n = order; + const float alpha = 1.0; + const float beta = 1.0; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic) num_threads(nt) +#endif + for (int b=0; b [ ]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + batches = std::atoi(argv[3]); + } + + if (argc>4) { + batch_threads = std::atoi(argv[4]); + } else { +#ifdef _OPENMP + batch_threads = omp_get_max_threads(); +#endif + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches > 0) { +#ifdef MKL + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; +#else + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; +#endif + } else if (batches < 0) { + if (batch_threads > 1) { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " << batch_threads << " threads)" << std::endl; + } else { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double gemm_time(0); + + const int matrices = (batches==0 ? 1 : abs(batches)); + + std::vector const M(order*order,0); + std::vector> A(matrices,M); + std::vector> B(matrices,M); + std::vector> C(matrices,M); + for (int b=0; b 0) { + prk_sgemm(order, matrices, pA, pB, pC); + } + } + gemm_time = prk::wtime() - gemm_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double residuum(0); + for (int b=0; b Date: Mon, 23 Jan 2023 14:58:18 +0200 Subject: [PATCH 265/325] add shmem4py (#618) * add shmem4py example * better install directions * shmem alltoall behaves different than mpi alltoall, so we have to add a barrier. 
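That barrier-before-alltoall requirement is the key behavioral difference this patch works around. A minimal shmem4py sketch of the pattern, assuming only the calls this patch itself uses (shmem.barrier_all, shmem.alltoall, shmem.full/zeros, shmem.free); buffer shapes here are illustrative only:

```python
from shmem4py import shmem

npes = shmem.n_pes()

# symmetric buffers: each PE contributes one 4-element chunk per PE
src = shmem.full(4 * npes, float(shmem.my_pe()))
dst = shmem.zeros(4 * npes)

# unlike MPI_Alltoall, the SHMEM alltoall used here does not by itself
# guarantee that every PE has finished filling src, so synchronize first
shmem.barrier_all()
shmem.alltoall(dst, src)   # dst receives one chunk from every PE

shmem.barrier_all()        # quiesce before reusing or freeing the buffers
shmem.free(src)
shmem.free(dst)
```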
* abort does not flush print Signed-off-by: Jeff Hammond Co-authored-by: Lisandro Dalcin --- PYTHON/README.md | 22 ++++ PYTHON/nstream-numpy-shmem.py | 175 ++++++++++++++++++++++++ PYTHON/stencil-numpy-mpi.py | 0 PYTHON/transpose-numpy-mpi-rma.py | 11 +- PYTHON/transpose-numpy-mpi.py | 5 +- PYTHON/transpose-numpy-shmem.py | 212 ++++++++++++++++++++++++++++++ 6 files changed, 415 insertions(+), 10 deletions(-) create mode 100755 PYTHON/nstream-numpy-shmem.py mode change 100644 => 100755 PYTHON/stencil-numpy-mpi.py create mode 100755 PYTHON/transpose-numpy-shmem.py diff --git a/PYTHON/README.md b/PYTHON/README.md index 7f670436f..9c624b775 100644 --- a/PYTHON/README.md +++ b/PYTHON/README.md @@ -1,5 +1,7 @@ # How to run +## mpi4py + ``` mpiexec -n 4 python3 -m mpi4py nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 python3 -m mpi4py transpose-numpy-mpi.py 10 1000 @@ -11,3 +13,23 @@ On Mac with Homebrew, this might work better: mpiexec -n 4 ./nstream-numpy-mpi.py 10 10000000 mpiexec -n 4 ./transpose-numpy-mpi.py 10 1000 ``` + +## shmem4py + +Checkout shmem4py and build against e.g. SOS like this: +``` +$ export OSHCC=oshcc +$ python3 -m pip install . +``` + +Run like this: +``` +$ oshrun -n 4 python3 nstream-numpy-shmem.py 10 10000000 +Parallel Research Kernels version +Python SHMEM/NumPy STREAM triad: A = B + scalar * C +Number of ranks = 4 +Number of iterations = 10 +Vector length = 10000000 +Solution validates +Rate (MB/s): 22345.12038433607 Avg time (s): 0.0143208 +``` diff --git a/PYTHON/nstream-numpy-shmem.py b/PYTHON/nstream-numpy-shmem.py new file mode 100755 index 000000000..3b42f0488 --- /dev/null +++ b/PYTHON/nstream-numpy-shmem.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
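As a quick arithmetic check of the sample rate shown in the README above: with a vector length of 10^7, the kernel counts nbytes = 4 * 10^7 * 8 = 3.2e8 bytes moved per iteration, and 1.0e-6 * 3.2e8 / 0.0143208 s ≈ 22345 MB/s, which matches the printed "Rate (MB/s)" value.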
+ +#******************************************************************* +# +# NAME: nstream +# +# PURPOSE: To compute memory bandwidth when adding a vector of a given +# number of double precision values to the scalar multiple of +# another vector of the same length, and storing the result in +# a third vector. +# +# USAGE: The program takes as input the number +# of iterations to loop over the triad vectors, the length of the +# vectors, and the offset between vectors +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# NOTES: Bandwidth is determined as the number of words read, plus the +# number of words written, times the size of the words, divided +# by the execution time. For a vector length of N, the total +# number of words read and written is 4*N*sizeof(double). +# +# +# HISTORY: This code is loosely based on the Stream benchmark by John +# McCalpin, but does not follow all the Stream rules. Hence, +# reported results should not be associated with Stream in +# external publications +# +# Converted to Python by Jeff Hammond, October 2017. +# +# ******************************************************************* + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy STREAM triad: A = B + scalar * C') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: python nstream.py <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + total_length = int(sys.argv[2]) + if total_length < 1: + sys.exit("ERROR: length must be positive") + + length = int(total_length / np) + remainder = total_length % np + if (remainder > 0): + if (me < remainder): + length += 1 + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Vector length = ', total_length) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and execute STREAM triad + # ******************************************************************** + + # 0.0 is a float, which is 64b (53b of precision) + A = numpy.zeros(length) + B = numpy.full(length,2.0) + C = numpy.full(length,2.0) + + scalar = 3.0 + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + A += B + scalar * C + + + shmem.barrier_all() + t1 = timer() + nstream_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + ar = 0.0 + br = 2.0 + cr = 2.0 + ref = 0.0 + for k in range(0,iterations+1): + ar += br + scalar * cr + + ar *= total_length + + #asum = numpy.linalg.norm(A, ord=1) + #shmem.reduce(asum) + + asum = numpy.linalg.norm(A, ord=1) + src = shmem.full(1, asum) + tgt = shmem.full(1, 0.0) + shmem.reduce(tgt,src) + asum = tgt + + epsilon=1.e-8 + if abs(ar-asum)/asum > epsilon: + if (me==0): + print('Failed Validation on output array'); + print(' Expected checksum: ',ar); + print(' Observed checksum: ',asum); + print("ERROR: solution did not validate") + else: + if (me==0): + print('Solution validates') + avgtime = nstream_time/iterations + nbytes = 4.0 * total_length * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + + +if __name__ == '__main__': + main() diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py index efa3ca359..b064596ee 100755 --- a/PYTHON/transpose-numpy-mpi-rma.py +++ b/PYTHON/transpose-numpy-mpi-rma.py @@ -159,8 +159,10 @@ def main(): for phase in range(0,np): recv_from = (me + phase) % np bsize = block_order * block_order - WA.Get(T, recv_from, [bsize * recv_from, bsize, MPI.DOUBLE]) - WA.Flush_all() + #WA.Get(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + #WA.Flush(recv_from) + r = WA.Rget(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + r.Wait() lo = block_order * recv_from hi = block_order * (recv_from+1) @@ -200,10 +202,7 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - print("ERROR: solution did not validate") - comm.Abort() - #sys.exit("ERROR: solution did not validate") - + sys.exit("ERROR: solution did not validate") if __name__ == '__main__': main() diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py index 5dacbd5ea..d0413f52f 100755 --- a/PYTHON/transpose-numpy-mpi.py +++ b/PYTHON/transpose-numpy-mpi.py @@ -190,10 +190,7 @@ def main(): else: if (me==0): print('error ',abserr, ' exceeds threshold ',epsilon) - print("ERROR: solution did not validate") - comm.Abort() - #sys.exit("ERROR: solution did not validate") - + sys.exit("ERROR: solution did not validate") if __name__ == '__main__': main() diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py new file mode 100755 index 000000000..1495dec53 --- /dev/null +++ b/PYTHON/transpose-numpy-shmem.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. +# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. 
+# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./transpose <# iterations> ") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + order = int(sys.argv[2]) + if order < 1: + sys.exit("ERROR: order must be >= 1") + + if order % np != 0: + sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + A = shmem.full((order,block_order),LA) + B = shmem.zeros((order,block_order)) + T = shmem.zeros((order,block_order)) + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + # this actually forms the transpose of A + #B += numpy.transpose(A) + # this only uses the transpose _view_ of A + #B += A.T + + # barrier required before alltoall for correctness + shmem.barrier_all() + shmem.alltoall(T, A) + for r in range(0,np): + lo = block_order * r + hi = block_order * (r+1) + #B[lo:hi,:] += numpy.transpose(T[lo:hi,:]) + B[lo:hi,:] += T[lo:hi,:].T + + A += 1.0 + + shmem.barrier_all() + t1 = timer() + trans_time = t1 - t0 + + shmem.free(A) + shmem.free(T) + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = shmem.zeros((np,order,block_order)) + shmem.fcollect(F,B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + shmem.free(B) + shmem.free(F) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. 
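A short derivation of the reference matrix H computed just above, following the update pattern of the timed loop: each of the iterations+1 sweeps adds the transpose of the current A into B and then increments every element of A by 1, so B[i,j] = sum_{k=0..iterations} (A0^T[i,j] + k) = (A0^T[i,j] + iterations/2) * (iterations + 1). Since the initial global matrix is A0[i,j] = order*i + j, the transposed entry is A0^T[i,j] = order*j + i, which is exactly the fromfunction expression used for H.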
+ if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() From e39cd7ced7e3118352b88ab18c834111e95b3004 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Jan 2023 14:05:49 +0200 Subject: [PATCH 266/325] rename (#619) --- FORTRAN/Makefile | 2 +- FORTRAN/{nstream-cufortran.cuf => nstream-cufortran.F90} | 0 FORTRAN/{transpose-cufortran.cuf => transpose-cufortran.F90} | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename FORTRAN/{nstream-cufortran.cuf => nstream-cufortran.F90} (100%) rename FORTRAN/{transpose-cufortran.cuf => transpose-cufortran.F90} (100%) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 6d3b0c1f1..625490385 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -141,7 +141,7 @@ dgemm-blas: dgemm-blas.F90 prk.mod %-openacc: %-openacc.F90 prk.mod $(FC) $(FCFLAGS) $(OPENACCFLAG) $< prk_mod.o -o $@ -%-cufortran: %-cufortran.cuf prk.mod +%-cufortran: %-cufortran.F90 prk.mod $(FC) $(FCFLAGS) $(CUFORTFLAG) $< prk_mod.o -o $@ %-stdpar: %-stdpar.F90 prk.mod diff --git a/FORTRAN/nstream-cufortran.cuf b/FORTRAN/nstream-cufortran.F90 similarity index 100% rename from FORTRAN/nstream-cufortran.cuf rename to FORTRAN/nstream-cufortran.F90 diff --git a/FORTRAN/transpose-cufortran.cuf b/FORTRAN/transpose-cufortran.F90 similarity index 100% rename from FORTRAN/transpose-cufortran.cuf rename to FORTRAN/transpose-cufortran.F90 From a8c9d697317b21860039a72d6937968223ad81d7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 16:03:46 +0200 Subject: [PATCH 267/325] fix name --- RUST/nstream-unsafe/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RUST/nstream-unsafe/Cargo.toml b/RUST/nstream-unsafe/Cargo.toml index 479e87e60..81a229d01 100644 --- a/RUST/nstream-unsafe/Cargo.toml +++ b/RUST/nstream-unsafe/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "nstream" +name = "nstream-unsafe" version = "0.1.0" authors = ["Jeff Hammond ", "Thomas Hayward-Schneider "] From 02937f26557fd35738016ff8bc788e8da26a2848 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Nov 2022 16:27:15 +0200 Subject: [PATCH 268/325] add dgemm-blis --- RUST/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RUST/Makefile b/RUST/Makefile index f72474c64..3516f35f1 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -23,6 +23,7 @@ all: cd transpose-iter && cargo build $(RCFLAGS) cd transpose-rayon && cargo build $(RCFLAGS) cd dgemm && cargo build $(RCFLAGS) + cd dgemm-blis && cargo build $(RCFLAGS) cd dgemm-iter && cargo build $(RCFLAGS) cd dgemm-rayon && cargo build $(RCFLAGS) clean: @@ -36,5 +37,6 @@ clean: cd transpose-iter && cargo clean cd transpose-rayon && cargo clean cd dgemm && cargo clean + cd dgemm-blis && cargo clean cd dgemm-iter && cargo clean cd dgemm-rayon && cargo clean From 98ad8948ad85599a836160a3aa6c789dba3618c9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 Mar 2023 09:55:48 +0200 Subject: [PATCH 269/325] dunno --- RUST/dgemm-blis/Cargo.toml | 2 +- common/make.defs.gcc | 55 +++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/RUST/dgemm-blis/Cargo.toml b/RUST/dgemm-blis/Cargo.toml index 3ea994400..249b6fb8c 100644 --- a/RUST/dgemm-blis/Cargo.toml +++ b/RUST/dgemm-blis/Cargo.toml @@ -1,6 +1,6 @@ [package] 
name = "dgemm-blis" -version = "0.1.0" +version = "0.5.0" authors = ["Jeff Hammond ", "Sajid Ali "] edition="2021" diff --git a/common/make.defs.gcc b/common/make.defs.gcc index afcf1a6ae..62e540298 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-11 +VERSION=-10 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -43,15 +43,15 @@ OPENACCFLAG=-fopenacc # OpenCL flags # # MacOS -OPENCLFLAG=-framework OpenCL +#OPENCLFLAG=-framework OpenCL # POCL # http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux -#OPENCLDIR=/etc/alternatives/opencl-intel-tools -#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +OPENCLDIR=/etc/alternatives/opencl-intel-tools +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations -METALFLAG=-framework MetalPerformanceShaders +#METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # @@ -98,19 +98,17 @@ METALFLAG=-framework MetalPerformanceShaders # # hipSYCL # -SYCLDIR=/opt/hipSYCL -SYCLCXX=${SYCLDIR}/bin/syclcc-clang -SYCLFLAG=-std=c++17 -O3 -SYCLFLAG+=-DHIPSYCL +#SYCLDIR=/opt/hipSYCL +#SYCLCXX=${SYCLDIR}/bin/syclcc-clang +#SYCLFLAG=-std=c++17 -O3 +#SYCLFLAG+=-DHIPSYCL # CPU platform -SYCLFLAG+=--hipsycl-platform=cpu -SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib +#SYCLFLAG+=--hipsycl-platform=cpu +#SYCLFLAG+=-Wl,-rpath=/opt/hipSYCL/llvm/lib # -CELERITYDIR=${SYCLDIR} -CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor -CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime -MPIINC=-I/usr/include/mpich-3.2-x86_64 -MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi +#CELERITYDIR=${SYCLDIR} +#CELERITYINC=-I$(CELERITYDIR)/include/celerity -I$(CELERITYDIR)/include/celerity/vendor +#CELERITYLIB=-L$(CELERITYDIR)/lib -lcelerity_runtime # # OCCA # @@ -162,19 +160,19 @@ UPCXXFLAG+=-mtune=native -ffast-math # #BLASFLAG=-L${HOME}/BLIS/lib -lblis #-fopenmp -lpthread #CBLASFLAG=-I${HOME}/BLIS/include -BLASFLAG=-DACCELERATE -framework Accelerate -CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions -#BLASFLAG=-lblas -#`CBLASFLAG=-lblas +#BLASFLAG=-DACCELERATE -framework Accelerate +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +BLASFLAG=-lblas +CBLASFLAG=-lblas # # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -NVCC=nvcc +NVCC=/usr/local/cuda-11.4/bin/nvcc CUDAFLAGS=-g -O3 -std=c++11 -CUDAFLAGS+=-arch=sm_50 +CUDAFLAGS+=-arch=sm_87 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 #CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # @@ -205,10 +203,10 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.4 -MPICC=${MPIDIR}/bin/mpicc -MPICXX=${MPIDIR}/bin/mpicxx -MPIFORT=${MPIDIR}/bin/mpifort +MPIDIR=/usr +MPICC=${MPIDIR}/bin/mpicc.mpich +MPICXX=${MPIDIR}/bin/mpicxx.mpich +MPIFORT=${MPIDIR}/bin/mpifort.mpich MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi #MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi @@ -241,7 +239,8 @@ PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib # single-node #COARRAYFLAG=-fcoarray=single -lcaf_single # multi-node -COARRAYFLAG=-fcoarray=lib 
-L/opt/homebrew/lib -lcaf_mpi +#COARRAYFLAG=-fcoarray=lib -L/opt/homebrew/lib -lcaf_mpi +COARRAYFLAG=-fcoarray=lib -L/usr/lib/x86_64-linux-gnu/open-coarrays/mpich/lib -lcaf_mpi # # MEMKIND (used in C1z) # From 53c10ce02dbd3fd6d4bb2fd1028bd5c8987ff7cc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 31 Mar 2023 15:33:58 +0300 Subject: [PATCH 270/325] gcc apple update for ventura (#623) --- common/make.defs.gcc | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 62e540298..2f52fa0c6 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-10 +VERSION=-12 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -186,16 +186,6 @@ HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide HALIDEFLAG+=${DEFAULT_OPT_FLAGS} HALIDEFLAG+=-std=c++17 # -# Halide -# -HALIDECXX=${CXX} -HALIDEDIR=/opt/halide/Halide-10.0.0-x86-64-linux -HALIDEFLAG=-I${HALIDEDIR}/include -HALIDEFLAG+=-Wl,-rpath=${HALIDEDIR}/lib -L${HALIDEDIR}/lib -lHalide -#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 -HALIDEFLAG+=${DEFAULT_OPT_FLAGS} -HALIDEFLAG+=-std=c++17 -# # ISPC # ISPC=ispc @@ -203,10 +193,10 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # MPI-3 # -MPIDIR=/usr -MPICC=${MPIDIR}/bin/mpicc.mpich -MPICXX=${MPIDIR}/bin/mpicxx.mpich -MPIFORT=${MPIDIR}/bin/mpifort.mpich +MPIDIR=/opt/homebrew/Cellar/open-mpi/4.1.5 +MPICC=${MPIDIR}/bin/mpicc +MPICXX=${MPIDIR}/bin/mpicxx +MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi_usempif08 -lmpi #MPILIB=-L${MPIDIR}/lib -lmpifort -lmpi From f3a392e609078f3a23715c609ab5069e1a4fe961 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Apr 2023 11:13:26 +0300 Subject: [PATCH 271/325] brew tbb update --- common/make.defs.gcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 2f52fa0c6..1d6dd89e8 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -121,7 +121,7 @@ OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/opt/homebrew/Cellar/tbb/2020_U3_1 +TBBDIR=/opt/homebrew/Cellar/tbb/2021.8.0 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb From 7925db02a5ebe6d65a1b49348e31151b4a01668a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Apr 2023 11:23:24 +0300 Subject: [PATCH 272/325] add flang-new docs --- doc/flang-new.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 doc/flang-new.md diff --git a/doc/flang-new.md b/doc/flang-new.md new file mode 100644 index 000000000..fe47c6e13 --- /dev/null +++ b/doc/flang-new.md @@ -0,0 +1,6 @@ +This works, but -flang-experimental-exec` and `-Wall` are ignored. 
+ +``` +/opt/llvm/latest/bin/flang-new -flang-experimental-exec -g -O3 -ffast-math -Wall -DRADIUS=2 -DSTAR -c p2p.F90 +ld -L /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib -lSystem p2p.o prk_mod.o -o p2p /opt/llvm/latest/lib/libFortran*a +``` From 252bbb5e047cd4e95e550597b7afe6c54170abf8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 19 Jul 2023 11:04:29 +0300 Subject: [PATCH 273/325] fix petsc transpose - closes #615 (#626) --- C1z/nstream-petsc.c | 2 +- C1z/transpose-petsc.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/C1z/nstream-petsc.c b/C1z/nstream-petsc.c index 93b931872..aec86ff5c 100644 --- a/C1z/nstream-petsc.c +++ b/C1z/nstream-petsc.c @@ -119,7 +119,7 @@ int main(int argc, char * argv[]) #endif PetscPrintf(PETSC_COMM_WORLD,"Number of processes = %d\n", np); PetscPrintf(PETSC_COMM_WORLD,"Number of iterations = %d\n", iterations); - PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", length); + PetscPrintf(PETSC_COMM_WORLD,"Vector length = %zu\n", (size_t)length); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation diff --git a/C1z/transpose-petsc.c b/C1z/transpose-petsc.c index ed631cf88..aa3219c68 100644 --- a/C1z/transpose-petsc.c +++ b/C1z/transpose-petsc.c @@ -118,10 +118,7 @@ int main(int argc, char * argv[]) double trans_time = 0.0; - PetscReal zero = 0; PetscReal one = 1; - PetscReal two = 2; - PetscReal three = 3; Mat A; Mat B; @@ -144,6 +141,8 @@ int main(int argc, char * argv[]) } } ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); + // https://petsc.org/main/manualpages/Mat/MatTransposeSetPrecursor/ + ierr = MatTransposeSetPrecursor(A, AT); CHKERRQ(ierr); // B[i,j] = 0 #if 0 @@ -196,9 +195,9 @@ int main(int argc, char * argv[]) // Analyze and output results ////////////////////////////////////////////////////////////////////// - PetscReal addit = (iterations+1)*(iterations)/2; PetscReal abserr = 0; #if 0 + PetscReal addit = (iterations+1)*(iterations)/2; for (int j=0; j Date: Wed, 19 Jul 2023 14:47:55 -0400 Subject: [PATCH 274/325] Update Intel SYCL compiler driver. Update device selectors and accessors to SYCL2020. (#629) Signed-off-by: James Brodman --- Cxx11/dgemm-onemkl.cc | 2 +- Cxx11/dgemm-sycl.cc | 8 +++--- Cxx11/generate-sycl-stencil.py | 4 +-- Cxx11/nstream-dpcpp.cc | 2 +- Cxx11/nstream-onedpl.cc | 2 +- Cxx11/nstream-onemkl.cc | 2 +- Cxx11/nstream-sycl-explicit-usm.cc | 23 ++--------------- Cxx11/nstream-sycl-explicit.cc | 38 +++++++--------------------- Cxx11/nstream-sycl-usm.cc | 23 ++--------------- Cxx11/nstream-sycl.cc | 30 ++++------------------ Cxx11/p2p-hyperplane-sycl.cc | 4 +-- Cxx11/pic-sycl.cc | 15 +++++------ Cxx11/prk_sycl.h | 2 -- Cxx11/stencil-2d-sycl.cc | 27 +++----------------- Cxx11/stencil-sycl-usm.cc | 23 ++--------------- Cxx11/stencil-sycl.cc | 27 +++----------------- Cxx11/stencil_sycl.hpp | 40 +++++++++++++++--------------- Cxx11/transpose-2d-sycl.cc | 27 +++----------------- Cxx11/transpose-dpcpp.cc | 2 +- Cxx11/transpose-sycl-usm.cc | 23 ++--------------- Cxx11/transpose-sycl.cc | 27 +++----------------- Cxx11/xgemm-onemkl.cc | 21 ++-------------- common/make.defs.oneapi | 2 +- 23 files changed, 80 insertions(+), 294 deletions(-) diff --git a/Cxx11/dgemm-onemkl.cc b/Cxx11/dgemm-onemkl.cc index d1f9b65ec..0ebccd128 100644 --- a/Cxx11/dgemm-onemkl.cc +++ b/Cxx11/dgemm-onemkl.cc @@ -126,7 +126,7 @@ int main(int argc, char * argv[]) } std::cout << "Input copy = " << (input_copy ? 
"yes" : "no") << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/dgemm-sycl.cc b/Cxx11/dgemm-sycl.cc index a7ca3dd4f..dda801652 100644 --- a/Cxx11/dgemm-sycl.cc +++ b/Cxx11/dgemm-sycl.cc @@ -73,9 +73,9 @@ void prk_dgemm(sycl::queue & q, { q.submit([&](sycl::handler& h) { - auto A = d_A.get_access(h); - auto B = d_B.get_access(h); - auto C = d_C.get_access(h); + sycl::accessor A(d_A, h, sycl::read_only); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h); h.parallel_for( sycl::range<2>{order,order}, [=] (sycl::id<2> it) { @@ -130,7 +130,7 @@ int main(int argc, char * argv[]) return 1; } - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); if (tile_size < order) { diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index c67f2d124..9a28bdb2e 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -26,8 +26,8 @@ def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('{\n') src.write(' q.submit([&](sycl::handler& h) {\n') if (not usm): - src.write(' auto in = d_in.template get_access(h);\n') - src.write(' auto out = d_out.template get_access(h);\n') + src.write(' sycl::accessor in(d_in, h, sycl::read_only);\n') + src.write(' sycl::accessor out(d_out, h);\n') if (dim==2): for r in range(1,radius+1): src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') diff --git a/Cxx11/nstream-dpcpp.cc b/Cxx11/nstream-dpcpp.cc index efc0fcaf3..4306adc12 100644 --- a/Cxx11/nstream-dpcpp.cc +++ b/Cxx11/nstream-dpcpp.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Vector length = " << length << std::endl; std::cout << "Block size = " << block_size << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); size_t padded_length = block_size * prk::divceil(length,block_size); diff --git a/Cxx11/nstream-onedpl.cc b/Cxx11/nstream-onedpl.cc index 963683945..8cd48fc2a 100644 --- a/Cxx11/nstream-onedpl.cc +++ b/Cxx11/nstream-onedpl.cc @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/nstream-onemkl.cc b/Cxx11/nstream-onemkl.cc index 0c69f9808..55448ec74 100644 --- a/Cxx11/nstream-onemkl.cc +++ b/Cxx11/nstream-onemkl.cc @@ -106,7 +106,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; - sycl::queue q(sycl::default_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::default_selector_v, sycl::property::queue::in_order{}); ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation diff --git a/Cxx11/nstream-sycl-explicit-usm.cc b/Cxx11/nstream-sycl-explicit-usm.cc index aa5c5c690..cf5f9f89a 100644 --- a/Cxx11/nstream-sycl-explicit-usm.cc +++ b/Cxx11/nstream-sycl-explicit-usm.cc @@ -275,7 +275,7 @@ int main(int argc, char * argv[]) 
////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -294,26 +294,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index adf045d32..e7cf0bd57 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -100,15 +100,15 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) sycl::buffer d_C { sycl::range<1>{length} }; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::no_init); h.fill(A,(T)0); }); q.submit([&](sycl::handler& h) { - sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::no_init); h.fill(B,(T)2); }); q.submit([&](sycl::handler& h) { - sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor C(d_C, h, sycl::no_init); h.fill(C,(T)2); }); q.wait(); @@ -118,10 +118,9 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& h) { - - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h, sycl::read_only); if (block_size == 0) { // hipSYCL prefers range to nd_range because no barriers @@ -164,7 +163,7 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) nstream_time = prk::wtime() - nstream_time; q.submit([&](sycl::handler& h) { - sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::read_only); h.copy(A,h_A.data()); }); q.wait(); @@ -268,26 +267,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::cpu_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -306,7 +286,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, 
iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index e872a5130..cc2865324 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -253,7 +253,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -272,26 +272,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 140125f9d..902291315 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -106,10 +106,9 @@ void run(sycl::queue & q, int iterations, size_t length, size_t block_size) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& h) { - - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h, sycl::read_only); + sycl::accessor C(d_C, h, sycl::read_only); if (block_size == 0) { // hipSYCL prefers range to nd_range because no barriers @@ -250,26 +249,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, length, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, length, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::cpu_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE @@ -288,7 +268,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, length, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index 1e2083982..5611a84a7 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -148,7 +148,7 @@ int main(int argc, char* argv[]) q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + sycl::accessor grid(d_grid, h); unsigned begin = std::max(2,i-n+2); unsigned end = std::min(i,n)+1; @@ -172,7 +172,7 @@ int main(int argc, char* argv[]) } q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + sycl::accessor grid(d_grid, h); h.single_task([=] { grid[0*n+0] = -grid[(n-1)*n+(n-1)]; diff --git a/Cxx11/pic-sycl.cc b/Cxx11/pic-sycl.cc 
index c55e5f4ff..b47572ba7 100644 --- a/Cxx11/pic-sycl.cc +++ b/Cxx11/pic-sycl.cc @@ -523,14 +523,12 @@ int main(int argc, char ** argv) { std::string devname = (devchar==NULL ? "None" : devchar); sycl::device d; if (devname == "CPU") { - d = sycl::cpu_selector{}.select_device(); + d = sycl::device{sycl::cpu_selector_v}; } else if (devname == "GPU") { - d = sycl::gpu_selector{}.select_device(); - } else if (devname == "HOST") { - d = sycl::host_selector{}.select_device(); + d = sycl::device{sycl::gpu_selector_v}; } else { - std::cout << "PRK_DEVICE should be CPU, GPU or HOST" << std::endl; - d = sycl::default_selector{}.select_device(); + std::cout << "PRK_DEVICE should be CPU or GPU" << std::endl; + d = sycl::device{sycl::default_selector_v}; } sycl::queue q(d); prk::SYCL::print_device_platform(q); @@ -603,9 +601,8 @@ int main(int argc, char ** argv) { /* Calculate forces on particles and update positions */ q.submit([&](sycl::handler& cgh) { - - auto p = d_particles.get_access(cgh); - auto q = d_Qgrid.get_access(cgh); + sycl::accessor p(d_particles, cgh); + sycl::accessor q(d_Qgrid, cgh, sycl::read_only); cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_work_size), sycl::range<1>(local_work_size)), [=] (sycl::nd_item<1> item) { auto i = item.get_global_id(0); diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h index 8d37e489d..f70516d89 100644 --- a/Cxx11/prk_sycl.h +++ b/Cxx11/prk_sycl.h @@ -6,8 +6,6 @@ #include "CL/sycl.hpp" -namespace sycl = cl::sycl; - #if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) && defined(__LIBSYCL_PATCH_VERSION) # define __LIBSYCL_VERSION \ (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 + __LIBSYCL_PATCH_VERSION) diff --git a/Cxx11/stencil-2d-sycl.cc b/Cxx11/stencil-2d-sycl.cc index b6eeb09bc..b945e9ad7 100644 --- a/Cxx11/stencil-2d-sycl.cc +++ b/Cxx11/stencil-2d-sycl.cc @@ -123,7 +123,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.submit([&](sycl::handler& h) { // accessor methods - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); @@ -142,7 +142,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.wait(); q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); // Add constant to solution to force refresh of neighbor data, if any h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { sycl::id<2> xy = it.get_id(); @@ -278,7 +278,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -297,26 +297,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); 
run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc index b219b24f1..3f4a687fd 100644 --- a/Cxx11/stencil-sycl-usm.cc +++ b/Cxx11/stencil-sycl-usm.cc @@ -267,7 +267,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q(sycl::host_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::cpu_selector_v, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -286,26 +286,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q(sycl::cpu_selector{}, sycl::property::queue::in_order{}); - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q(sycl::gpu_selector{}, sycl::property::queue::in_order{}); + sycl::queue q(sycl::gpu_selector_v, sycl::property::queue::in_order{}); prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 8947c8dee..f5eb3f6f5 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -121,7 +121,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star sycl::buffer d_out { h_out.data(), h_out.size() }; q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::nd_range{global, local}, [=](sycl::nd_item<2> it) { const size_t i = it.get_global_id(0); const size_t j = it.get_global_id(1); @@ -140,7 +140,7 @@ void run(sycl::queue & q, int iterations, size_t n, size_t block_size, bool star q.wait(); q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); + sycl::accessor in(d_in, h); h.parallel_for>(sycl::nd_range{global, local}, [=](sycl::nd_item<2> it) { const size_t i = it.get_global_id(0); const size_t j = it.get_global_id(1); @@ -276,7 +276,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE @@ -295,26 +295,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, n, block_size, star, radius); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, n, block_size, star, radius); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, n, block_size, star, radius); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 64af40b79..5339a6826 100644 --- a/Cxx11/stencil_sycl.hpp 
+++ b/Cxx11/stencil_sycl.hpp @@ -5,8 +5,8 @@ template void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) { const auto i = it[0] + 1; const auto j = it[1] + 1; @@ -25,8 +25,8 @@ template void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); sycl::id<2> dx1(sycl::range<2> {1,0}); sycl::id<2> dy1(sycl::range<2> {0,1}); h.parallel_for>(sycl::range<2> {n-1,n-1}, [=] (sycl::item<2> it) { @@ -64,8 +64,8 @@ template void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); h.parallel_for>(sycl::range<2> {n-2,n-2}, [=] (sycl::item<2> it) { const auto i = it[0] + 2; const auto j = it[1] + 2; @@ -88,8 +88,8 @@ template void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); sycl::id<2> dx1(sycl::range<2> {1,0}); sycl::id<2> dy1(sycl::range<2> {0,1}); sycl::id<2> dx2(sycl::range<2> {2,0}); @@ -137,8 +137,8 @@ template void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); h.parallel_for>(sycl::range<2> {n-3,n-3}, [=] (sycl::item<2> it) { const auto i = it[0] + 3; const auto j = it[1] + 3; @@ -165,8 +165,8 @@ template void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); sycl::id<2> dx1(sycl::range<2> {1,0}); sycl::id<2> dy1(sycl::range<2> {0,1}); sycl::id<2> dx2(sycl::range<2> {2,0}); @@ -224,8 +224,8 @@ template void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); h.parallel_for>(sycl::range<2> {n-4,n-4}, [=] (sycl::item<2> it) { const auto i = it[0] + 4; const auto j = it[1] + 4; @@ -256,8 +256,8 @@ template void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); sycl::id<2> dx1(sycl::range<2> {1,0}); sycl::id<2> dy1(sycl::range<2> {0,1}); sycl::id<2> dx2(sycl::range<2> {2,0}); @@ -325,8 +325,8 @@ template void star5(sycl::queue 
& q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); h.parallel_for>(sycl::range<2> {n-5,n-5}, [=] (sycl::item<2> it) { const auto i = it[0] + 5; const auto j = it[1] + 5; @@ -361,8 +361,8 @@ template void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { q.submit([&](sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); + sycl::accessor in(d_in, h, sycl::read_only); + sycl::accessor out(d_out, h); sycl::id<2> dx1(sycl::range<2> {1,0}); sycl::id<2> dy1(sycl::range<2> {0,1}); sycl::id<2> dx2(sycl::range<2> {2,0}); diff --git a/Cxx11/transpose-2d-sycl.cc b/Cxx11/transpose-2d-sycl.cc index 2fbe8938b..55d3b8393 100644 --- a/Cxx11/transpose-2d-sycl.cc +++ b/Cxx11/transpose-2d-sycl.cc @@ -91,8 +91,8 @@ void run(sycl::queue & q, int iterations, size_t order, size_t block_size) q.submit([&](sycl::handler& h) { // accessor methods - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h); h.parallel_for>( #if PREBUILD_KERNEL @@ -214,7 +214,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); #ifndef DPCPP_NO_DOUBLE @@ -233,26 +233,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, order, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, order, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/transpose-dpcpp.cc b/Cxx11/transpose-dpcpp.cc index efdb159e3..ccd1403e0 100644 --- a/Cxx11/transpose-dpcpp.cc +++ b/Cxx11/transpose-dpcpp.cc @@ -96,7 +96,7 @@ int main(int argc, char * argv[]) std::cout << "Matrix order = " << order << std::endl; std::cout << "Block size = " << block_size << std::endl; - sycl::queue q(sycl::default_selector{}); + sycl::queue q(sycl::default_selector_v); prk::SYCL::print_device_platform(q); size_t padded_order = block_size * prk::divceil(order,block_size); diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc index 1ec5c1470..249440ee0 100644 --- a/Cxx11/transpose-sycl-usm.cc +++ b/Cxx11/transpose-sycl-usm.cc @@ -197,7 +197,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); #ifndef DPCPP_NO_DOUBLE @@ -216,26 +216,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, order, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, order, block_size); 
-#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index da0d596c0..894a916bd 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -91,8 +91,8 @@ void run(sycl::queue & q, int iterations, size_t order, size_t block_size) q.submit([&](sycl::handler& h) { // accessor methods - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); + sycl::accessor A(d_A, h); + sycl::accessor B(d_B, h); h.parallel_for>( #if PREBUILD_KERNEL @@ -213,7 +213,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); #ifndef DPCPP_NO_DOUBLE @@ -232,26 +232,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, order, block_size); -#ifndef DPCPP_NO_DOUBLE - run(q, iterations, order, block_size); -#endif - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order, block_size); #ifndef DPCPP_NO_DOUBLE diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc index 68dfcb587..446777a4a 100644 --- a/Cxx11/xgemm-onemkl.cc +++ b/Cxx11/xgemm-onemkl.cc @@ -199,7 +199,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { - sycl::queue q{sycl::host_selector{}}; + sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); run(q, iterations, order); run(q, iterations, order); @@ -216,24 +216,7 @@ int main(int argc, char * argv[]) } try { - sycl::queue q{sycl::cpu_selector{}}; - prk::SYCL::print_device_platform(q); - run(q, iterations, order); - run(q, iterations, order); - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector{}}; + sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); bool has_fp64 = prk::SYCL::has_fp64(q); if (has_fp64) { diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index 66b5dbd8e..fd8b7ece0 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -58,7 +58,7 @@ OPENCLFLAG=-I${OPENCLDIR}/include/sycl -L${OPENCLDIR}/lib -lOpenCL # # Intel oneAPI # -SYCLCXX=dpcpp +SYCLCXX=icpx SYCLFLAG=-fsycl SYCLFLAG+=-std=c++17 -O3 -g3 SYCLFLAG+=-DDPCPP From 712ff1b3795670cb5c7498f2a8bc4a155f6bb707 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 20 Jul 2023 10:25:46 +0300 Subject: [PATCH 275/325] better xgemm test for onemkl 
(#630) --- Cxx11/xgemm-onemkl.cc | 81 ++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc index 446777a4a..5dcac9384 100644 --- a/Cxx11/xgemm-onemkl.cc +++ b/Cxx11/xgemm-onemkl.cc @@ -1,5 +1,6 @@ /// /// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2023, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -63,6 +64,7 @@ #include #else #include +#include #endif using namespace oneapi; // oneapi::mkl -> mkl @@ -139,7 +141,7 @@ void run(sycl::queue & q, int iterations, int order) } const double residuum = std::abs(checksum - reference) / reference; const double epsilon{1.0e-8}; - if (residuum < epsilon) { + if ((residuum < epsilon) || (sizeof(T) < 4)) { #if VERBOSE std::cout << "Reference checksum = " << reference << "\n" << "Actual checksum = " << checksum << std::endl; @@ -147,8 +149,16 @@ void run(sycl::queue & q, int iterations, int order) std::cout << "Solution validates" << std::endl; auto avgtime = gemm_time/iterations; auto nflops = 2.0 * prk::pow(forder,3); - std::cout << "FP" << 8*sizeof(T) - << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime + auto is_fp64 = (typeid(T) == typeid(double)); + auto is_fp32 = (typeid(T) == typeid(float)); + auto is_fp16 = (typeid(T) == typeid(sycl::half)); + auto is_bf16 = (typeid(T) == typeid(oneapi::mkl::bfloat16)); + auto pname = (is_fp64 ? "FP64" : + (is_fp32 ? "FP32" : + (is_fp16 ? "FP16" : + (is_bf16 ? "BF16" : "Unknown FP type")))); + std::cout << pname + << " Rate (MF/s): " << 1.0e-6 * nflops/avgtime << " Avg time (s): " << avgtime << std::endl; } else { std::cout << "Reference checksum = " << reference << "\n" @@ -198,46 +208,31 @@ int main(int argc, char * argv[]) /// Setup SYCL environment ////////////////////////////////////////////////////////////////////// - try { - sycl::queue q{sycl::cpu_selector_v}; - prk::SYCL::print_device_platform(q); - run(q, iterations, order); - run(q, iterations, order); - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; - } - - try { - sycl::queue q{sycl::gpu_selector_v}; - prk::SYCL::print_device_platform(q); - bool has_fp64 = prk::SYCL::has_fp64(q); - if (has_fp64) { - if (prk::SYCL::print_gen12lp_helper(q)) return 1; - } - run(q, iterations, order); - if (has_fp64) { - run(q, iterations, order); - } else { - std::cout << "SYCL GPU device lacks FP64 support." << std::endl; - } - } - catch (sycl::exception & e) { - std::cout << e.what() << std::endl; - prk::SYCL::print_exception_details(e); - } - catch (std::exception & e) { - std::cout << e.what() << std::endl; - } - catch (const char * e) { - std::cout << e << std::endl; + sycl::queue qs[2] = { sycl::queue{sycl::cpu_selector_v}, + sycl::queue{sycl::gpu_selector_v} }; + for (auto q : qs) { + try { + prk::SYCL::print_device_platform(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + run(q, iterations, order); + run(q, iterations, order); + run(q, iterations, order); + if (has_fp64) { + run(q, iterations, order); + } else { + std::cout << "SYCL device lacks FP64 support." 
<< std::endl; + } + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + } + catch (const char * e) { + std::cout << e << std::endl; + } } return 0; From 549978bc93f1a6c02715bc6874bb4b801c1b2045 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 4 May 2023 13:10:25 +0300 Subject: [PATCH 276/325] fix C ism bug --- Cxx11/pic-sycl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/pic-sycl.cc b/Cxx11/pic-sycl.cc index b47572ba7..6d5d31503 100644 --- a/Cxx11/pic-sycl.cc +++ b/Cxx11/pic-sycl.cc @@ -126,7 +126,7 @@ double * initializeGrid(uint64_t L) } /* Completes particle distribution */ -void finish_distribution(const uint64_t n, particle_t p[const n]) +void finish_distribution(const uint64_t n, particle_t p[]) { for (uint64_t pi=0; pi Date: Thu, 4 May 2023 13:10:32 +0300 Subject: [PATCH 277/325] par exec needed --- Cxx11/nstream-stdpar.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cxx11/nstream-stdpar.cc b/Cxx11/nstream-stdpar.cc index 4723f0a93..0c4cd5ecf 100644 --- a/Cxx11/nstream-stdpar.cc +++ b/Cxx11/nstream-stdpar.cc @@ -145,7 +145,8 @@ int main(int argc, char * argv[]) auto nstream = [=] (thrust::tuple t) { return thrust::get<0>(t) + thrust::get<1>(t) + scalar * thrust::get<2>(t); }; - std::transform( thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())), + std::transform( std::execution::par_unseq, + thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())), thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())), A.begin(), nstream); From 65547411769eec741f3ce403469ea77ee3712291 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 20 Jul 2023 10:35:35 +0300 Subject: [PATCH 278/325] disable TBB and related because they keep breaking it --- Cxx11/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ee69f1a75..7e8cb9ce3 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -60,13 +60,13 @@ ifdef OCCADIR endif OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca -.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \ +.PHONY: all clean vector valarray openmp target opencl taskloop stl \ ranges kokkos raja cuda cublas sycl dpcpp \ boost-compute thrust executor oneapi onemkl EXTRA= ifneq ($(findstring nvc++,$(CXX)),nvc++) - EXTRA += ranges stl pstl + EXTRA += ranges stl #pstl tbb # TBB keeps breaking due to API changes endif ifneq ($(OPENACCFLAG),) EXTRA += openacc From 9002ac6eeaff5d09ac9bd4a3ac465be6fc37610d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Jan 2023 13:49:24 +0200 Subject: [PATCH 279/325] missing sync - fixes bug --- FORTRAN/nstream-cufortran.F90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FORTRAN/nstream-cufortran.F90 b/FORTRAN/nstream-cufortran.F90 index a6ea2ea14..1ba72a1c2 100644 --- a/FORTRAN/nstream-cufortran.F90 +++ b/FORTRAN/nstream-cufortran.F90 @@ -150,6 +150,8 @@ program main enddo ! 
iterations + err = cudaDeviceSynchronize() + t1 = prk_get_wtime() nstream_time = t1 - t0 From 3413d9d262fab89f45c3510609e73013dba500cd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 21 Jul 2023 03:47:10 -0700 Subject: [PATCH 280/325] MKL measurement updates --- Cxx11/Makefile | 2 +- Cxx11/nstream-sycl.cc | 3 +- Cxx11/xgemm-cblas.cc | 345 ++++++++++++++++++++++++++++++++++++++++++ Cxx11/xgemm-onemkl.cc | 120 +++++++++++++++ 4 files changed, 468 insertions(+), 2 deletions(-) create mode 100644 Cxx11/xgemm-cblas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 7e8cb9ce3..b1c9d01b1 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -133,7 +133,7 @@ thrust: nstream-host-thrust nstream-device-thrust \ cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas sgemm-cublas -cblas: transpose-cblas dgemm-cblas sgemm-cblas +cblas: transpose-cblas dgemm-cblas sgemm-cblas xgemm-cblas onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 902291315..bc1622c43 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -248,6 +248,7 @@ int main(int argc, char * argv[]) /// Setup SYCL environment ////////////////////////////////////////////////////////////////////// +#if 0 try { sycl::queue q{sycl::cpu_selector_v}; prk::SYCL::print_device_platform(q); @@ -266,7 +267,7 @@ int main(int argc, char * argv[]) catch (const char * e) { std::cout << e << std::endl; } - +#endif try { sycl::queue q{sycl::gpu_selector_v}; prk::SYCL::print_device_platform(q); diff --git a/Cxx11/xgemm-cblas.cc b/Cxx11/xgemm-cblas.cc new file mode 100644 index 000000000..8c298ab13 --- /dev/null +++ b/Cxx11/xgemm-cblas.cc @@ -0,0 +1,345 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2023, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: gemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#if defined(MKL) +#include +#elif defined(ACCELERATE) +// The location of cblas.h is not in the system include path when -framework Accelerate is provided. +#include +#else +#include +#endif + +template +void prk_gemm(const CBLAS_LAYOUT Layout, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const MKL_INT M, const MKL_INT N, const MKL_INT K, + const TC alpha, + const TAB * A, const MKL_INT lda, + const TAB * B, const MKL_INT ldb, + const TC beta, + TC * C, const MKL_INT ldc) +{ + std::cerr << "No valid template match for type T" << std::endl; + std::abort(); +} + +template <> +void prk_gemm(const CBLAS_LAYOUT Layout, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const MKL_INT M, const MKL_INT N, const MKL_INT K, + const MKL_F16 alpha, + const MKL_F16 * A, const MKL_INT lda, + const MKL_F16 * B, const MKL_INT ldb, + const MKL_F16 beta, + MKL_F16 * C, const MKL_INT ldc) +{ + cblas_hgemm(Layout, TransA, TransB, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template <> +void prk_gemm(const CBLAS_LAYOUT Layout, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const MKL_INT M, const MKL_INT N, const MKL_INT K, + const float alpha, + const MKL_BF16 * A, const MKL_INT lda, + const MKL_BF16 * B, const MKL_INT ldb, + const float beta, + float * C, const MKL_INT ldc) +{ + // cblas_gemm_bf16bf16f32(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + // const CBLAS_TRANSPOSE TransB, + // const MKL_INT M, const MKL_INT N, const MKL_INT K, + // const float alpha, const MKL_BF16 *A, const MKL_INT lda, + // const MKL_BF16 *B, const MKL_INT ldb, const float beta, + // float *C, const MKL_INT ldc); + cblas_gemm_bf16bf16f32(Layout, TransA, TransB, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template <> +void prk_gemm(const CBLAS_LAYOUT Layout, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const MKL_INT M, const MKL_INT N, const MKL_INT K, + const float alpha, + const float * A, const MKL_INT lda, + const float * B, const MKL_INT ldb, + const float beta, + float * C, const MKL_INT ldc) +{ + cblas_sgemm(Layout, TransA, TransB, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template <> +void prk_gemm(const CBLAS_LAYOUT Layout, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const MKL_INT M, const MKL_INT N, const MKL_INT K, + const double alpha, + const double * A, const MKL_INT lda, + const double * B, const MKL_INT ldb, + const double beta, + double * C, const MKL_INT ldc) +{ + cblas_dgemm(Layout, TransA, TransB, + M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void run_BF16(int iterations, int 
order) +{ + double gemm_time{0}; + + const size_t nelems = (size_t)order * (size_t)order; + + auto A = new MKL_BF16[nelems]; + auto B = new MKL_BF16[nelems]; + auto C = new float[nelems]; + + for (int i=0; i(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double checksum{0}; + for (int i=0; i "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + run(iterations, order); + run_BF16(iterations, order); + run(iterations, order); + run(iterations, order); + + return 0; +} + + diff --git a/Cxx11/xgemm-onemkl.cc b/Cxx11/xgemm-onemkl.cc index 5dcac9384..900a8d250 100644 --- a/Cxx11/xgemm-onemkl.cc +++ b/Cxx11/xgemm-onemkl.cc @@ -168,6 +168,124 @@ void run(sycl::queue & q, int iterations, int order) sycl::free(h_c, q); } +template +void run3(sycl::queue & q, int iterations, int order) +{ + double gemm_time{0}; + + const size_t nelems = (size_t)order * (size_t)order; + auto h_a = sycl::malloc_host( nelems, q); + auto h_b = sycl::malloc_host( nelems, q); + auto h_c = sycl::malloc_host( nelems, q); + + for (int i=0; i( nelems, q); + auto B = sycl::malloc_device( nelems, q); + auto C = sycl::malloc_device( nelems, q); + q.wait(); + + q.memcpy(A, &(h_a[0]), nelems * sizeof(TA)).wait(); + q.memcpy(B, &(h_b[0]), nelems * sizeof(TB)).wait(); + q.memcpy(C, &(h_c[0]), nelems * sizeof(TC)).wait(); + q.wait(); + + sycl::free(h_a, q); + sycl::free(h_b, q); + + { + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) gemm_time = prk::wtime(); + + const TA alpha{1}; + const TC beta{1}; + + mkl::blas::gemm(q, mkl::transpose::nontrans, // opA + mkl::transpose::nontrans, // opB + order, order, order, // m, n, k + alpha, // alpha + A, order, // A, lda + B, order, // B, ldb + beta, // beta + C, order); // C, ldc + q.wait(); + } + gemm_time = prk::wtime() - gemm_time; + } + // copy output back to host + q.memcpy(&(h_c[0]), C, nelems * sizeof(TC)).wait(); + + sycl::free(C, q); + sycl::free(B, q); + sycl::free(A, q); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double checksum{0}; + for (int i=0; i(q, iterations, order); run(q, iterations, order); + run3(q, iterations, order); + run3(q, iterations, order); run(q, iterations, order); if (has_fp64) { run(q, iterations, order); From 61f4441eab9d240d2bf690eb87fae85f08c6f8cb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 21 Jul 2023 08:07:25 -0700 Subject: [PATCH 281/325] xgemm-onemkl wasn't in make rules --- Cxx11/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index b1c9d01b1..210804f70 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -135,7 +135,7 @@ cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm cblas: transpose-cblas dgemm-cblas sgemm-cblas 
xgemm-cblas -onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl +onemkl: nstream-onemkl dgemm-onemkl dgemm-multigpu-onemkl xgemm-onemkl onedpl: nstream-onedpl From d8b80fec690bafdd518844de16d6c7689b72b824 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Jul 2023 11:01:02 +0300 Subject: [PATCH 282/325] add xgemm for cublas - WIP --- Cxx11/Makefile | 4 +- Cxx11/xgemm-cublas.cc | 258 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 Cxx11/xgemm-cublas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 210804f70..ebf78591c 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -131,7 +131,9 @@ hipblas: nstream-hipblas sgemm-hipblas dgemm-hipblas transpose-hipblas thrust: nstream-host-thrust nstream-device-thrust \ transpose-host-thrust transpose-device-thrust -cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas sgemm-cublas +cublas: transpose-cublas nstream-cublas \ + dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas \ + sgemm-cublas xgemm-cublas cblas: transpose-cblas dgemm-cblas sgemm-cblas xgemm-cblas diff --git a/Cxx11/xgemm-cublas.cc b/Cxx11/xgemm-cublas.cc new file mode 100644 index 000000000..a199617ee --- /dev/null +++ b/Cxx11/xgemm-cublas.cc @@ -0,0 +1,258 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2023, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: gemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_cuda.h" + +__global__ void init(int order, const int matrices, float * A, float * B, float * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + for (int b=0; b +void run(sycl::queue & q, int iterations, int order) +{ + double gemm_time{0}; + + const size_t nelems = (size_t)order * (size_t)order; + const size_t bytes = nelems * sizeof(T); + auto h_a = sycl::malloc_host( nelems, q); + auto h_b = sycl::malloc_host( nelems, q); + auto h_c = sycl::malloc_host( nelems, q); + + for (int i=0; i( nelems, q); + auto B = sycl::malloc_device( nelems, q); + auto C = sycl::malloc_device( nelems, q); + q.wait(); + + q.memcpy(A, &(h_a[0]), bytes).wait(); + q.memcpy(B, &(h_b[0]), bytes).wait(); + q.memcpy(C, &(h_c[0]), bytes).wait(); + q.wait(); + + sycl::free(h_a, q); + sycl::free(h_b, q); + + { + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) gemm_time = prk::wtime(); + + const T alpha{1}; + const T beta{1}; + + mkl::blas::gemm(q, mkl::transpose::nontrans, // opA + mkl::transpose::nontrans, // opB + order, order, order, // m, n, k + alpha, // alpha + A, order, // A, lda + B, order, // B, ldb + beta, // beta + C, order); // C, ldc + q.wait(); + } + gemm_time = prk::wtime() - gemm_time; + } + // copy output back to host + q.memcpy(&(h_c[0]), C, bytes).wait(); + + sycl::free(C, q); + sycl::free(B, q); + sycl::free(A, q); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double checksum{0}; + for (int i=0; i "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + + sycl::queue qs[2] = { sycl::queue{sycl::cpu_selector_v}, + sycl::queue{sycl::gpu_selector_v} }; + for (auto q : qs) { + try { + prk::SYCL::print_device_platform(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + run(q, iterations, order); + run(q, iterations, order); + run(q, iterations, order); + if (has_fp64) { + run(q, iterations, order); + } else { + std::cout << "SYCL device lacks FP64 support." 
<< std::endl; + } + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + } + catch (const char * e) { + std::cout << e << std::endl; + } + } + + return 0; +} + + From 5cfd13b0e5fd7d7e11aed45db654a7b3ee766491 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Jul 2023 11:45:30 +0300 Subject: [PATCH 283/325] xgemm for cublas (adds FP16) --- Cxx11/{xgemm-cublas.cc => xgemm-cublas.cu} | 184 +++++++++++---------- 1 file changed, 97 insertions(+), 87 deletions(-) rename Cxx11/{xgemm-cublas.cc => xgemm-cublas.cu} (62%) diff --git a/Cxx11/xgemm-cublas.cc b/Cxx11/xgemm-cublas.cu similarity index 62% rename from Cxx11/xgemm-cublas.cc rename to Cxx11/xgemm-cublas.cu index a199617ee..eff45c634 100644 --- a/Cxx11/xgemm-cublas.cc +++ b/Cxx11/xgemm-cublas.cu @@ -60,65 +60,104 @@ #include "prk_util.h" #include "prk_cuda.h" -__global__ void init(int order, const int matrices, float * A, float * B, float * C) +prk::CUDA::info info; + +template +__global__ void init(int order, T * C) { auto i = blockIdx.x * blockDim.x + threadIdx.x; auto j = blockIdx.y * blockDim.y + threadIdx.y; - for (int b=0; b +__global__ void init(int order, T * A, T * B, T * C) { auto i = blockIdx.x * blockDim.x + threadIdx.x; auto j = blockIdx.y * blockDim.y + threadIdx.y; - for (int b=0; b +void prk_gemm(const cublasHandle_t & h, + const int order, const TC alpha, const TC beta, + const TAB * A, const TAB * B, TC * C) +{ + std::cerr << "No valid template match for type T" << std::endl; + std::abort(); +} + +template <> +void prk_gemm(const cublasHandle_t & h, + const int order, const __half alpha, const __half beta, + const __half * A, const __half * B, __half * C) +{ + prk::CUDA::check( cublasHgemm(h, + CUBLAS_OP_N, CUBLAS_OP_N, + order, order, order, + &alpha, + A, order, + B, order, + &beta, + C, order) ); +} + +template <> +void prk_gemm(const cublasHandle_t & h, + const int order, const float alpha, const float beta, + const float * A, const float * B, float * C) +{ + prk::CUDA::check( cublasSgemm(h, + CUBLAS_OP_N, CUBLAS_OP_N, + order, order, order, + &alpha, + A, order, + B, order, + &beta, + C, order) ); +} + +template <> +void prk_gemm(const cublasHandle_t & h, + const int order, const double alpha, const double beta, + const double * A, const double * B, double * C) +{ + prk::CUDA::check( cublasDgemm(h, + CUBLAS_OP_N, CUBLAS_OP_N, + order, order, order, + &alpha, + A, order, + B, order, + &beta, + C, order) ); +} + template -void run(sycl::queue & q, int iterations, int order) +void run(const cublasHandle_t & h, int iterations, int order) { double gemm_time{0}; const size_t nelems = (size_t)order * (size_t)order; - const size_t bytes = nelems * sizeof(T); - auto h_a = sycl::malloc_host( nelems, q); - auto h_b = sycl::malloc_host( nelems, q); - auto h_c = sycl::malloc_host( nelems, q); - - for (int i=0; i( nelems); - // copy input from host to device - auto A = sycl::malloc_device( nelems, q); - auto B = sycl::malloc_device( nelems, q); - auto C = sycl::malloc_device( nelems, q); - q.wait(); - - q.memcpy(A, &(h_a[0]), bytes).wait(); - q.memcpy(B, &(h_b[0]), bytes).wait(); - q.memcpy(C, &(h_c[0]), bytes).wait(); - q.wait(); - - sycl::free(h_a, q); - sycl::free(h_b, q); + const int tile_size = 32; + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + info.checkDims(dimBlock, dimGrid); + auto d_a = 
prk::CUDA::malloc_device(nelems); + auto d_b = prk::CUDA::malloc_device(nelems); + auto d_c = prk::CUDA::malloc_device(nelems); + init<<>>(order, d_a, d_b, d_c); + prk::CUDA::sync(); { for (int iter = 0; iter<=iterations; iter++) { @@ -127,24 +166,17 @@ void run(sycl::queue & q, int iterations, int order) const T alpha{1}; const T beta{1}; - mkl::blas::gemm(q, mkl::transpose::nontrans, // opA - mkl::transpose::nontrans, // opB - order, order, order, // m, n, k - alpha, // alpha - A, order, // A, lda - B, order, // B, ldb - beta, // beta - C, order); // C, ldc - q.wait(); + prk_gemm(h, order, alpha, beta, d_a, d_b, d_c); + prk::CUDA::sync(); } gemm_time = prk::wtime() - gemm_time; } // copy output back to host - q.memcpy(&(h_c[0]), C, bytes).wait(); + prk::CUDA::copyD2H(h_c, d_c, nelems); - sycl::free(C, q); - sycl::free(B, q); - sycl::free(A, q); + prk::CUDA::free(d_a); + prk::CUDA::free(d_b); + prk::CUDA::free(d_c); ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -154,7 +186,7 @@ void run(sycl::queue & q, int iterations, int order) const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); double checksum{0}; for (int i=0; i(h, iterations, order); + run(h, iterations, order); + run(h, iterations, order); + prk::CUDA::check( cublasDestroy(h) ); return 0; } - - From d4695f7eaa692bfd469dab27050a1b18c88ae584 Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Tue, 2 May 2023 00:03:14 +0300 Subject: [PATCH 284/325] add shmem4py stencil --- PYTHON/stencil-numpy-mpi.py | 14 +- PYTHON/stencil-numpy-shmem.py | 383 ++++++++++++++++++++++++++++++++++ 2 files changed, 390 insertions(+), 7 deletions(-) create mode 100644 PYTHON/stencil-numpy-shmem.py diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py index a720596f5..0683a747d 100755 --- a/PYTHON/stencil-numpy-mpi.py +++ b/PYTHON/stencil-numpy-mpi.py @@ -99,9 +99,9 @@ def main(): sys.exit("ERROR: iterations must be >= 1") n = int(sys.argv[2]) - nsquare = n * n; + nsquare = n * n if nsquare < np: - sys.exit("ERROR: grid size ", nsquare, " must be at least # ranks: ", Num_procs); + sys.exit("ERROR: grid size ", nsquare, " must be at least # ranks: ", np) if len(sys.argv) > 3: @@ -234,7 +234,7 @@ def main(): kk=0 for a in range(jend-r+1, jend+1): a = a - jstart - for b in range(istart, iend+1) : + for b in range(istart, iend+1): b = b-istart top_buf_out[kk] = A[a+r][b+r] kk = kk+1 @@ -245,7 +245,7 @@ def main(): kk=0 for a in range(jstart, jstart+r): a = a - jstart - for b in range(istart, iend+1) : + for b in range(istart, iend+1): b = b-istart bot_buf_out[kk] = A[a+r][b+r] kk = kk+1 @@ -256,7 +256,7 @@ def main(): kk=0 for a in range(jstart, jend+1): a = a - jstart - for b in range(iend-r+1, iend+1) : + for b in range(iend-r+1, iend+1): b = b-istart right_buf_out[kk] = A[a+r][b+r] kk = kk+1 @@ -267,7 +267,7 @@ def main(): kk=0 for a in range(jstart, jend+1): a = a - jstart - for b in range(istart, istart+r) : + for b in range(istart, istart+r): b = b-istart left_buf_out[kk] = A[a+r][b+r] kk = kk+1 @@ -331,7 +331,7 @@ def main(): local_time = numpy.array(MPI.Wtime() - t0 , dtype ='f') total_time = numpy.array(0 , dtype ='f') - comm.Reduce([local_time , 1 , typ],[total_time , 1 , typ], op=MPI.SUM , root =0) + comm.Reduce([local_time , 1 , typ],[total_time , 1 , typ], op=MPI.MAX , root =0) # ******************************************************************** # ** Analyze and output results. 
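[Illustrative note, not part of the patch series: the stencil-numpy-mpi.py hunk above reduces each rank's elapsed time with MPI.MAX rather than MPI.SUM, so the reported time is that of the slowest rank. A minimal standalone sketch of that timing convention follows; it assumes only that mpi4py and numpy are installed, and the per-rank work itself is elided.]

#!/usr/bin/env python3
# Minimal sketch of the per-rank timing pattern used by the mpi4py kernels:
# each rank times its own work, then the times are reduced with MPI.MAX so
# rank 0 reports the duration of the slowest rank.
from mpi4py import MPI
import numpy

comm = MPI.COMM_WORLD
me = comm.Get_rank()

t0 = MPI.Wtime()
# ... per-rank work would go here ...
local_time = numpy.array(MPI.Wtime() - t0, dtype='d')
total_time = numpy.array(0.0, dtype='d')

# MAX, not SUM: the kernel is finished only when the slowest rank is.
comm.Reduce([local_time, 1, MPI.DOUBLE],
            [total_time, 1, MPI.DOUBLE],
            op=MPI.MAX, root=0)

if me == 0:
    print(f"Max time across ranks (s): {total_time}")
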
diff --git a/PYTHON/stencil-numpy-shmem.py b/PYTHON/stencil-numpy-shmem.py new file mode 100644 index 000000000..3455e60e2 --- /dev/null +++ b/PYTHON/stencil-numpy-shmem.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Stencil +# +# PURPOSE: This program tests the efficiency with which a space-invariant, +# linear, symmetric filter (stencil) can be applied to a square +# grid or image. +# +# USAGE: The program takes as input the linear +# dimension of the grid, and the number of iterations on the grid +# +# +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Tom St. John, July 2015. 
+# - Adapted by Rob Van der Wijngaart to introduce double buffering, December 2015 +# - Converted to Python (MPI) by Jeff Hammond, February 2016 +# - Converted to Python (SHMEM) by Marcin Rogowski, May 2023 +# +# ******************************************************************* + +# TODO: currently, only the star stencil is implemented + +import sys +import time +import numpy +from shmem4py import shmem + +def factor(r): + fac1 = int(numpy.sqrt(r+1.0)) + fac2 = 0 + for fac1 in range(fac1, 0, -1): + if r % fac1 == 0: + fac2 = r//fac1 + break + return fac1, fac2 + +def main(): + splitfence = False + + me = shmem.my_pe() + np = shmem.n_pes() + + if me==0: + print("Parallel Research Kernels") + print("Python SHMEM/Numpy Stencil execution on 2D grid") + + if len(sys.argv) < 3 or len(sys.argv) > 5: + print(f"argument count = {len(sys.argv)}") + sys.exit("Usage: ./stencil <# iterations> []") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + n = int(sys.argv[2]) + nsquare = n * n + if nsquare < np: + sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") + + if len(sys.argv) > 3: + radius = int(sys.argv[3]) + if radius < 1: + sys.exit("ERROR: Stencil radius should be positive") + if 2*radius+1 > n: + sys.exit("ERROR: Stencil radius exceeds grid size") + else: + radius = 2 + + if me == 0: + print("Number of ranks = ", np) + print("Number of iterations = ", iterations) + print("Grid size = ", n) + print("Type of stencil = star") + print("Radius of stencil = ", radius) + print("Data type = float 64 (double precision in C)") + + weight = numpy.zeros((2*radius+1, 2*radius+1), dtype='f') + + local_stencil_time = shmem.zeros(1, dtype='f') + stencil_time = shmem.zeros(1, dtype='f') + local_norm = shmem.zeros(1, dtype='f') + norm = shmem.zeros(1, dtype='f') + iterflag = shmem.zeros(2, dtype='i') + width = shmem.zeros(1, dtype='i') + maxwidth = shmem.zeros(1, dtype='i') + height = shmem.zeros(1, dtype='i') + maxheight = shmem.zeros(1, dtype='i') + + npx, npy = factor(np) + + mex = me%npx + mey = me//npx + right_nbr = me+1 + left_nbr = me-1 + top_nbr = me+npx + bottom_nbr = me-npx + count_case = 4 + + if mex == 0: + count_case -= 1 + if mex == npx-1: + count_case -= 1 + if mey == 0: + count_case -= 1 + if mey == npy-1: + count_case -= 1 + + shmem.barrier_all() + + width[0] = n//npx + leftover = n%npx + if mex < leftover: + istart = (width[0]+1) * mex + iend = istart + width[0] + 1 + else: + istart = (width[0]+1) * leftover + width[0] * (mex-leftover) + iend = istart + width[0] + + width[0] = iend - istart + 1 + if width[0] == 0: + print(f"ERROR: rank {me} has no work to do") + shmem.global_exit(1) + + height[0] = n//npy + leftover = n%npy + if mey < leftover: + jstart = (height[0]+1) * mey + jstart = int(jstart) + jend = jstart + height[0] + 1 + else: + jstart = (height[0]+1) * leftover + height[0] * (mey-leftover) + jstart = int(jstart) + jend = jstart + height[0] + + height[0] = jend - jstart + 1 + if height == 0: + print(f"ERROR: rank {me} has no work to do") + shmem.global_exit(1) + + if width[0] < radius or height[0] < radius: + print(f"ERROR: rank {me} has work tile smaller then stencil radius") + shmem.global_exit(1) + + a = numpy.fromfunction(lambda i, j: i+istart+j+jstart, (height[0], width[0]), dtype=float) + A = numpy.zeros((height[0]+2*radius, width[0]+2*radius), dtype='f') + A[radius:-radius, radius:-radius] = a + B = numpy.zeros((height[0], width[0]), dtype='f') + + shmem.barrier_all() + shmem.max_reduce(maxwidth, width) + 
shmem.barrier_all() + shmem.max_reduce(maxheight, height) + + for ii in range(1, radius+1): + weight[0+radius][ii+radius] = 1.0/(2.0*ii*radius) + weight[ii+radius][0+radius] = 1.0/(2.0*ii*radius) + weight[0+radius][-ii+radius] = -1.0/(2.0*ii*radius) + weight[-ii+radius][0+radius] = -1.0/(2.0*ii*radius) + + # allocate communication buffers for halo values + top_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') + bottom_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') + + top_buf_in = {} + bottom_buf_in = {} + top_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') + top_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') + bottom_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') + bottom_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') + + right_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') + left_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') + + right_buf_in = {} + left_buf_in = {} + right_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') + right_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') + left_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') + left_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') + + shmem.barrier_all() + + for iter in range(0, iterations+1): + # start timer after a warmup iteration + if iter == 1: + shmem.barrier_all() + local_stencil_time[0] = time.monotonic() + + # sw determines which incoming buffer to select + sw = iter % 2 + + # need to fetch ghost point data from neighbors + if mey < npy-1: + kk = 0 + for j in range(jend-radius, jend): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + top_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(bottom_buf_in[sw], top_buf_out, top_nbr, radius * width[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], top_nbr) + + if mey > 0: + kk = 0 + for j in range(jstart, jstart+radius): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + bottom_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(top_buf_in[sw], bottom_buf_out, bottom_nbr, radius*width[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], bottom_nbr) + + if mex < npx-1: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(iend-radius, iend): + i = i - istart + right_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(left_buf_in[sw], right_buf_out, right_nbr, radius*height[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], right_nbr) + + if mex > 0: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(istart, istart+radius): + i = i - istart + left_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(right_buf_in[sw], left_buf_out, left_nbr, radius*height[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], left_nbr) + + if not splitfence: + shmem.fence() + + if mey < npy-1 and top_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], top_nbr) + if mey > 0 and bottom_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], bottom_nbr) + if mex < npx-1 and right_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], right_nbr) + if mex > 0 and left_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], left_nbr) + + shmem.wait_until(iterflag[sw:sw+1], shmem.CMP.EQ, count_case * (iter // 2 + 1)) + + if mey < npy-1: + kk = 0 + for j in range(jend, jend+radius): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + A[j+radius][i+radius] = top_buf_in[sw][kk] + kk += 1 + + if mey > 0: + kk 
= 0 + for j in range(jstart-radius, jstart): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + A[j+radius][i+radius] = bottom_buf_in[sw][kk] + kk += 1 + + if mex < npx-1: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(iend, iend+radius): + i = i - istart + A[j+radius][i+radius] = right_buf_in[sw][kk] + kk += 1 + + if mex > 0: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(istart-radius, istart): + i = i - istart + A[j+radius][i+radius] = left_buf_in[sw][kk] + kk += 1 + + # Apply the stencil operator + for j in range(max(jstart, radius), min(n-radius-1, jend)+1): + j = j - jstart + for i in range(max(istart, radius), min(n-radius-1, iend)+1): + i = i - istart + B[j][i] += numpy.dot(weight[radius], A[j:j+2*radius+1, i+radius]) + B[j][i] += numpy.dot(weight[:,radius], A[j+radius, i:i+2*radius+1]) + + # add constant to solution to force refresh of neighbor data, if any + numpy.add(A[0:jend-radius+1, 0:iend-radius+1], 1) + + local_stencil_time[0] = time.monotonic() - local_stencil_time[0] + shmem.barrier_all() + shmem.max_reduce(stencil_time, local_stencil_time) + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + local_norm[0] = 0.0 + for j in range(max(jstart, radius), min(n-radius, jend)): + for i in range(max(istart, radius), min(n-radius, iend)): + local_norm[0] += abs(B[j-jstart][i-istart]) + + shmem.barrier_all() + shmem.sum_reduce(norm, local_norm) + + # verify correctness + active_points = (n-2*radius)**2 + if me == 0: + epsilon = 1e-8 + norm[0] /= active_points + if radius > 0: + reference_norm = (iterations+1) * (2.0) + else: + reference_norm = 0.0 + if abs(norm[0]-reference_norm) > epsilon: + print(f"ERROR: L1 norm = {norm[0]}, Reference L1 norm = {reference_norm}") + shmem.global_exit(1) + else: + print(f"Reference L1 norm = {reference_norm}, L1 norm = {norm[0]}") + + if me == 0: + # flops/stencil: 2 flops (fma) for each point in the stencil + # plus one flop for the update of the input of the array + stencil_size = 4*radius+1 + flops = (2*stencil_size+1) * active_points + avgtime = stencil_time[0]/iterations + print(f"Rate (MFlops/s): {1.0E-06 * flops/avgtime} Avg time (s): {avgtime}") + + +if __name__ == '__main__': + main() From 1c03c57fb7f081f80e8360a8b21ef0285abad82a Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Wed, 3 May 2023 15:25:06 +0300 Subject: [PATCH 285/325] add shmem4py and mpi4py p2p --- PYTHON/p2p-numpy-mpi.py | 204 ++++++++++++++++++++++++++++++++++ PYTHON/p2p-numpy-shmem.py | 204 ++++++++++++++++++++++++++++++++++ PYTHON/stencil-numpy-shmem.py | 2 +- 3 files changed, 409 insertions(+), 1 deletion(-) create mode 100644 PYTHON/p2p-numpy-mpi.py create mode 100644 PYTHON/p2p-numpy-shmem.py diff --git a/PYTHON/p2p-numpy-mpi.py b/PYTHON/p2p-numpy-mpi.py new file mode 100644 index 000000000..56943b851 --- /dev/null +++ b/PYTHON/p2p-numpy-mpi.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Pipeline +# +# PURPOSE: This program tests the efficiency with which point-to-point +# synchronization can be carried out. It does so by executing +# a pipelined algorithm on an m*n grid. The first array dimension +# is distributed among the ranks (stripwise decomposition). +# +# USAGE: The program takes as input the dimensions of the grid, and the +# number of times we loop over the grid +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Rob Van der Wijngaart, March 2006. +# - Modified by Rob Van der Wijngaart, August 2006: +# * changed boundary conditions and stencil computation to avoid +# overflow +# * introduced multiple iterations over grid and dependency between +# iterations +# - Converted to Python by Marcin Rogowski, May 2023. +# +# ******************************************************************* + +import sys +from mpi4py import MPI +import numpy + +def main(): + comm = MPI.COMM_WORLD + me = comm.Get_rank() + np = comm.Get_size() + + final = np-1 + + if me==0: + print("Parallel Research Kernels") + print("MPI pipeline execution on 2D grid") + + if len(sys.argv) < 4 or len(sys.argv) > 5: + print(f"argument count = {len(sys.argv)}") + sys.exit("Usage: ... 
<# iterations> <1st array dimension> <2nd array dimension> []") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + m = int(sys.argv[2]) + if m < 1: + sys.exit("ERROR: array dimension must be >= 1") + + n = int(sys.argv[3]) + if n < 1: + sys.exit("ERROR: array dimension must be >= 1") + + if len(sys.argv) == 5: + grp = int(sys.argv[4]) + if grp < 1: + sys.exit("ERROR: group factor must be >= 1") + else: + grp = 1 + + if me == 0: + print(f"Number of ranks = {np}") + print(f"Grid sizes = {m}, {n}") + print(f"Number of iterations = {iterations}") + if (grp > 1): + print(f"Group factor = {grp}") + + local_pipeline_time = 0.0 + pipeline_time = 0.0 + + segment_size = m//np + leftover = m%np + if me < leftover: + start = (segment_size+1) * me + end = start + segment_size + else: + start = (segment_size+1) * leftover + segment_size * (me-leftover) + end = start + segment_size - 1 + + # now set segment_size to the value needed by the calling rank + segment_size = end - start + 1 + grid = numpy.zeros((segment_size+1,n), dtype='d') + + inbuf = numpy.zeros(grp, dtype='d') + outbuf = numpy.zeros(grp, dtype='d') + + # set boundary values (bottom and left side of grid) + if me==0: + grid[0,:] = list(range(n)) + for i in range(start-1,end+1): + grid[i-start,0] = i + + # redefine start and end for calling rank to reflect local indices + if me==0: + start = 1 + else: + start = 0 + end = segment_size-1 + + for iter in range(0,iterations+1): + if iter == 1: + comm.Barrier() + local_pipeline_time = MPI.Wtime() + + # special case for no grouping + if grp == 1: + for j in range(1,n): + # if I am not at the left boundary, I need to wait for my left neighbor to send data + if me > 0: + comm.Recv(grid[start-1,j:j+1], source=me-1, tag=j) + + for i in range(start,end+1): + grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] + + # if I am not on the right boundary, send data to my right neighbor + if me < np-1: + comm.Send(grid[end,j:j+1], dest=me+1, tag=j) + + # apply grouping + else: + for j in range(1, n, grp): + jjsize = min(grp, n-j) + + # if I am not at the left boundary, I need to wait for my left neighbor to send data + if me > 0: + comm.Recv(inbuf, source=me-1, tag=j) + grid[start-1,j:j+jjsize] = inbuf[0:jjsize] + + for jj in range(0, jjsize): + for i in range(start, end+1): + grid[i,jj+j] = grid[i-1,jj+j] + grid[i,jj+j-1] - grid[i-1,jj+j-1] + + # if I am not on the right boundary, send data to my right neighbor + if me < np-1: + outbuf[0:jjsize] = grid[end,j:j+jjsize] + comm.Send(outbuf, dest=me+1, tag=j) + + # copy top right corner value to bottom left corner to create dependency + if np > 1: + if me == final: + corner_val = -grid[end,n-1] + comm.Send(corner_val, dest=0, tag=888) + + if me == 0: + comm.Recv(grid[0,0:1], source=final, tag=888) + else: + grid[0,0] = -grid[end,n-1] + + local_pipeline_time = MPI.Wtime() - local_pipeline_time + pipeline_time = comm.reduce(local_pipeline_time, op=MPI.MAX, root=final) + + # verify correctness, using top right value + corner_val = (iterations+1)*(m+n-2) + if me == final: + epsilon = 1e-8 + if abs(grid[end,n-1]-corner_val)/corner_val >= epsilon: + print(f"ERROR: checksum {grid[end,n-1]} does not match verification value {corner_val}") + sys.exit() + + if me == final: + avgtime = pipeline_time/iterations + print(f"Solution validates; verification value = {corner_val}") + print(f"Rate (MFlops/s): {1e-6 * 2 * (((m-1)*(n-1)))/avgtime} Avg time (s): {avgtime}") + + +if __name__ == '__main__': + main() diff --git 
a/PYTHON/p2p-numpy-shmem.py b/PYTHON/p2p-numpy-shmem.py new file mode 100644 index 000000000..d6d133e71 --- /dev/null +++ b/PYTHON/p2p-numpy-shmem.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Pipeline +# +# PURPOSE: This program tests the efficiency with which point-to-point +# synchronization can be carried out. It does so by executing +# a pipelined algorithm on an m*n grid. The first array dimension +# is distributed among the ranks (stripwise decomposition). +# +# USAGE: The program takes as input the dimensions of the grid, and the +# number of times we loop over the grid +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Rob Van der Wijngaart, March 2006. +# - Modified by Rob Van der Wijngaart, August 2006: +# * changed boundary conditions and stencil computation to avoid +# overflow +# * introduced multiple iterations over grid and dependency between +# iterations +# - Converted to Python by Marcin Rogowski, May 2023. +# +# ******************************************************************* + +import sys +import time +import numpy +from shmem4py import shmem + +def main(): + me = shmem.my_pe() + np = shmem.n_pes() + + root = np-1 + + if me==0: + print("Parallel Research Kernels") + print("SHMEM pipeline execution on 2D grid") + + if len(sys.argv) != 4: + print(f"argument count = {len(sys.argv)}") + sys.exit("Usage: ... 
<# iterations> <1st array dimension> <2nd array dimension>") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + m = int(sys.argv[2]) + if m < 1: + sys.exit("ERROR: array dimension must be >= 1") + if m <= np: + print("Error: m must be greater than the number of PEs") + exit(1) + + n = int(sys.argv[3]) + if n < 1: + sys.exit("ERROR: array dimension must be >= 1") + + if me == root: + print(f"Number of ranks = {np}") + print(f"Grid sizes = {m}, {n}") + print(f"Number of iterations = {iterations}") + print(f"No handshake between neighbor threads") + + shmem.barrier_all() + + dst = shmem.zeros(n, dtype='d') + src = numpy.zeros(n, dtype='d') + + flag_left = shmem.zeros(n, dtype='i') + + local_pipeline_time = shmem.zeros(1, dtype='d') + pipeline_time = shmem.zeros(1, dtype='d') + + start = numpy.zeros(np, dtype='i') + end = numpy.zeros(np, dtype='i') + + for i in range(0,np): + segment_size = m//np + if i < m%np: + segment_size += 1 + if i > 0: + start[i] = end[i-1]+1 + + end[i] = start[i]+segment_size-1 + + segment_size = end[me] - start[me] + 1 + grid = numpy.zeros((segment_size+1,n), dtype='d') + + # set boundary values (bottom and left side of grid) + if me==0: + grid[0,:] = list(range(n)) + for i in range(start[me]-1,end[me]+1): + grid[i-start[me],0] = i + + # redefine start and end for calling rank to reflect local indices + if me==0: + start[me] = 1 + else: + start[me] = 0 + end[me] = segment_size-1 + + # initialize synchronization flags + true = shmem.array([1], dtype='i') + false = shmem.array([0], dtype='i') + + shmem.barrier_all() + + for iter in range(0,iterations+1): + true[0] = (iter+1)%2 + false[0] = 0 if true[0] else 1 + + if iter == 1: + shmem.barrier_all() + local_pipeline_time[0] = time.monotonic() + + if me==0 and np>1: + shmem.wait_until(flag_left[0:1], shmem.CMP.EQ, false) + if iter>0: + grid[start[me]-1,0] = dst[0] + + for j in range(1,n): + if me > 0: + shmem.wait_until(flag_left[j:j+1], shmem.CMP.EQ, true) + grid[start[me]-1,j] = dst[j] + + for i in range(start[me],end[me]+1): + grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] + + if me != np-1: + src[j] = grid[end[me],j] + + shmem.put(dst[j:j+1], src[j:j+1], me+1) + shmem.fence() + + # indicate to right neighbor that data is available + shmem.put(flag_left[j:j+1], true, me+1) + + if np > 1: + if me == root: + corner_val = -grid[end[me],n-1] + src [0] = corner_val + shmem.put(dst[0:1], src[0:1], 0) + shmem.fence() + # indicate to PE 0 that data is available + shmem.put(flag_left[0:1], true, 0) + else: + grid[0,0] = -grid[end[me],n-1] + + local_pipeline_time[0] = time.monotonic() - local_pipeline_time[0] + shmem.max_reduce(pipeline_time, local_pipeline_time) + + # verify correctness, using top right value + corner_val = (iterations+1)*(m+n-2) + if me == root: + epsilon = 1e-8 + if abs(grid[end[me],n-1]-corner_val)/corner_val >= epsilon: + print(f"ERROR: checksum {grid[end[me],n-1]} does not match verification value {corner_val}") + shmem.global_exit(1) + + if me == root: + avgtime = pipeline_time[0]/iterations + print(f"Solution validates; verification value = {corner_val}") + print(f"Rate (MFlops/s): {1e-6 * 2 * (((m-1)*(n-1)))/avgtime} Avg time (s): {avgtime}") + + +if __name__ == '__main__': + main() diff --git a/PYTHON/stencil-numpy-shmem.py b/PYTHON/stencil-numpy-shmem.py index 3455e60e2..49ed32d0f 100644 --- a/PYTHON/stencil-numpy-shmem.py +++ b/PYTHON/stencil-numpy-shmem.py @@ -82,7 +82,7 @@ def main(): if len(sys.argv) < 3 or len(sys.argv) > 5: 
print(f"argument count = {len(sys.argv)}") - sys.exit("Usage: ./stencil <# iterations> []") + sys.exit("Usage: ... <# iterations> []") iterations = int(sys.argv[1]) if iterations < 1: From dbcc0074c2ecaf0b65e83261decf93edc75e90d8 Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Thu, 25 May 2023 00:27:47 +0300 Subject: [PATCH 286/325] fix error string --- PYTHON/transpose-numpy-shmem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py index 1495dec53..935e5f8cc 100755 --- a/PYTHON/transpose-numpy-shmem.py +++ b/PYTHON/transpose-numpy-shmem.py @@ -130,7 +130,7 @@ def main(): sys.exit("ERROR: order must be >= 1") if order % np != 0: - sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + sys.exit(f"ERROR: matrix order ({order}) should be divisible by # procs ({np})") block_order = int(order / np) From 68f5778170ba3cdadee9d57726037da347bd3c2d Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Fri, 26 May 2023 15:22:18 +0300 Subject: [PATCH 287/325] add p2p-numba-{mpi,shmem} --- PYTHON/p2p-numba-mpi.py | 216 ++++++++++++++++++++++++++++++++++++++ PYTHON/p2p-numba-shmem.py | 211 +++++++++++++++++++++++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 PYTHON/p2p-numba-mpi.py create mode 100644 PYTHON/p2p-numba-shmem.py diff --git a/PYTHON/p2p-numba-mpi.py b/PYTHON/p2p-numba-mpi.py new file mode 100644 index 000000000..24d8a85bc --- /dev/null +++ b/PYTHON/p2p-numba-mpi.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Pipeline +# +# PURPOSE: This program tests the efficiency with which point-to-point +# synchronization can be carried out. It does so by executing +# a pipelined algorithm on an m*n grid. The first array dimension +# is distributed among the ranks (stripwise decomposition). 
+# +# USAGE: The program takes as input the dimensions of the grid, and the +# number of times we loop over the grid +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Rob Van der Wijngaart, March 2006. +# - Modified by Rob Van der Wijngaart, August 2006: +# * changed boundary conditions and stencil computation to avoid +# overflow +# * introduced multiple iterations over grid and dependency between +# iterations +# - Converted to Python by Marcin Rogowski, May 2023. +# +# ******************************************************************* + +import sys +from mpi4py import MPI +import numpy +from numba import jit + +@jit(nopython=True) +def iterate_over_grid(grid, i1, i2, j1, j2): + for i in range(i1,i2): + for j in range(j1,j2): + grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] + +@jit(nopython=True) +def iterate_over_grid_grp(grid, g, i1, i2, j1, j2): + for i in range(i1,i2): + for j in range(j1,j2): + grid[i,j+g] = grid[i-1,j+g] + grid[i,j-1+g] - grid[i-1,j-1+g] + + +def main(): + comm = MPI.COMM_WORLD + me = comm.Get_rank() + np = comm.Get_size() + + final = np-1 + + if me==0: + print("Parallel Research Kernels") + print("MPI pipeline execution on 2D grid") + + if len(sys.argv) < 4 or len(sys.argv) > 5: + print(f"argument count = {len(sys.argv)}") + sys.exit("Usage: ... <# iterations> <1st array dimension> <2nd array dimension> []") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + m = int(sys.argv[2]) + if m < 1: + sys.exit("ERROR: array dimension must be >= 1") + + n = int(sys.argv[3]) + if n < 1: + sys.exit("ERROR: array dimension must be >= 1") + + if len(sys.argv) == 5: + grp = int(sys.argv[4]) + if grp < 1: + sys.exit("ERROR: group factor must be >= 1") + else: + grp = 1 + + if me == 0: + print(f"Number of ranks = {np}") + print(f"Grid sizes = {m}, {n}") + print(f"Number of iterations = {iterations}") + if (grp > 1): + print(f"Group factor = {grp}") + + local_pipeline_time = 0.0 + pipeline_time = 0.0 + + segment_size = m//np + leftover = m%np + if me < leftover: + start = (segment_size+1) * me + end = start + segment_size + else: + start = (segment_size+1) * leftover + segment_size * (me-leftover) + end = start + segment_size - 1 + + # now set segment_size to the value needed by the calling rank + segment_size = end - start + 1 + grid = numpy.zeros((segment_size+1,n), dtype='d') + + inbuf = numpy.zeros(grp, dtype='d') + outbuf = numpy.zeros(grp, dtype='d') + + # set boundary values (bottom and left side of grid) + if me==0: + grid[0,:] = list(range(n)) + for i in range(start-1,end+1): + grid[i-start,0] = i + + # redefine start and end for calling rank to reflect local indices + if me==0: + start = 1 + else: + start = 0 + end = segment_size-1 + + for iter in range(0,iterations+1): + if iter == 1: + comm.Barrier() + local_pipeline_time = MPI.Wtime() + + # special case for no grouping + if grp == 1: + for j in range(1,n): + # if I am not at the left boundary, I need to wait for my left neighbor to send data + if me > 0: + comm.Recv(grid[start-1,j:j+1], source=me-1, tag=j) + + iterate_over_grid(grid, start, end+1, 1, n) + + for j in range(1,n): + # if I am not on the right boundary, send data to my right neighbor + if me < np-1: + comm.Send(grid[end,j:j+1], dest=me+1, tag=j) + + # apply grouping + else: + for j in range(1, n, grp): + jjsize = min(grp, n-j) + + # if I am not at the left boundary, I need to wait for my left 
neighbor to send data + if me > 0: + comm.Recv(inbuf, source=me-1, tag=j) + grid[start-1,j:j+jjsize] = inbuf[0:jjsize] + + iterate_over_grid_grp(grid, j, start, end+1, 0, jjsize) + + # if I am not on the right boundary, send data to my right neighbor + if me < np-1: + outbuf[0:jjsize] = grid[end,j:j+jjsize] + comm.Send(outbuf, dest=me+1, tag=j) + + # copy top right corner value to bottom left corner to create dependency + if np > 1: + if me == final: + corner_val = -grid[end,n-1] + comm.Send(corner_val, dest=0, tag=888) + + if me == 0: + comm.Recv(grid[0,0:1], source=final, tag=888) + else: + grid[0,0] = -grid[end,n-1] + + local_pipeline_time = MPI.Wtime() - local_pipeline_time + pipeline_time = comm.reduce(local_pipeline_time, op=MPI.MAX, root=final) + + # verify correctness, using top right value + corner_val = (iterations+1)*(m+n-2) + if me == final: + epsilon = 1e-8 + if abs(grid[end,n-1]-corner_val)/corner_val >= epsilon: + print(f"ERROR: checksum {grid[end,n-1]} does not match verification value {corner_val}") + sys.exit() + + if me == final: + avgtime = pipeline_time/iterations + print(f"Solution validates; verification value = {corner_val}") + print(f"Rate (MFlops/s): {1e-6 * 2 * (((m-1)*(n-1)))/avgtime} Avg time (s): {avgtime}") + + +if __name__ == '__main__': + main() diff --git a/PYTHON/p2p-numba-shmem.py b/PYTHON/p2p-numba-shmem.py new file mode 100644 index 000000000..7058bf016 --- /dev/null +++ b/PYTHON/p2p-numba-shmem.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Pipeline +# +# PURPOSE: This program tests the efficiency with which point-to-point +# synchronization can be carried out. It does so by executing +# a pipelined algorithm on an m*n grid. The first array dimension +# is distributed among the ranks (stripwise decomposition). 
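#
# Aside for the reader (illustrative, not from the kernel itself): the
# point-to-point synchronization below is built from one-sided puts ordered
# by a fence, followed by a flag put that the neighbor spins on. A minimal
# sketch of that producer/consumer handshake with shmem4py, assuming the
# library is initialized on import as in this kernel and using illustrative
# names:
#
#     import numpy
#     from shmem4py import shmem
#     me   = shmem.my_pe()
#     data = shmem.zeros(1, dtype='d')              # symmetric destination buffer
#     flag = shmem.zeros(1, dtype='i')              # symmetric "data ready" flag
#     one  = shmem.array([1], dtype='i')
#     shmem.barrier_all()
#     if me == 0 and shmem.n_pes() > 1:
#         shmem.put(data[0:1], numpy.array([42.0]), 1)   # write payload into PE 1
#         shmem.fence()                                  # payload lands before the flag
#         shmem.put(flag[0:1], one, 1)                   # raise PE 1's flag
#     elif me == 1:
#         shmem.wait_until(flag[0:1], shmem.CMP.EQ, one) # spin until the flag is set
#         print("PE 1 received", data[0])
#     shmem.barrier_all()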
+# +# USAGE: The program takes as input the dimensions of the grid, and the +# number of times we loop over the grid +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Rob Van der Wijngaart, March 2006. +# - Modified by Rob Van der Wijngaart, August 2006: +# * changed boundary conditions and stencil computation to avoid +# overflow +# * introduced multiple iterations over grid and dependency between +# iterations +# - Converted to Python by Marcin Rogowski, May 2023. +# +# ******************************************************************* + +import sys +import time +import numpy +from shmem4py import shmem +from numba import jit + +@jit(nopython=True) +def iterate_over_grid(grid, i1, i2, j1, j2): + for i in range(i1,i2): + for j in range(j1,j2): + grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] + +def main(): + me = shmem.my_pe() + np = shmem.n_pes() + + root = np-1 + + if me==0: + print("Parallel Research Kernels") + print("SHMEM pipeline execution on 2D grid") + + if len(sys.argv) != 4: + print(f"argument count = {len(sys.argv)}") + sys.exit("Usage: ... <# iterations> <1st array dimension> <2nd array dimension>") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + m = int(sys.argv[2]) + if m < 1: + sys.exit("ERROR: array dimension must be >= 1") + if m <= np: + print("Error: m must be greater than the number of PEs") + exit(1) + + n = int(sys.argv[3]) + if n < 1: + sys.exit("ERROR: array dimension must be >= 1") + + if me == root: + print(f"Number of ranks = {np}") + print(f"Grid sizes = {m}, {n}") + print(f"Number of iterations = {iterations}") + print(f"No handshake between neighbor threads") + + shmem.barrier_all() + + dst = shmem.zeros(n, dtype='d') + src = numpy.zeros(n, dtype='d') + + flag_left = shmem.zeros(n, dtype='i') + + local_pipeline_time = shmem.zeros(1, dtype='d') + pipeline_time = shmem.zeros(1, dtype='d') + + start = numpy.zeros(np, dtype='i') + end = numpy.zeros(np, dtype='i') + + for i in range(0,np): + segment_size = m//np + if i < m%np: + segment_size += 1 + if i > 0: + start[i] = end[i-1]+1 + + end[i] = start[i]+segment_size-1 + + segment_size = end[me] - start[me] + 1 + grid = numpy.zeros((segment_size+1,n), dtype='d') + + # set boundary values (bottom and left side of grid) + if me==0: + grid[0,:] = list(range(n)) + for i in range(start[me]-1,end[me]+1): + grid[i-start[me],0] = i + + # redefine start and end for calling rank to reflect local indices + if me==0: + start[me] = 1 + else: + start[me] = 0 + end[me] = segment_size-1 + + # initialize synchronization flags + true = shmem.array([1], dtype='i') + false = shmem.array([0], dtype='i') + + shmem.barrier_all() + + for iter in range(0,iterations+1): + true[0] = (iter+1)%2 + false[0] = 0 if true[0] else 1 + + if iter == 1: + shmem.barrier_all() + local_pipeline_time[0] = time.monotonic() + + if me==0 and np>1: + shmem.wait_until(flag_left[0:1], shmem.CMP.EQ, false) + if iter>0: + grid[start[me]-1,0] = dst[0] + + for j in range(1,n): + if me > 0: + shmem.wait_until(flag_left[j:j+1], shmem.CMP.EQ, true) + grid[start[me]-1,j] = dst[j] + + iterate_over_grid(grid, start[me], end[me]+1, 1, n) + + for j in range(1,n): + if me != np-1: + src[j] = grid[end[me],j] + + shmem.put(dst[j:j+1], src[j:j+1], me+1) + shmem.fence() + + # indicate to right neighbor that data is available + shmem.put(flag_left[j:j+1], true, me+1) + + if np > 1: + if me == root: + corner_val = 
-grid[end[me],n-1] + src [0] = corner_val + shmem.put(dst[0:1], src[0:1], 0) + shmem.fence() + # indicate to PE 0 that data is available + shmem.put(flag_left[0:1], true, 0) + else: + grid[0,0] = -grid[end[me],n-1] + + local_pipeline_time[0] = time.monotonic() - local_pipeline_time[0] + shmem.max_reduce(pipeline_time, local_pipeline_time) + + # verify correctness, using top right value + corner_val = (iterations+1)*(m+n-2) + if me == root: + epsilon = 1e-8 + if abs(grid[end[me],n-1]-corner_val)/corner_val >= epsilon: + print(f"ERROR: checksum {grid[end[me],n-1]} does not match verification value {corner_val}") + shmem.global_exit(1) + + if me == root: + avgtime = pipeline_time[0]/iterations + print(f"Solution validates; verification value = {corner_val}") + print(f"Rate (MFlops/s): {1e-6 * 2 * (((m-1)*(n-1)))/avgtime} Avg time (s): {avgtime}") + + +if __name__ == '__main__': + main() From 8a4dba8faebb2fa69cdc1e13d65d8ea466d492f9 Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Fri, 26 May 2023 16:54:02 +0300 Subject: [PATCH 288/325] add stencil-numba-{mpi,shmem} --- PYTHON/stencil-numba-mpi.py | 375 ++++++++++++++++++++++++++++++++ PYTHON/stencil-numba-shmem.py | 390 ++++++++++++++++++++++++++++++++++ 2 files changed, 765 insertions(+) create mode 100755 PYTHON/stencil-numba-mpi.py create mode 100644 PYTHON/stencil-numba-shmem.py diff --git a/PYTHON/stencil-numba-mpi.py b/PYTHON/stencil-numba-mpi.py new file mode 100755 index 000000000..7f9a06844 --- /dev/null +++ b/PYTHON/stencil-numba-mpi.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Yijian Hu +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Stencil +# +# PURPOSE: This program tests the efficiency with which a space-invariant, +# linear, symmetric filter (stencil) can be applied to a square +# grid or image. 
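#
# Aside for the reader (illustrative, not from the kernel itself): for the
# default "star" pattern of radius r, only the 2*r points along each axis
# carry nonzero, antisymmetric weights, and applying the operator to the
# linear field i+j adds exactly 2 to every interior point, which is what the
# final norm check relies on. A small serial sketch with illustrative sizes:
#
#     import numpy
#     n, r = 8, 2
#     A = numpy.fromfunction(lambda i, j: i + j, (n, n))
#     B = numpy.zeros((n, n))
#     W = numpy.zeros((2*r+1, 2*r+1))
#     for i in range(1, r+1):                       # star weights, as in the kernel
#         W[r, r+i] = W[r+i, r] = +1.0/(2*i*r)
#         W[r, r-i] = W[r-i, r] = -1.0/(2*i*r)
#     for a in range(r, n-r):
#         for b in range(r, n-r):
#             for k in range(2*r+1):
#                 B[a, b] += W[r, k]*A[a, b-r+k] + W[k, r]*A[a-r+k, b]
#     assert numpy.allclose(B[r:n-r, r:n-r], 2.0)   # one application adds 2 per point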
+# +# USAGE: The program takes as input the linear +# dimension of the grid, and the number of iterations on the grid +# +# +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Rob Van der Wijngaart, February 2009. +# - RvdW: Removed unrolling pragmas for clarity; +# added constant to array "in" at end of each iteration to force +# refreshing of neighbor data in parallel versions; August 2013 +# - Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +import sys +from mpi4py import MPI +import numpy +from numba import jit + +@jit(nopython=True) +def star(n, r, A, B, W, jstart, jend, istart, iend): + for a in range(max(jstart, r), min(n-r-1, jend)+1): + a = a - jstart + for b in range(max(istart, r), min(n-r-1, iend)+1): + b = b - istart + for k in range(2 * r + 1): + B[a][b] += W[r][k] * A[a + k][b + r] + B[a][b] += W[k][r] * A[a + r][b + k] + +def factor(r): + fac1 = int(numpy.sqrt(r+1.0)) + fac2 = 0 + for fac1 in range(fac1, 0, -1): + if r%fac1 == 0: + fac2 = r/fac1 + break; + return fac1, fac2 + +def main(): + + comm = MPI.COMM_WORLD + me = comm.Get_rank() #My ID + np = comm.Get_size() #Number of processor, NOT numpy + x, y = factor(np) + comm = comm.Create_cart([x,y]) + coords = comm.Get_coords(me) + X = coords[0] + Y = coords[1] + + x = int(x) + y = int(y) + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if me==0: + print('Parallel Research Kernels ') + print('Python MPI/Numpy Stencil execution on 2D grid') + + if len(sys.argv) < 3 or len(sys.argv) > 5: + print('argument count = ', len(sys.argv)) + sys.exit("Usage: ./stencil <# iterations> [ ]") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + n = int(sys.argv[2]) + nsquare = n * n + if nsquare < np: + sys.exit("ERROR: grid size ", nsquare, " must be at least # ranks: ", np) + + + if len(sys.argv) > 3: + pattern = sys.argv[3] + else: + pattern = 'star' + + if len(sys.argv) > 4: + r = int(sys.argv[4]) + if r < 1: + sys.exit("ERROR: Stencil radius should be positive") + if (2*r+1) > n: + sys.exit("ERROR: Stencil radius exceeds grid size") + else: + r = 2 + + + if me == 0: + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Grid size = ', n) + if pattern == 'star': + print('Type of stencil = star') + else: + print('Type of stencil = stencil') + print('Radius of stencil = ', r) + print('Data type = float 64 (double precision in C)') + print('Compact representation of stencil loop body') + + + + W = numpy.zeros((2*r+1,2*r+1)) + if pattern == 'star': + stencil_size = 4*r+1 + #vh = numpy.fromfunction(lambda i: 1./(2*r*(i-r)), (2*r+1,), dtype=float) + #vh[r] = 0.0 + #W[:,r] = vh + #W[r,:] = vh + for i in range(1,r+1): + W[r,r+i] = +1./(2*i*r) + W[r+i,r] = +1./(2*i*r) + W[r,r-i] = -1./(2*i*r) + W[r-i,r] = -1./(2*i*r) + + else: + stencil_size = (2*r+1)**2 + #W = numpy.fromfunction(lambda i,j: 1./(4 * numpy.maximum(numpy.abs(i-r),numpy.abs(j-r)) * (2*numpy.maximum(numpy.abs(i-r),numpy.abs(j-r)) - 1) * r),(2*r+1,2*r+1),dtype=float) + #sign = numpy.fromfunction(lambda i,j: j-i,(2*r+1,2*r+1) ) + #sign = numpy.sign(sign[::-1]) + #temp = numpy.fromfunction(lambda x: 1./((x-r)*4*r),(2*r+1,),dtype=float) #main diagonal + #temp[r]=0 + #W = numpy.fill_diagonal(sign*W,temp) + 
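        # (Reader's note, added for exposition: the loop below fills the weights of
        #  the compact "grid" stencil, antisymmetric about the center and scaled so
        #  that, like the star pattern above, one application to the linear field
        #  i+j appears to add 2 per interior point, consistent with the reference
        #  norm (iterations+1)*2.0 checked at the end of main(); stencil_size is
        #  only used for the flop-rate report.)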
for j in range(1,r+1): + for i in range(-j+1,j): + W[r+i,r+j] = +1./(4*j*(2*j-1)*r) + W[r+i,r-j] = -1./(4*j*(2*j-1)*r) + W[r+j,r+i] = +1./(4*j*(2*j-1)*r) + W[r-j,r+i] = -1./(4*j*(2*j-1)*r) + + W[r+j,r+j] = +1./(4*j*r) + W[r-j,r-j] = -1./(4*j*r) + + width = n//x + leftover = n%x + + if X 0: + bot_nbr = comm.Get_cart_rank([X,Y-1]) + if X > 0: + left_nbr = comm.Get_cart_rank([X-1,Y]) + if X < x-1: + right_nbr = comm.Get_cart_rank([X+1,Y]) + + if np > 1: + top_buf_in = numpy.zeros(r*width) + top_buf_out = numpy.zeros(r*width) + bot_buf_in = numpy.zeros(r*width) + bot_buf_out = numpy.zeros(r*width) + + right_buf_in = numpy.zeros(r*height) + right_buf_out = numpy.zeros(r*height) + left_buf_in = numpy.zeros(r*height) + left_buf_out = numpy.zeros(r*height) + + for i in range(0,iterations+1): + if i<1: + comm.Barrier() + t0 = MPI.Wtime() + + if Y < y-1 : + #req0 = comm.Irecv([top_buf_in, r*width, typ], source =top_nbr , tag =101 ) + # ^ I do not know why this does not work + req0 = comm.Irecv(top_buf_in, source =top_nbr , tag =101 ) + kk=0 + for a in range(jend-r+1, jend+1): + a = a - jstart + for b in range(istart, iend+1): + b = b-istart + top_buf_out[kk] = A[a+r][b+r] + kk = kk+1 + req1 = comm.Isend(top_buf_out, dest =top_nbr, tag =99) + + if Y > 0 : + req2 = comm.Irecv(bot_buf_in, source =bot_nbr , tag =99 ) + kk=0 + for a in range(jstart, jstart+r): + a = a - jstart + for b in range(istart, iend+1): + b = b-istart + bot_buf_out[kk] = A[a+r][b+r] + kk = kk+1 + req3 = comm.Isend(bot_buf_out, dest =bot_nbr, tag =101) + + if X < x-1 : + req4 = comm.Irecv(right_buf_in, source =right_nbr , tag =1010) + kk=0 + for a in range(jstart, jend+1): + a = a - jstart + for b in range(iend-r+1, iend+1): + b = b-istart + right_buf_out[kk] = A[a+r][b+r] + kk = kk+1 + req5 = comm.Isend(right_buf_out, dest =right_nbr, tag =990) + + if X > 0 : + req6 = comm.Irecv(left_buf_in, source =left_nbr , tag =990 ) + kk=0 + for a in range(jstart, jend+1): + a = a - jstart + for b in range(istart, istart+r): + b = b-istart + left_buf_out[kk] = A[a+r][b+r] + kk = kk+1 + req7 = comm.Isend(left_buf_out, dest =left_nbr, tag =1010) + + + if Y < y-1 : + req0.wait() + req1.wait() + kk=0 + for a in range(jend+1, jend+r+1): + a = a - jstart + for b in range(istart, iend+1): + b = b-istart + A[a+r][b+r] = top_buf_in[kk] + kk = kk+1 + + if Y > 0 : + req2.wait() + req3.wait() + kk=0 + for a in range(jstart-r, jstart): + a = a-jstart + for b in range(istart, iend+1): + b = b-istart + A[a+r][b+r] = bot_buf_in[kk] + kk = kk+1 + + if X > 0 : + req6.wait() + req7.wait() + kk=0 + for a in range(jstart, jend+1): + a = a - jstart + for b in range(istart-r, istart): + b = b-istart + A[a+r][b+r] = left_buf_in[kk] + kk = kk+1 + + if X < x-1 : + req4.wait() + req5.wait() + kk=0 + for a in range(jstart, jend+1): + a = a - jstart + for b in range(iend+1, iend+r+1): + b = b-istart + A[a+r][b+r] = right_buf_in[kk] + kk = kk+1 + + # Apply the stencil operator + star(n,r,A,B,W,jstart,jend,istart,iend) + + numpy.add(A[0:jend-r+1,0:iend-r+1],1) + + local_time = numpy.array(MPI.Wtime() - t0 , dtype ='f') + total_time = numpy.array(0 , dtype ='f') + + comm.Reduce([local_time , 1 , typ],[total_time , 1 , typ], op=MPI.MAX , root =0) + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + # compute L1 norm in parallel + local_norm = 0.0; + for a in range(max(jstart,r), min(n-r-1,jend)+1): + for b in range(max(istart,r), min(n-r-1,iend)+1): + local_norm = local_norm + abs(B[a-jstart][b-istart]) + + local_norm = numpy.array(local_norm, dtype ='f') + norm = numpy.array(0 , dtype ='f') + comm.Reduce([local_norm, 1 , typ], [norm, 1, typ], op=MPI.SUM , root =0) + + if me == 0: + epsilon=1.e-8 + active_points = (n-2*r)**2 + norm = norm / active_points + if r > 0: + ref_norm = (iterations+1)*(2.0) + else: + ref_norm = 0.0 + if abs(norm-ref_norm) < epsilon: + print('Solution validates') + flops = (2*stencil_size+1) * active_points + avgtime = total_time/iterations + print('Rate (MFlops/s): ',1.e-6*flops/avgtime, ' Avg time (s): ',avgtime) + else: + print('ERROR: L1 norm = ', norm,' Reference L1 norm = ', ref_norm) + sys.exit() + + +if __name__ == '__main__': + main() diff --git a/PYTHON/stencil-numba-shmem.py b/PYTHON/stencil-numba-shmem.py new file mode 100644 index 000000000..9fe78d3da --- /dev/null +++ b/PYTHON/stencil-numba-shmem.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# ******************************************************************* +# +# NAME: Stencil +# +# PURPOSE: This program tests the efficiency with which a space-invariant, +# linear, symmetric filter (stencil) can be applied to a square +# grid or image. +# +# USAGE: The program takes as input the linear +# dimension of the grid, and the number of iterations on the grid +# +# +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# HISTORY: - Written by Tom St. John, July 2015. 
+# - Adapted by Rob Van der Wijngaart to introduce double buffering, December 2015 +# - Converted to Python (MPI) by Jeff Hammond, February 2016 +# - Converted to Python (SHMEM) by Marcin Rogowski, May 2023 +# +# ******************************************************************* + +# TODO: currently, only the star stencil is implemented + +import sys +import time +import numpy +from shmem4py import shmem +from numba import jit + +@jit(nopython=True) +def star(n, r, A, B, W, jstart, jend, istart, iend): + for a in range(max(jstart, r), min(n-r-1, jend)+1): + a = a - jstart + for b in range(max(istart, r), min(n-r-1, iend)+1): + b = b - istart + + for k in range(2 * r + 1): + B[a,b] += W[r,k] * A[a + k, b + r] + B[a,b] += W[k,r] * A[a + r, b + k] + +def factor(r): + fac1 = int(numpy.sqrt(r+1.0)) + fac2 = 0 + for fac1 in range(fac1, 0, -1): + if r % fac1 == 0: + fac2 = r//fac1 + break + return fac1, fac2 + +def main(): + splitfence = False + + me = shmem.my_pe() + np = shmem.n_pes() + + if me==0: + print("Parallel Research Kernels") + print("Python SHMEM/Numpy Stencil execution on 2D grid") + + if len(sys.argv) < 3 or len(sys.argv) > 5: + print(f"argument count = {len(sys.argv)}") + sys.exit("Usage: ... <# iterations> []") + + iterations = int(sys.argv[1]) + if iterations < 1: + sys.exit("ERROR: iterations must be >= 1") + + n = int(sys.argv[2]) + nsquare = n * n + if nsquare < np: + sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") + + if len(sys.argv) > 3: + radius = int(sys.argv[3]) + if radius < 1: + sys.exit("ERROR: Stencil radius should be positive") + if 2*radius+1 > n: + sys.exit("ERROR: Stencil radius exceeds grid size") + else: + radius = 2 + + if me == 0: + print("Number of ranks = ", np) + print("Number of iterations = ", iterations) + print("Grid size = ", n) + print("Type of stencil = star") + print("Radius of stencil = ", radius) + print("Data type = float 64 (double precision in C)") + + weight = numpy.zeros((2*radius+1, 2*radius+1), dtype='f') + + local_stencil_time = shmem.zeros(1, dtype='f') + stencil_time = shmem.zeros(1, dtype='f') + local_norm = shmem.zeros(1, dtype='f') + norm = shmem.zeros(1, dtype='f') + iterflag = shmem.zeros(2, dtype='i') + width = shmem.zeros(1, dtype='i') + maxwidth = shmem.zeros(1, dtype='i') + height = shmem.zeros(1, dtype='i') + maxheight = shmem.zeros(1, dtype='i') + + npx, npy = factor(np) + + mex = me%npx + mey = me//npx + right_nbr = me+1 + left_nbr = me-1 + top_nbr = me+npx + bottom_nbr = me-npx + count_case = 4 + + if mex == 0: + count_case -= 1 + if mex == npx-1: + count_case -= 1 + if mey == 0: + count_case -= 1 + if mey == npy-1: + count_case -= 1 + + shmem.barrier_all() + + width[0] = n//npx + leftover = n%npx + if mex < leftover: + istart = (width[0]+1) * mex + iend = istart + width[0] + 1 + else: + istart = (width[0]+1) * leftover + width[0] * (mex-leftover) + iend = istart + width[0] + + width[0] = iend - istart + 1 + if width[0] == 0: + print(f"ERROR: rank {me} has no work to do") + shmem.global_exit(1) + + height[0] = n//npy + leftover = n%npy + if mey < leftover: + jstart = (height[0]+1) * mey + jstart = int(jstart) + jend = jstart + height[0] + 1 + else: + jstart = (height[0]+1) * leftover + height[0] * (mey-leftover) + jstart = int(jstart) + jend = jstart + height[0] + + height[0] = jend - jstart + 1 + if height == 0: + print(f"ERROR: rank {me} has no work to do") + shmem.global_exit(1) + + if width[0] < radius or height[0] < radius: + print(f"ERROR: rank {me} has work tile smaller then stencil radius") + 
shmem.global_exit(1) + + a = numpy.fromfunction(lambda i, j: i+istart+j+jstart, (height[0], width[0]), dtype=float) + A = numpy.zeros((height[0]+2*radius, width[0]+2*radius), dtype='f') + A[radius:-radius, radius:-radius] = a + B = numpy.zeros((height[0], width[0]), dtype='f') + + shmem.barrier_all() + shmem.max_reduce(maxwidth, width) + shmem.barrier_all() + shmem.max_reduce(maxheight, height) + + for ii in range(1, radius+1): + weight[0+radius][ii+radius] = 1.0/(2.0*ii*radius) + weight[ii+radius][0+radius] = 1.0/(2.0*ii*radius) + weight[0+radius][-ii+radius] = -1.0/(2.0*ii*radius) + weight[-ii+radius][0+radius] = -1.0/(2.0*ii*radius) + + # allocate communication buffers for halo values + top_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') + bottom_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') + + top_buf_in = {} + bottom_buf_in = {} + top_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') + top_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') + bottom_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') + bottom_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') + + right_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') + left_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') + + right_buf_in = {} + left_buf_in = {} + right_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') + right_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') + left_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') + left_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') + + shmem.barrier_all() + + for iter in range(0, iterations+1): + # start timer after a warmup iteration + if iter == 1: + shmem.barrier_all() + local_stencil_time[0] = time.monotonic() + + # sw determines which incoming buffer to select + sw = iter % 2 + + # need to fetch ghost point data from neighbors + if mey < npy-1: + kk = 0 + for j in range(jend-radius, jend): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + top_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(bottom_buf_in[sw], top_buf_out, top_nbr, radius * width[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], top_nbr) + + if mey > 0: + kk = 0 + for j in range(jstart, jstart+radius): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + bottom_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(top_buf_in[sw], bottom_buf_out, bottom_nbr, radius*width[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], bottom_nbr) + + if mex < npx-1: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(iend-radius, iend): + i = i - istart + right_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(left_buf_in[sw], right_buf_out, right_nbr, radius*height[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], right_nbr) + + if mex > 0: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(istart, istart+radius): + i = i - istart + left_buf_out[kk] = A[j+radius][i+radius] + kk += 1 + shmem.put(right_buf_in[sw], left_buf_out, left_nbr, radius*height[0]) + if splitfence: + shmem.fence() + shmem.atomic_inc(iterflag[sw:sw+1], left_nbr) + + if not splitfence: + shmem.fence() + + if mey < npy-1 and top_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], top_nbr) + if mey > 0 and bottom_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], bottom_nbr) + if mex < npx-1 and right_nbr is not None: + shmem.atomic_inc(iterflag[sw:sw+1], right_nbr) + if mex > 0 and left_nbr is not None: + 
shmem.atomic_inc(iterflag[sw:sw+1], left_nbr) + + shmem.wait_until(iterflag[sw:sw+1], shmem.CMP.EQ, count_case * (iter // 2 + 1)) + + if mey < npy-1: + kk = 0 + for j in range(jend, jend+radius): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + A[j+radius][i+radius] = top_buf_in[sw][kk] + kk += 1 + + if mey > 0: + kk = 0 + for j in range(jstart-radius, jstart): + j = j - jstart + for i in range(istart, iend+1): + i = i - istart + A[j+radius][i+radius] = bottom_buf_in[sw][kk] + kk += 1 + + if mex < npx-1: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(iend, iend+radius): + i = i - istart + A[j+radius][i+radius] = right_buf_in[sw][kk] + kk += 1 + + if mex > 0: + kk = 0 + for j in range(jstart, jend+1): + j = j - jstart + for i in range(istart-radius, istart): + i = i - istart + A[j+radius][i+radius] = left_buf_in[sw][kk] + kk += 1 + + # Apply the stencil operator + star(n,radius,A,B,weight,jstart,jend,istart,iend) + + # add constant to solution to force refresh of neighbor data, if any + numpy.add(A[0:jend-radius+1, 0:iend-radius+1], 1) + + local_stencil_time[0] = time.monotonic() - local_stencil_time[0] + shmem.barrier_all() + shmem.max_reduce(stencil_time, local_stencil_time) + + # ******************************************************************** + # ** Analyze and output results. + # ******************************************************************** + + local_norm[0] = 0.0 + for j in range(max(jstart, radius), min(n-radius, jend)): + for i in range(max(istart, radius), min(n-radius, iend)): + local_norm[0] += abs(B[j-jstart][i-istart]) + + shmem.barrier_all() + shmem.sum_reduce(norm, local_norm) + + # verify correctness + active_points = (n-2*radius)**2 + if me == 0: + epsilon = 1e-8 + norm[0] /= active_points + if radius > 0: + reference_norm = (iterations+1) * (2.0) + else: + reference_norm = 0.0 + if abs(norm[0]-reference_norm) > epsilon: + print(f"ERROR: L1 norm = {norm[0]}, Reference L1 norm = {reference_norm}") + shmem.global_exit(1) + else: + print(f"Reference L1 norm = {reference_norm}, L1 norm = {norm[0]}") + + if me == 0: + # flops/stencil: 2 flops (fma) for each point in the stencil + # plus one flop for the update of the input of the array + stencil_size = 4*radius+1 + flops = (2*stencil_size+1) * active_points + avgtime = stencil_time[0]/iterations + print(f"Rate (MFlops/s): {1.0E-06 * flops/avgtime} Avg time (s): {avgtime}") + + +if __name__ == '__main__': + main() From e78c1b3f44ed5c058463d0b2607894c3cba15df3 Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Sun, 28 May 2023 20:10:52 +0300 Subject: [PATCH 289/325] fixes --- PYTHON/stencil-numba-mpi.py | 30 +++++++------- PYTHON/stencil-numba-shmem.py | 69 +++++++++++++++++-------------- PYTHON/stencil-numpy-mpi.py | 27 ++++++------ PYTHON/stencil-numpy-shmem.py | 67 ++++++++++++++++-------------- PYTHON/transpose-numpy-mpi-p2p.py | 8 ++-- PYTHON/transpose-numpy-mpi.py | 6 +-- PYTHON/transpose-numpy-shmem.py | 4 +- 7 files changed, 113 insertions(+), 98 deletions(-) diff --git a/PYTHON/stencil-numba-mpi.py b/PYTHON/stencil-numba-mpi.py index 7f9a06844..48efac92d 100755 --- a/PYTHON/stencil-numba-mpi.py +++ b/PYTHON/stencil-numba-mpi.py @@ -76,7 +76,7 @@ def factor(r): for fac1 in range(fac1, 0, -1): if r%fac1 == 0: fac2 = r/fac1 - break; + break return fac1, fac2 def main(): @@ -99,7 +99,7 @@ def main(): if me==0: print('Parallel Research Kernels ') - print('Python MPI/Numpy Stencil execution on 2D grid') + print('Python MPI/Numba Stencil execution on 2D grid') if 
len(sys.argv) < 3 or len(sys.argv) > 5: print('argument count = ', len(sys.argv)) @@ -112,7 +112,7 @@ def main(): n = int(sys.argv[2]) nsquare = n * n if nsquare < np: - sys.exit("ERROR: grid size ", nsquare, " must be at least # ranks: ", np) + sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") if len(sys.argv) > 3: @@ -188,7 +188,7 @@ def main(): width = iend - istart + 1 if width == 0 : - sys.exit("ERROR: rank", me,"has no work to do") + sys.exit(f"ERROR: rank {me} has no work to do") height = n//y leftover = n%y @@ -202,16 +202,16 @@ def main(): height = jend - jstart + 1 if height == 0: - sys.exit("ERROR: rank", me,"has no work to do") + sys.exit(f"ERROR: rank {me} has no work to do") if width < r or height < r: - sys.exit("ERROR: rank", me,"has work tile smaller then stencil radius") + sys.exit(f"ERROR: rank {me} has work tile smaller then stencil radius") A = numpy.zeros((height+2*r,width+2*r)) - a = numpy.fromfunction(lambda i,j: i+istart+j+jstart,(height,width),dtype=float) + a = numpy.fromfunction(lambda i,j: i+istart+j+jstart,(height,width),dtype='d') A[r:-r,r:-r] = a B = numpy.zeros((height,width)) - typ = MPI.FLOAT + typ = MPI.DOUBLE_PRECISION if Y < y-1: top_nbr = comm.Get_cart_rank([X,Y+1]) @@ -331,11 +331,11 @@ def main(): # Apply the stencil operator star(n,r,A,B,W,jstart,jend,istart,iend) + A[r:jend-jstart+r+1,r:iend-istart+r+1] += 1.0 + # numpy.add(A[r:jend-jstart+r+1,r:iend-istart+r+1],1.0,A[r:jend-jstart+r+1,r:iend-istart+r+1]] - numpy.add(A[0:jend-r+1,0:iend-r+1],1) - - local_time = numpy.array(MPI.Wtime() - t0 , dtype ='f') - total_time = numpy.array(0 , dtype ='f') + local_time = numpy.array(MPI.Wtime() - t0 , dtype ='d') + total_time = numpy.array(0 , dtype ='d') comm.Reduce([local_time , 1 , typ],[total_time , 1 , typ], op=MPI.MAX , root =0) @@ -344,13 +344,13 @@ def main(): # ******************************************************************** # compute L1 norm in parallel - local_norm = 0.0; + local_norm = 0.0 for a in range(max(jstart,r), min(n-r-1,jend)+1): for b in range(max(istart,r), min(n-r-1,iend)+1): local_norm = local_norm + abs(B[a-jstart][b-istart]) - local_norm = numpy.array(local_norm, dtype ='f') - norm = numpy.array(0 , dtype ='f') + local_norm = numpy.array(local_norm, dtype ='d') + norm = numpy.array(0 , dtype ='d') comm.Reduce([local_norm, 1 , typ], [norm, 1, typ], op=MPI.SUM , root =0) if me == 0: diff --git a/PYTHON/stencil-numba-shmem.py b/PYTHON/stencil-numba-shmem.py index 9fe78d3da..8e3ca6765 100644 --- a/PYTHON/stencil-numba-shmem.py +++ b/PYTHON/stencil-numba-shmem.py @@ -90,7 +90,7 @@ def main(): if me==0: print("Parallel Research Kernels") - print("Python SHMEM/Numpy Stencil execution on 2D grid") + print("Python SHMEM/Numba Stencil execution on 2D grid") if len(sys.argv) < 3 or len(sys.argv) > 5: print(f"argument count = {len(sys.argv)}") @@ -106,7 +106,15 @@ def main(): sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") if len(sys.argv) > 3: - radius = int(sys.argv[3]) + pattern = sys.argv[3] + else: + pattern = 'star' + + if pattern != 'star': + sys.exit("ERROR: Only star pattern is supported") + + if len(sys.argv) > 4: + radius = int(sys.argv[4]) if radius < 1: sys.exit("ERROR: Stencil radius should be positive") if 2*radius+1 > n: @@ -118,16 +126,16 @@ def main(): print("Number of ranks = ", np) print("Number of iterations = ", iterations) print("Grid size = ", n) - print("Type of stencil = star") + print("Type of stencil = ", pattern) print("Radius of stencil = ", radius) print("Data type 
= float 64 (double precision in C)") - weight = numpy.zeros((2*radius+1, 2*radius+1), dtype='f') + weight = numpy.zeros((2*radius+1, 2*radius+1), dtype='d') - local_stencil_time = shmem.zeros(1, dtype='f') - stencil_time = shmem.zeros(1, dtype='f') - local_norm = shmem.zeros(1, dtype='f') - norm = shmem.zeros(1, dtype='f') + local_stencil_time = shmem.zeros(1, dtype='d') + stencil_time = shmem.zeros(1, dtype='d') + local_norm = shmem.zeros(1, dtype='d') + norm = shmem.zeros(1, dtype='d') iterflag = shmem.zeros(2, dtype='i') width = shmem.zeros(1, dtype='i') maxwidth = shmem.zeros(1, dtype='i') @@ -173,15 +181,13 @@ def main(): leftover = n%npy if mey < leftover: jstart = (height[0]+1) * mey - jstart = int(jstart) jend = jstart + height[0] + 1 else: jstart = (height[0]+1) * leftover + height[0] * (mey-leftover) - jstart = int(jstart) jend = jstart + height[0] height[0] = jend - jstart + 1 - if height == 0: + if height[0] == 0: print(f"ERROR: rank {me} has no work to do") shmem.global_exit(1) @@ -189,10 +195,10 @@ def main(): print(f"ERROR: rank {me} has work tile smaller then stencil radius") shmem.global_exit(1) - a = numpy.fromfunction(lambda i, j: i+istart+j+jstart, (height[0], width[0]), dtype=float) - A = numpy.zeros((height[0]+2*radius, width[0]+2*radius), dtype='f') + a = numpy.fromfunction(lambda i, j: i+istart+j+jstart, (height[0], width[0]), dtype='d') + A = numpy.zeros((height[0]+2*radius, width[0]+2*radius), dtype='d') A[radius:-radius, radius:-radius] = a - B = numpy.zeros((height[0], width[0]), dtype='f') + B = numpy.zeros((height[0], width[0]), dtype='d') shmem.barrier_all() shmem.max_reduce(maxwidth, width) @@ -206,25 +212,25 @@ def main(): weight[-ii+radius][0+radius] = -1.0/(2.0*ii*radius) # allocate communication buffers for halo values - top_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') - bottom_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') + top_buf_out = shmem.zeros(radius*maxwidth[0], dtype='d') + bottom_buf_out = shmem.zeros(radius*maxwidth[0], dtype='d') top_buf_in = {} bottom_buf_in = {} - top_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') - top_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') - bottom_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') - bottom_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') + top_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='d') + top_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='d') + bottom_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='d') + bottom_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='d') - right_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') - left_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') + right_buf_out = shmem.zeros(radius*maxheight[0], dtype='d') + left_buf_out = shmem.zeros(radius*maxheight[0], dtype='d') right_buf_in = {} left_buf_in = {} - right_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') - right_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') - left_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') - left_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') + right_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='d') + right_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='d') + left_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='d') + left_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='d') shmem.barrier_all() @@ -293,13 +299,13 @@ def main(): if not splitfence: shmem.fence() - if mey < npy-1 and top_nbr is not None: + if mey < npy-1: shmem.atomic_inc(iterflag[sw:sw+1], top_nbr) - if mey > 
0 and bottom_nbr is not None: + if mey > 0: shmem.atomic_inc(iterflag[sw:sw+1], bottom_nbr) - if mex < npx-1 and right_nbr is not None: + if mex < npx-1: shmem.atomic_inc(iterflag[sw:sw+1], right_nbr) - if mex > 0 and left_nbr is not None: + if mex > 0: shmem.atomic_inc(iterflag[sw:sw+1], left_nbr) shmem.wait_until(iterflag[sw:sw+1], shmem.CMP.EQ, count_case * (iter // 2 + 1)) @@ -344,7 +350,8 @@ def main(): star(n,radius,A,B,weight,jstart,jend,istart,iend) # add constant to solution to force refresh of neighbor data, if any - numpy.add(A[0:jend-radius+1, 0:iend-radius+1], 1) + A[radius:jend-jstart+radius,radius:iend-istart+radius] += 1.0 + # numpy.add(A[radius:jend-jstart+radius,radius:iend-istart+radius], 1.0, A[radius:jend-jstart+radius,radius:iend-istart+radius]) local_stencil_time[0] = time.monotonic() - local_stencil_time[0] shmem.barrier_all() diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py index 0683a747d..30b64457b 100755 --- a/PYTHON/stencil-numpy-mpi.py +++ b/PYTHON/stencil-numpy-mpi.py @@ -65,7 +65,7 @@ def factor(r): for fac1 in range(fac1, 0, -1): if r%fac1 == 0: fac2 = r/fac1 - break; + break return fac1, fac2 def main(): @@ -101,7 +101,7 @@ def main(): n = int(sys.argv[2]) nsquare = n * n if nsquare < np: - sys.exit("ERROR: grid size ", nsquare, " must be at least # ranks: ", np) + sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") if len(sys.argv) > 3: @@ -177,7 +177,7 @@ def main(): width = iend - istart + 1 if width == 0 : - sys.exit("ERROR: rank", me,"has no work to do") + sys.exit(f"ERROR: rank {me} has no work to do") height = n//y leftover = n%y @@ -191,16 +191,16 @@ def main(): height = jend - jstart + 1 if height == 0: - sys.exit("ERROR: rank", me,"has no work to do") + sys.exit(f"ERROR: rank {me} has no work to do") if width < r or height < r: - sys.exit("ERROR: rank", me,"has work tile smaller then stencil radius") + sys.exit(f"ERROR: rank {me} has work tile smaller then stencil radius") A = numpy.zeros((height+2*r,width+2*r)) - a = numpy.fromfunction(lambda i,j: i+istart+j+jstart,(height,width),dtype=float) + a = numpy.fromfunction(lambda i,j: i+istart+j+jstart,(height,width),dtype='d') A[r:-r,r:-r] = a B = numpy.zeros((height,width)) - typ = MPI.FLOAT + typ = MPI.DOUBLE_PRECISION if Y < y-1: top_nbr = comm.Get_cart_rank([X,Y+1]) @@ -326,10 +326,11 @@ def main(): B[a][b] = B[a][b] + numpy.dot(W[r],A[a:a+2*r+1,b+r]) B[a][b] = B[a][b] + numpy.dot(W[:,r],A[a+r,b:b+2*r+1]) - numpy.add(A[0:jend-r+1,0:iend-r+1],1) + A[r:jend-jstart+r+1,r:iend-istart+r+1] += 1.0 + # numpy.add(A[r:jend-jstart+r+1,r:iend-istart+r+1],1.0,A[r:jend-jstart+r+1,r:iend-istart+r+1]] - local_time = numpy.array(MPI.Wtime() - t0 , dtype ='f') - total_time = numpy.array(0 , dtype ='f') + local_time = numpy.array(MPI.Wtime() - t0 , dtype ='d') + total_time = numpy.array(0 , dtype ='d') comm.Reduce([local_time , 1 , typ],[total_time , 1 , typ], op=MPI.MAX , root =0) @@ -338,13 +339,13 @@ def main(): # ******************************************************************** # compute L1 norm in parallel - local_norm = 0.0; + local_norm = 0.0 for a in range(max(jstart,r), min(n-r-1,jend)+1): for b in range(max(istart,r), min(n-r-1,iend)+1): local_norm = local_norm + abs(B[a-jstart][b-istart]) - local_norm = numpy.array(local_norm, dtype ='f') - norm = numpy.array(0 , dtype ='f') + local_norm = numpy.array(local_norm, dtype ='d') + norm = numpy.array(0 , dtype ='d') comm.Reduce([local_norm, 1 , typ], [norm, 1, typ], op=MPI.SUM , root =0) if me == 0: diff --git 
a/PYTHON/stencil-numpy-shmem.py b/PYTHON/stencil-numpy-shmem.py index 49ed32d0f..236bd9ec2 100644 --- a/PYTHON/stencil-numpy-shmem.py +++ b/PYTHON/stencil-numpy-shmem.py @@ -94,7 +94,15 @@ def main(): sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") if len(sys.argv) > 3: - radius = int(sys.argv[3]) + pattern = sys.argv[3] + else: + pattern = 'star' + + if pattern != 'star': + sys.exit("ERROR: Only star pattern is supported") + + if len(sys.argv) > 4: + radius = int(sys.argv[4]) if radius < 1: sys.exit("ERROR: Stencil radius should be positive") if 2*radius+1 > n: @@ -106,16 +114,16 @@ def main(): print("Number of ranks = ", np) print("Number of iterations = ", iterations) print("Grid size = ", n) - print("Type of stencil = star") + print("Type of stencil = ", pattern) print("Radius of stencil = ", radius) print("Data type = float 64 (double precision in C)") - weight = numpy.zeros((2*radius+1, 2*radius+1), dtype='f') + weight = numpy.zeros((2*radius+1, 2*radius+1), dtype='d') - local_stencil_time = shmem.zeros(1, dtype='f') - stencil_time = shmem.zeros(1, dtype='f') - local_norm = shmem.zeros(1, dtype='f') - norm = shmem.zeros(1, dtype='f') + local_stencil_time = shmem.zeros(1, dtype='d') + stencil_time = shmem.zeros(1, dtype='d') + local_norm = shmem.zeros(1, dtype='d') + norm = shmem.zeros(1, dtype='d') iterflag = shmem.zeros(2, dtype='i') width = shmem.zeros(1, dtype='i') maxwidth = shmem.zeros(1, dtype='i') @@ -161,15 +169,13 @@ def main(): leftover = n%npy if mey < leftover: jstart = (height[0]+1) * mey - jstart = int(jstart) jend = jstart + height[0] + 1 else: jstart = (height[0]+1) * leftover + height[0] * (mey-leftover) - jstart = int(jstart) jend = jstart + height[0] height[0] = jend - jstart + 1 - if height == 0: + if height[0] == 0: print(f"ERROR: rank {me} has no work to do") shmem.global_exit(1) @@ -177,10 +183,10 @@ def main(): print(f"ERROR: rank {me} has work tile smaller then stencil radius") shmem.global_exit(1) - a = numpy.fromfunction(lambda i, j: i+istart+j+jstart, (height[0], width[0]), dtype=float) - A = numpy.zeros((height[0]+2*radius, width[0]+2*radius), dtype='f') + a = numpy.fromfunction(lambda i, j: i+istart+j+jstart, (height[0], width[0]), dtype='d') + A = numpy.zeros((height[0]+2*radius, width[0]+2*radius), dtype='d') A[radius:-radius, radius:-radius] = a - B = numpy.zeros((height[0], width[0]), dtype='f') + B = numpy.zeros((height[0], width[0]), dtype='d') shmem.barrier_all() shmem.max_reduce(maxwidth, width) @@ -194,25 +200,25 @@ def main(): weight[-ii+radius][0+radius] = -1.0/(2.0*ii*radius) # allocate communication buffers for halo values - top_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') - bottom_buf_out = shmem.zeros(radius*maxwidth[0], dtype='f') + top_buf_out = shmem.zeros(radius*maxwidth[0], dtype='d') + bottom_buf_out = shmem.zeros(radius*maxwidth[0], dtype='d') top_buf_in = {} bottom_buf_in = {} - top_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') - top_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') - bottom_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='f') - bottom_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='f') + top_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='d') + top_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='d') + bottom_buf_in[0] = shmem.zeros(radius*maxwidth[0], dtype='d') + bottom_buf_in[1] = shmem.zeros(radius*maxwidth[0], dtype='d') - right_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') - left_buf_out = shmem.zeros(radius*maxheight[0], dtype='f') + 
right_buf_out = shmem.zeros(radius*maxheight[0], dtype='d') + left_buf_out = shmem.zeros(radius*maxheight[0], dtype='d') right_buf_in = {} left_buf_in = {} - right_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') - right_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') - left_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='f') - left_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='f') + right_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='d') + right_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='d') + left_buf_in[0] = shmem.zeros(radius*maxheight[0], dtype='d') + left_buf_in[1] = shmem.zeros(radius*maxheight[0], dtype='d') shmem.barrier_all() @@ -281,13 +287,13 @@ def main(): if not splitfence: shmem.fence() - if mey < npy-1 and top_nbr is not None: + if mey < npy-1: shmem.atomic_inc(iterflag[sw:sw+1], top_nbr) - if mey > 0 and bottom_nbr is not None: + if mey > 0: shmem.atomic_inc(iterflag[sw:sw+1], bottom_nbr) - if mex < npx-1 and right_nbr is not None: + if mex < npx-1: shmem.atomic_inc(iterflag[sw:sw+1], right_nbr) - if mex > 0 and left_nbr is not None: + if mex > 0: shmem.atomic_inc(iterflag[sw:sw+1], left_nbr) shmem.wait_until(iterflag[sw:sw+1], shmem.CMP.EQ, count_case * (iter // 2 + 1)) @@ -337,7 +343,8 @@ def main(): B[j][i] += numpy.dot(weight[:,radius], A[j+radius, i:i+2*radius+1]) # add constant to solution to force refresh of neighbor data, if any - numpy.add(A[0:jend-radius+1, 0:iend-radius+1], 1) + A[radius:jend-jstart+radius,radius:iend-istart+radius] += 1.0 + # numpy.add(A[radius:jend-jstart+radius,radius:iend-istart+radius], 1.0, A[radius:jend-jstart+radius,radius:iend-istart+radius]) local_stencil_time[0] = time.monotonic() - local_stencil_time[0] shmem.barrier_all() diff --git a/PYTHON/transpose-numpy-mpi-p2p.py b/PYTHON/transpose-numpy-mpi-p2p.py index 82163b406..c78275396 100755 --- a/PYTHON/transpose-numpy-mpi-p2p.py +++ b/PYTHON/transpose-numpy-mpi-p2p.py @@ -127,7 +127,7 @@ def main(): sys.exit("ERROR: order must be >= 1") if order % np != 0: - sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + sys.exit(f"ERROR: matrix order {order} should be divisible by # procs {np}") block_order = int(order / np) @@ -140,7 +140,7 @@ def main(): # ** Allocate space for the input and transpose matrix # ******************************************************************** - A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype='d') B = numpy.zeros((order,block_order)) T = numpy.zeros((block_order,block_order)) @@ -159,7 +159,7 @@ def main(): lo = block_order * send_to hi = block_order * (send_to+1) comm.Sendrecv(sendbuf=A[lo:hi,:],dest=send_to,sendtag=phase,recvbuf=T,source=recv_from,recvtag=phase) - lo = block_order * recv_from + lo = block_order * recv_from hi = block_order * (recv_from+1) B[lo:hi,:] += T.T @@ -178,7 +178,7 @@ def main(): G = numpy.concatenate(F,axis=1) #if (me==0): # print(G) - H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype='d') abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) epsilon=1.e-8 diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py index d0413f52f..70af3ae82 100755 --- a/PYTHON/transpose-numpy-mpi.py +++ b/PYTHON/transpose-numpy-mpi.py @@ -127,7 
+127,7 @@ def main(): sys.exit("ERROR: order must be >= 1") if order % np != 0: - sys.exit("ERROR: matrix order ", order," should be divisible by # procs", np) + sys.exit(f"ERROR: matrix order {order} should be divisible by # procs {np}") block_order = int(order / np) @@ -140,7 +140,7 @@ def main(): # ** Allocate space for the input and transpose matrix # ******************************************************************** - A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + A = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype='d') B = numpy.zeros((order,block_order)) T = numpy.zeros((order,block_order)) @@ -177,7 +177,7 @@ def main(): G = numpy.concatenate(F,axis=1) #if (me==0): # print(G) - H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype='d') abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) epsilon=1.e-8 diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py index 935e5f8cc..6cbc2fe84 100755 --- a/PYTHON/transpose-numpy-shmem.py +++ b/PYTHON/transpose-numpy-shmem.py @@ -145,7 +145,7 @@ def main(): # ** Allocate space for the input and transpose matrix # ******************************************************************** - LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=float) + LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype='d') A = shmem.full((order,block_order),LA) B = shmem.zeros((order,block_order)) T = shmem.zeros((order,block_order)) @@ -189,7 +189,7 @@ def main(): G = numpy.concatenate(F,axis=1) #if (me==0): # print(G) - H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype=float) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype='d') abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) shmem.free(B) From 193da0945934989d2c6c4c284cdc4d3200679c60 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Jun 2023 10:15:37 +0300 Subject: [PATCH 290/325] error message only on me==0 --- PYTHON/nstream-numpy-shmem.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/PYTHON/nstream-numpy-shmem.py b/PYTHON/nstream-numpy-shmem.py index 3b42f0488..3be4ace86 100755 --- a/PYTHON/nstream-numpy-shmem.py +++ b/PYTHON/nstream-numpy-shmem.py @@ -86,16 +86,22 @@ def main(): print('Python SHMEM/Numpy STREAM triad: A = B + scalar * C') if len(sys.argv) != 3: - print('argument count = ', len(sys.argv)) - sys.exit("Usage: python nstream.py <# iterations> ") + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: python nstream.py <# iterations> ") + sys.exit() iterations = int(sys.argv[1]) if iterations < 1: - sys.exit("ERROR: iterations must be >= 1") + if (me==0): + print("ERROR: iterations must be >= 1") + sys.exit() total_length = int(sys.argv[2]) if total_length < 1: - sys.exit("ERROR: length must be positive") + if (me==0): + print("ERROR: length must be positive") + sys.exit() length = int(total_length / np) remainder = total_length % np From 82df4ec9d6ac15b482486f73e49e60548d596a2b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Jun 2023 10:21:51 +0300 Subject: [PATCH 291/325] error message only on me==0 --- 
PYTHON/transpose-numpy-shmem.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py index 6cbc2fe84..88614aa28 100755 --- a/PYTHON/transpose-numpy-shmem.py +++ b/PYTHON/transpose-numpy-shmem.py @@ -118,19 +118,27 @@ def main(): print('Python SHMEM/Numpy Matrix transpose: B = A^T') if len(sys.argv) != 3: - print('argument count = ', len(sys.argv)) - sys.exit("Usage: ./transpose <# iterations> ") + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: ./transpose <# iterations> ") + sys.exit() iterations = int(sys.argv[1]) if iterations < 1: - sys.exit("ERROR: iterations must be >= 1") + if (me==0): + print("ERROR: iterations must be >= 1") + sys.exit() order = int(sys.argv[2]) if order < 1: - sys.exit("ERROR: order must be >= 1") + if (me==0): + print("ERROR: order must be >= 1") + sys.exit() if order % np != 0: - sys.exit(f"ERROR: matrix order ({order}) should be divisible by # procs ({np})") + if (me==0): + print(f"ERROR: matrix order ({order}) should be divisible by # procs ({np})") + sys.exit() block_order = int(order / np) From b5337bc80b6105b147fd1d6cfd8b14a53aef2d48 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Jun 2023 10:29:05 +0300 Subject: [PATCH 292/325] error message only on me==0 --- PYTHON/stencil-numpy-shmem.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) mode change 100644 => 100755 PYTHON/stencil-numpy-shmem.py diff --git a/PYTHON/stencil-numpy-shmem.py b/PYTHON/stencil-numpy-shmem.py old mode 100644 new mode 100755 index 236bd9ec2..92416fb02 --- a/PYTHON/stencil-numpy-shmem.py +++ b/PYTHON/stencil-numpy-shmem.py @@ -81,17 +81,23 @@ def main(): print("Python SHMEM/Numpy Stencil execution on 2D grid") if len(sys.argv) < 3 or len(sys.argv) > 5: - print(f"argument count = {len(sys.argv)}") - sys.exit("Usage: ... <# iterations> []") + if (me==0): + print(f"argument count = {len(sys.argv)}") + print("Usage: ... 
<# iterations> []") + sys.exit() iterations = int(sys.argv[1]) if iterations < 1: - sys.exit("ERROR: iterations must be >= 1") + if (me==0): + print("ERROR: iterations must be >= 1") + sys.exit() n = int(sys.argv[2]) nsquare = n * n if nsquare < np: - sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") + if (me==0): + print(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") + sys.exit() if len(sys.argv) > 3: pattern = sys.argv[3] @@ -99,14 +105,20 @@ def main(): pattern = 'star' if pattern != 'star': - sys.exit("ERROR: Only star pattern is supported") + if (me==0): + print("ERROR: Only star pattern is supported") + sys.exit() if len(sys.argv) > 4: radius = int(sys.argv[4]) if radius < 1: - sys.exit("ERROR: Stencil radius should be positive") + if (me==0): + print("ERROR: Stencil radius should be positive") + sys.exit() if 2*radius+1 > n: - sys.exit("ERROR: Stencil radius exceeds grid size") + if (me==0): + print("ERROR: Stencil radius exceeds grid size") + sys.exit() else: radius = 2 From 1e596203f6cf22de56483779446cb81ba7aa7c03 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 9 Jul 2023 14:56:36 +0300 Subject: [PATCH 293/325] more me==0 stuff --- PYTHON/nstream-numpy-mpi.py | 11 +++++++++-- PYTHON/stencil-numba-shmem.py | 26 +++++++++++++++++++------- 2 files changed, 28 insertions(+), 9 deletions(-) mode change 100644 => 100755 PYTHON/stencil-numba-shmem.py diff --git a/PYTHON/nstream-numpy-mpi.py b/PYTHON/nstream-numpy-mpi.py index d2e35ec09..5cef1382f 100755 --- a/PYTHON/nstream-numpy-mpi.py +++ b/PYTHON/nstream-numpy-mpi.py @@ -69,6 +69,9 @@ def main(): + # MPI4PY_RC_INITIALIZE=0 is required for this + #MPI.Init_thread(required=MPI.THREAD_SINGLE) + comm = MPI.COMM_WORLD me = comm.Get_rank() np = comm.Get_size() @@ -82,8 +85,10 @@ def main(): print('Python MPI/Numpy STREAM triad: A = B + scalar * C') if len(sys.argv) != 3: - print('argument count = ', len(sys.argv)) - sys.exit("Usage: python nstream.py <# iterations> ") + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: python nstream.py <# iterations> ") + sys.exit() iterations = int(sys.argv[1]) if iterations < 1: @@ -161,6 +166,8 @@ def main(): nbytes = 4.0 * total_length * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + # MPI4PY_RC_INITIALIZE=0 is required for this + #MPI.Finalize() if __name__ == '__main__': main() diff --git a/PYTHON/stencil-numba-shmem.py b/PYTHON/stencil-numba-shmem.py old mode 100644 new mode 100755 index 8e3ca6765..736aa8ee1 --- a/PYTHON/stencil-numba-shmem.py +++ b/PYTHON/stencil-numba-shmem.py @@ -93,17 +93,23 @@ def main(): print("Python SHMEM/Numba Stencil execution on 2D grid") if len(sys.argv) < 3 or len(sys.argv) > 5: - print(f"argument count = {len(sys.argv)}") - sys.exit("Usage: ... <# iterations> []") + if (me==0): + print(f"argument count = {len(sys.argv)}") + print("Usage: ... 
<# iterations> []") + sys.exit() iterations = int(sys.argv[1]) if iterations < 1: - sys.exit("ERROR: iterations must be >= 1") + if (me==0): + print("ERROR: iterations must be >= 1") + sys.exit() n = int(sys.argv[2]) nsquare = n * n if nsquare < np: - sys.exit(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") + if (me==0): + print(f"ERROR: grid size {nsquare} must be at least # ranks: {np}") + sys.exit() if len(sys.argv) > 3: pattern = sys.argv[3] @@ -111,14 +117,20 @@ def main(): pattern = 'star' if pattern != 'star': - sys.exit("ERROR: Only star pattern is supported") + if (me==0): + print("ERROR: Only star pattern is supported") + sys.exit() if len(sys.argv) > 4: radius = int(sys.argv[4]) if radius < 1: - sys.exit("ERROR: Stencil radius should be positive") + if (me==0): + print("ERROR: Stencil radius should be positive") + sys.exit() if 2*radius+1 > n: - sys.exit("ERROR: Stencil radius exceeds grid size") + if (me==0): + print("ERROR: Stencil radius exceeds grid size") + sys.exit() else: radius = 2 From 837ffedd24b99cf35b7bed69c939e3c1e53700a5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 17 Jul 2023 11:47:26 +0300 Subject: [PATCH 294/325] remove exec perms when launcher should be used --- PYTHON/nstream-mpi.py | 0 PYTHON/nstream-numpy-mpi.py | 0 PYTHON/nstream-numpy-shmem.py | 0 PYTHON/stencil-numba-mpi.py | 0 PYTHON/stencil-numba-shmem.py | 0 PYTHON/stencil-numpy-mpi.py | 0 PYTHON/stencil-numpy-shmem.py | 0 PYTHON/transpose-numpy-mpi-p2p.py | 0 PYTHON/transpose-numpy-mpi-rma.py | 0 PYTHON/transpose-numpy-mpi.py | 0 PYTHON/transpose-numpy-shmem.py | 0 11 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 PYTHON/nstream-mpi.py mode change 100755 => 100644 PYTHON/nstream-numpy-mpi.py mode change 100755 => 100644 PYTHON/nstream-numpy-shmem.py mode change 100755 => 100644 PYTHON/stencil-numba-mpi.py mode change 100755 => 100644 PYTHON/stencil-numba-shmem.py mode change 100755 => 100644 PYTHON/stencil-numpy-mpi.py mode change 100755 => 100644 PYTHON/stencil-numpy-shmem.py mode change 100755 => 100644 PYTHON/transpose-numpy-mpi-p2p.py mode change 100755 => 100644 PYTHON/transpose-numpy-mpi-rma.py mode change 100755 => 100644 PYTHON/transpose-numpy-mpi.py mode change 100755 => 100644 PYTHON/transpose-numpy-shmem.py diff --git a/PYTHON/nstream-mpi.py b/PYTHON/nstream-mpi.py old mode 100755 new mode 100644 diff --git a/PYTHON/nstream-numpy-mpi.py b/PYTHON/nstream-numpy-mpi.py old mode 100755 new mode 100644 diff --git a/PYTHON/nstream-numpy-shmem.py b/PYTHON/nstream-numpy-shmem.py old mode 100755 new mode 100644 diff --git a/PYTHON/stencil-numba-mpi.py b/PYTHON/stencil-numba-mpi.py old mode 100755 new mode 100644 diff --git a/PYTHON/stencil-numba-shmem.py b/PYTHON/stencil-numba-shmem.py old mode 100755 new mode 100644 diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py old mode 100755 new mode 100644 diff --git a/PYTHON/stencil-numpy-shmem.py b/PYTHON/stencil-numpy-shmem.py old mode 100755 new mode 100644 diff --git a/PYTHON/transpose-numpy-mpi-p2p.py b/PYTHON/transpose-numpy-mpi-p2p.py old mode 100755 new mode 100644 diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py old mode 100755 new mode 100644 diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py old mode 100755 new mode 100644 diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py old mode 100755 new mode 100644 From c562f7eb08c959ebdc9a0311c2186883b0ce4548 Mon Sep 17 00:00:00 2001 From: Jeff 
Hammond Date: Mon, 17 Jul 2023 11:48:46 +0300 Subject: [PATCH 295/325] nevermind - make all py exec --- PYTHON/nstream-mpi.py | 0 PYTHON/nstream-numpy-mpi.py | 0 PYTHON/nstream-numpy-shmem.py | 0 PYTHON/p2p-numba-mpi.py | 0 PYTHON/p2p-numba-shmem.py | 0 PYTHON/p2p-numpy-mpi.py | 0 PYTHON/p2p-numpy-shmem.py | 0 PYTHON/stencil-numba-mpi.py | 0 PYTHON/stencil-numba-shmem.py | 0 PYTHON/stencil-numpy-mpi.py | 0 PYTHON/stencil-numpy-shmem.py | 0 PYTHON/transpose-numpy-mpi-p2p.py | 0 PYTHON/transpose-numpy-mpi-rma.py | 0 PYTHON/transpose-numpy-mpi.py | 0 PYTHON/transpose-numpy-shmem.py | 0 15 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 PYTHON/nstream-mpi.py mode change 100644 => 100755 PYTHON/nstream-numpy-mpi.py mode change 100644 => 100755 PYTHON/nstream-numpy-shmem.py mode change 100644 => 100755 PYTHON/p2p-numba-mpi.py mode change 100644 => 100755 PYTHON/p2p-numba-shmem.py mode change 100644 => 100755 PYTHON/p2p-numpy-mpi.py mode change 100644 => 100755 PYTHON/p2p-numpy-shmem.py mode change 100644 => 100755 PYTHON/stencil-numba-mpi.py mode change 100644 => 100755 PYTHON/stencil-numba-shmem.py mode change 100644 => 100755 PYTHON/stencil-numpy-mpi.py mode change 100644 => 100755 PYTHON/stencil-numpy-shmem.py mode change 100644 => 100755 PYTHON/transpose-numpy-mpi-p2p.py mode change 100644 => 100755 PYTHON/transpose-numpy-mpi-rma.py mode change 100644 => 100755 PYTHON/transpose-numpy-mpi.py mode change 100644 => 100755 PYTHON/transpose-numpy-shmem.py diff --git a/PYTHON/nstream-mpi.py b/PYTHON/nstream-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/nstream-numpy-mpi.py b/PYTHON/nstream-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/nstream-numpy-shmem.py b/PYTHON/nstream-numpy-shmem.py old mode 100644 new mode 100755 diff --git a/PYTHON/p2p-numba-mpi.py b/PYTHON/p2p-numba-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/p2p-numba-shmem.py b/PYTHON/p2p-numba-shmem.py old mode 100644 new mode 100755 diff --git a/PYTHON/p2p-numpy-mpi.py b/PYTHON/p2p-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/p2p-numpy-shmem.py b/PYTHON/p2p-numpy-shmem.py old mode 100644 new mode 100755 diff --git a/PYTHON/stencil-numba-mpi.py b/PYTHON/stencil-numba-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/stencil-numba-shmem.py b/PYTHON/stencil-numba-shmem.py old mode 100644 new mode 100755 diff --git a/PYTHON/stencil-numpy-mpi.py b/PYTHON/stencil-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/stencil-numpy-shmem.py b/PYTHON/stencil-numpy-shmem.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-mpi-p2p.py b/PYTHON/transpose-numpy-mpi-p2p.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-mpi-rma.py b/PYTHON/transpose-numpy-mpi-rma.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-mpi.py b/PYTHON/transpose-numpy-mpi.py old mode 100644 new mode 100755 diff --git a/PYTHON/transpose-numpy-shmem.py b/PYTHON/transpose-numpy-shmem.py old mode 100644 new mode 100755 From ed650b98c5d1df99150f4b588c2bc72e02200afc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 17 Jul 2023 12:15:18 +0300 Subject: [PATCH 296/325] does not work yet --- PYTHON/transpose-numpy-shmem-rma.py | 234 ++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100755 PYTHON/transpose-numpy-shmem-rma.py diff --git a/PYTHON/transpose-numpy-shmem-rma.py b/PYTHON/transpose-numpy-shmem-rma.py new file mode 100755 index 000000000..065c3b11e --- /dev/null +++ 
b/PYTHON/transpose-numpy-shmem-rma.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. +# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. 
+# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: ./transpose <# iterations> ") + sys.exit() + + iterations = int(sys.argv[1]) + if iterations < 1: + if (me==0): + print("ERROR: iterations must be >= 1") + sys.exit() + + order = int(sys.argv[2]) + if order < 1: + if (me==0): + print("ERROR: order must be >= 1") + sys.exit() + + if order % np != 0: + if (me==0): + print(f"ERROR: matrix order ({order}) should be divisible by # procs ({np})") + sys.exit() + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + #LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype='d') + #A = shmem.full((order,block_order),LA) + A = shmem.zeros((order,block_order)) + B = shmem.zeros((order,block_order)) + T = shmem.zeros((block_order,block_order)) + + TA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=numpy.double) + A[:,:] = TA[:,:] + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + # barrier required before alltoall for correctness + #shmem.barrier_all() + #shmem.alltoall(T, A) + #for r in range(0,np): + # lo = block_order * r + # hi = block_order * (r+1) + # #B[lo:hi,:] += numpy.transpose(T[lo:hi,:]) + # B[lo:hi,:] += T[lo:hi,:].T + + for phase in range(0,np): + recv_from = (me + phase) % np + bsize = block_order * block_order + #WA.Get(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) + shmem.get(T, A[bsize * me : bsize * (me+1),:], recv_from) + + lo = block_order * recv_from + hi = block_order * (recv_from+1) + B[lo:hi,:] += T.T + + + + shmem.barrier_all() + + A += 1.0 + shmem.barrier_all() + + shmem.barrier_all() + t1 = timer() + trans_time = t1 - t0 + + shmem.free(A) + shmem.free(T) + + # ******************************************************************** + # ** Analyze and output results. 
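The phase loop above is the heart of this kernel: in each phase the calling PE pulls its own block of A out of PE recv_from's symmetric copy, then adds the transpose of that block into the rows of B that belong to recv_from. The version above indexes the source rows with bsize = block_order*block_order, even though A has only `order` rows, which appears to be what patch 298 ("working") below corrects: the source rows must be block_order*me through block_order*(me+1). A trimmed sketch of the corrected exchange, with A of shape (order, block_order), T of shape (block_order, block_order), and B accumulating the result:

    # one pass over all PEs; A, B, T are symmetric shmem arrays
    for phase in range(np):
        recv_from = (me + phase) % np
        # rows block_order*me .. block_order*(me+1) of recv_from's A hold
        # the block destined for this PE
        shmem.get(target=T,
                  source=A[block_order*me : block_order*(me+1), :],
                  pe=recv_from)
        lo = block_order * recv_from
        hi = block_order * (recv_from + 1)
        B[lo:hi, :] += T.T
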
+ # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = shmem.zeros((np,order,block_order)) + shmem.fcollect(F,B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype='d') + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + shmem.free(B) + shmem.free(F) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() From c3f880a6af86aa29861238933be3a5e8728bf2b1 Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Mon, 17 Jul 2023 11:32:05 +0200 Subject: [PATCH 297/325] does not work yet either --- MPI1/Transpose/transpose-a2a.c | 354 +++++++++++++++++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 MPI1/Transpose/transpose-a2a.c diff --git a/MPI1/Transpose/transpose-a2a.c b/MPI1/Transpose/transpose-a2a.c new file mode 100644 index 000000000..eba905402 --- /dev/null +++ b/MPI1/Transpose/transpose-a2a.c @@ -0,0 +1,354 @@ +/* +Copyright (c) 2013, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. +* Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +/******************************************************************* + +NAME: transpose + +PURPOSE: This program tests the efficiency with which a square matrix + can be transposed and stored in another matrix. The matrices + are distributed identically. + +USAGE: Program inputs are the matrix order, the number of times to + repeat the operation, and the communication mode + + transpose <# iterations> [tile size] + + An optional parameter specifies the tile size used to divide the + individual matrix blocks for improved cache and TLB performance. 
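To make the tile-size remark concrete: the local transpose necessarily walks one of the two arrays with a large stride, so processing the block in square tiles keeps each tile cache- and TLB-resident while it is reused. A small self-contained illustration in numpy, mirroring the Python ports in this series rather than this C file; bo and tile are illustrative sizes:

    import numpy

    bo, tile = 64, 16                      # block order and tile size (illustrative)
    T = numpy.arange(bo*bo, dtype='d').reshape(bo, bo)
    B = numpy.zeros((bo, bo))

    # accumulate the transpose tile by tile; slices past the end are clipped
    # automatically, so ragged edges need no special casing
    for it in range(0, bo, tile):
        for jt in range(0, bo, tile):
            B[it:it+tile, jt:jt+tile] += T[jt:jt+tile, it:it+tile].T

    assert (B == T.T).all()
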
+ + The output consists of diagnostics to make sure the + transpose worked and timing statistics. + +FUNCTIONS CALLED: + + Other than MPI or standard C functions, the following + functions are used in this program: + + wtime() Portable wall-timer interface. + bail_out() Determine global error and exit if nonzero. + +HISTORY: Written by Tim Mattson, April 1999. + Updated by Rob Van der Wijngaart, December 2005. + Updated by Rob Van der Wijngaart, October 2006. + Updated by Rob Van der Wijngaart, November 2014:: + - made variable names more consistent + - put timing around entire iterative loop of transposes + - fixed incorrect matrix block access; no separate function + for local transpose of matrix block + - reordered initialization and verification loops to + produce unit stride + - changed initialization values, such that the input matrix + elements are: A(i,j) = i+order*j + + +*******************************************************************/ + +/****************************************************************** + Layout nomenclature + ------------------- + +o Each rank owns one block of columns (Colblock) of the overall + matrix to be transposed, as well as of the transposed matrix. +o Colblock is stored contiguously in the memory of the rank. + The stored format is column major, which means that matrix + elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) + are "order" words apart +o Colblock is logically composed of #ranks Blocks, but a Block is + not stored contiguously in memory. Conceptually, the Block is + the unit of data that gets communicated between ranks. Block i of + rank j is locally transposed and gathered into a buffer called Work, + which is sent to rank i, where it is scattered into Block j of the + transposed matrix. 
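The property this layout relies on can be checked directly: because B = A^T globally, Block i of rank j's column block of A, once transposed, is exactly Block j of rank i's column block of B, which is why the locally transposed block can be sent to rank i and dropped into place with no further rearrangement. A short numpy check of that identity, with the ranks simulated locally; np and bo are illustrative sizes:

    import numpy

    np, bo = 4, 3                          # "ranks" and block order (illustrative)
    order = np * bo
    M = numpy.arange(order*order, dtype='d').reshape(order, order)

    Acol = [M[:,   r*bo:(r+1)*bo] for r in range(np)]   # rank r's column block of A
    Bcol = [M.T[:, r*bo:(r+1)*bo] for r in range(np)]   # rank r's column block of B = A^T

    for i in range(np):
        for j in range(np):
            # Block i of rank j, transposed, equals Block j of rank i
            assert (Acol[j][i*bo:(i+1)*bo, :].T == Bcol[i][j*bo:(j+1)*bo, :]).all()
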
+o The original and transposed matrices are called A and B + + ----------------------------------------------------------------- +| | | | | +| Colblock | | | | +| | | | | +| | | | | +| | | | | +| ------------------------------- | +| | | | | +| | Block | | | +| | | | | +| | | | | +| | | | | +| ------------------------------- | +| | | | | +| | | | Overall Matrix | +| | | | | +| | | | | +| | | | | +| ------------------------------- | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | + -----------------------------------------------------------------*/ + +#include +#include + +#define A(i,j) A_p[(i+istart)+order*(j)] +#define B(i,j) B_p[(i+istart)+order*(j)] +#define T(i,j) T_p[(i+istart)+order*(j)] + +int main(int argc, char ** argv) +{ + long Block_order; /* number of columns owned by rank */ + long Colblock_size; /* size of column block */ + int Num_procs; /* number of ranks */ + long order; /* order of overall matrix */ + long bytes; /* combined size of matrices */ + int my_ID; /* rank */ + int root=0; /* rank of root */ + int iterations; /* number of times to do the transpose */ + int i, j, istart; /* dummies */ + int iter; /* index of iteration */ + int phase; /* phase inside staged communication */ + int colstart; /* starting column for owning rank */ + int error; /* error flag */ + double * RESTRICT A_p; /* original matrix column block */ + double * RESTRICT B_p; /* transposed matrix column block */ + double * RESTRICT T_p; /* original matrix column block */ + double abserr, /* absolute error */ + abserr_tot; /* aggregate absolute error */ + double epsilon = 1.e-8; /* error tolerance */ + double local_trans_time, /* timing parameters */ + trans_time, + avgtime; + +/********************************************************************* +** Initialize the MPI environment +*********************************************************************/ + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); + MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); + +/********************************************************************* +** process, test and broadcast input parameters +*********************************************************************/ + error = 0; + if (my_ID == root) + { + printf("Parallel Research Kernels version %s\n", PRKVERSION); + printf("MPI matrix transpose: B = A^T\n"); + + if (argc != 3) + { + printf("Usage: %s <# iterations> \n", *argv); + error = 1; goto ENDOFTESTS; + } + + iterations = atoi(*++argv); + if(iterations < 1) + { + printf("ERROR: iterations must be >= 1 : %d \n",iterations); + error = 1; goto ENDOFTESTS; + } + + order = atol(*++argv); + if (order < Num_procs) + { + printf("ERROR: matrix order %ld should at least # procs %d\n", + order, Num_procs); + error = 1; goto ENDOFTESTS; + } + + if (order%Num_procs) + { + printf("ERROR: matrix order %ld should be divisible by # procs %d\n", + order, Num_procs); + error = 1; goto ENDOFTESTS; + } + + ENDOFTESTS:; + } + bail_out(error); + + if (my_ID == root) + { + printf("Number of ranks = %d\n", Num_procs); + printf("Matrix order = %ld\n", order); + printf("Number of iterations = %d\n", iterations); + printf("Blocking messages\n"); + } + + /* Broadcast input data to all ranks */ + MPI_Bcast(&order, 1, MPI_LONG, root, MPI_COMM_WORLD); + MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); + + /* a non-positive tile size means no tiling of the local transpose */ + bytes = 2 * sizeof(double) * order * order; + +/********************************************************************* +** The matrix is broken up 
into column blocks that are mapped one to a +** rank. Each column block is made up of Num_procs smaller square +** blocks of order block_order. +*********************************************************************/ + + Block_order = order/Num_procs; + colstart = Block_order * my_ID; + Colblock_size = order * Block_order; + +/********************************************************************* +** Create the column block of the test matrix, the row block of the +** transposed matrix, and workspace (workspace only if #procs>1) +*********************************************************************/ + A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); + T_p = (double *)prk_malloc(Colblock_size*sizeof(double)); + if (A_p == NULL) + { + printf(" Error allocating space for original matrix on node %d\n",my_ID); + error = 1; + } + bail_out(error); + + B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); + if (B_p == NULL) + { + printf(" Error allocating space for transpose matrix on node %d\n",my_ID); + error = 1; + } + bail_out(error); + + /* Fill the original column matrix */ + istart = 0; + for (j=0;j Date: Mon, 17 Jul 2023 12:58:17 +0300 Subject: [PATCH 298/325] working --- PYTHON/transpose-numpy-shmem-rma.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/PYTHON/transpose-numpy-shmem-rma.py b/PYTHON/transpose-numpy-shmem-rma.py index 065c3b11e..172d69c62 100755 --- a/PYTHON/transpose-numpy-shmem-rma.py +++ b/PYTHON/transpose-numpy-shmem-rma.py @@ -168,33 +168,19 @@ def main(): shmem.barrier_all() t0 = timer() - # barrier required before alltoall for correctness - #shmem.barrier_all() - #shmem.alltoall(T, A) - #for r in range(0,np): - # lo = block_order * r - # hi = block_order * (r+1) - # #B[lo:hi,:] += numpy.transpose(T[lo:hi,:]) - # B[lo:hi,:] += T[lo:hi,:].T - for phase in range(0,np): recv_from = (me + phase) % np - bsize = block_order * block_order - #WA.Get(T, recv_from, [bsize * me, bsize, MPI.DOUBLE]) - shmem.get(T, A[bsize * me : bsize * (me+1),:], recv_from) + shmem.get(target=T, source=A[block_order * me : block_order * (me+1),:], pe=recv_from) lo = block_order * recv_from hi = block_order * (recv_from+1) B[lo:hi,:] += T.T - - shmem.barrier_all() A += 1.0 shmem.barrier_all() - shmem.barrier_all() t1 = timer() trans_time = t1 - t0 From 2445a179e485522dec00ced8772baf9654b2e01e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 17 Jul 2023 14:52:00 +0300 Subject: [PATCH 299/325] add MPI C1z transpose --- C1z/Makefile | 2 +- C1z/transpose-mpi.c | 196 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 C1z/transpose-mpi.c diff --git a/C1z/Makefile b/C1z/Makefile index f719a9096..f4f705fe5 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -60,7 +60,7 @@ openmp: nstream-openmp \ tasks: p2p-tasks-openmp p2p-tasks-2d-openmp -mpi: nstream-mpi +mpi: nstream-mpi transpose-mpi petsc: nstream-petsc transpose-petsc diff --git a/C1z/transpose-mpi.c b/C1z/transpose-mpi.c new file mode 100644 index 000000000..86221ff60 --- /dev/null +++ b/C1z/transpose-mpi.c @@ -0,0 +1,196 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// C11-ification by Jeff Hammond, June 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_openmp.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %d\n", PRKVERSION ); +#ifdef _OPENMP + printf("C11/OpenMP Matrix transpose: B = A^T\n"); +#else + printf("C11/Serial Matrix transpose: B = A^T\n"); +#endif + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> [tile size]\n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // order of a the matrix + int order = atoi(argv[2]); + if (order <= 0) { + printf("ERROR: Matrix Order must be greater than 0\n"); + return 1; + } + + // default tile size for tiling of local transpose + int tile_size = (argc>3) ? 
atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + +#ifdef _OPENMP + printf("Number of threads (max) = %d\n", omp_get_max_threads()); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Matrix order = %d\n", order); + printf("Tile size = %d\n", tile_size); + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time = 0.0; + + size_t bytes = order*order*sizeof(double); + double * restrict A = prk_malloc(bytes); + double * restrict B = prk_malloc(bytes); + + OMP_PARALLEL() + { + OMP_FOR() + for (int i=0;i Date: Mon, 17 Jul 2023 17:06:52 +0300 Subject: [PATCH 300/325] works, but is not optimal --- C1z/transpose-mpi.c | 189 ++++++++++++++++++++++++++++---------------- 1 file changed, 121 insertions(+), 68 deletions(-) diff --git a/C1z/transpose-mpi.c b/C1z/transpose-mpi.c index 86221ff60..1df12d84f 100644 --- a/C1z/transpose-mpi.c +++ b/C1z/transpose-mpi.c @@ -1,5 +1,6 @@ /// /// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2023, NVIDIA /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -54,142 +55,194 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" -#include "prk_openmp.h" +#include int main(int argc, char * argv[]) { - printf("Parallel Research Kernels version %d\n", PRKVERSION ); -#ifdef _OPENMP - printf("C11/OpenMP Matrix transpose: B = A^T\n"); -#else - printf("C11/Serial Matrix transpose: B = A^T\n"); -#endif + const int requested = MPI_THREAD_SERIALIZED; + int provided; + MPI_Init_thread(&argc, &argv, requested, &provided); + if (provided < requested) MPI_Abort(MPI_COMM_WORLD,provided); + + int me, np; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + MPI_Comm_size(MPI_COMM_WORLD, &np); + + if (me==0) { + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/MPI Matrix transpose: B = A^T\n"); + } ////////////////////////////////////////////////////////////////////// /// Read and test input parameters ////////////////////////////////////////////////////////////////////// if (argc < 3) { - printf("Usage: <# iterations> [tile size]\n"); + if (me==0) printf("Usage: <# iterations> [tile size]\n"); + MPI_Finalize(); return 1; } // number of times to do the transpose int iterations = atoi(argv[1]); if (iterations < 1) { - printf("ERROR: iterations must be >= 1\n"); + if (me==0) printf("ERROR: iterations must be >= 1\n"); + MPI_Finalize(); return 1; } // order of a the matrix int order = atoi(argv[2]); if (order <= 0) { - printf("ERROR: Matrix Order must be greater than 0\n"); + if (me==0) printf("ERROR: Matrix Order must be greater than 0\n"); + MPI_Finalize(); + return 1; + } + else if (order % np != 0) { + if (me==0) printf("ERROR: Matrix Order %d must be evenly divisible by np=%d\n", order, np); + MPI_Finalize(); + return 1; + } + + const int block_order = order / np; + if (block_order > floor(sqrt(INT_MAX))) { + if (me==0) printf("ERROR: block_order too large - overflow risk\n"); + MPI_Finalize(); return 1; } + const int bo2 = block_order * block_order; // default tile size for tiling of local transpose int tile_size = (argc>3) ? 
atoi(argv[3]) : 32; // a negative tile size means no tiling of the local transpose if (tile_size <= 0) tile_size = order; -#ifdef _OPENMP - printf("Number of threads (max) = %d\n", omp_get_max_threads()); -#endif - printf("Number of iterations = %d\n", iterations); - printf("Matrix order = %d\n", order); - printf("Tile size = %d\n", tile_size); + if (me==0) { + printf("Number of processes = %d\n", np); + printf("Number of iterations = %d\n", iterations); + printf("Matrix order = %d\n", order); + printf("Tile size = %d\n", tile_size); + } + fflush(stdout); + MPI_Barrier(MPI_COMM_WORLD); ////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// - double trans_time = 0.0; - - size_t bytes = order*order*sizeof(double); - double * restrict A = prk_malloc(bytes); - double * restrict B = prk_malloc(bytes); + const size_t bytes = (size_t)order * (size_t)block_order * sizeof(double); + double (* const restrict A)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict X)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); + double (* const restrict Y)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); + if (A == NULL || B == NULL || T == NULL) { + printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); + MPI_Abort(MPI_COMM_WORLD,99); + } - OMP_PARALLEL() - { - OMP_FOR() - for (int i=0;i Date: Mon, 17 Jul 2023 17:33:06 +0300 Subject: [PATCH 301/325] half as dumb as before --- C1z/transpose-mpi.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/C1z/transpose-mpi.c b/C1z/transpose-mpi.c index 1df12d84f..a68a4ebbb 100644 --- a/C1z/transpose-mpi.c +++ b/C1z/transpose-mpi.c @@ -135,7 +135,6 @@ int main(int argc, char * argv[]) double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict X)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); - double (* const restrict Y)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); if (A == NULL || B == NULL || T == NULL) { printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); MPI_Abort(MPI_COMM_WORLD,99); @@ -175,12 +174,7 @@ int main(int argc, char * argv[]) } for (int i=0; i Date: Mon, 17 Jul 2023 17:39:56 +0300 Subject: [PATCH 302/325] half as dumb as before, meaning no longer dumb --- C1z/transpose-mpi.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/C1z/transpose-mpi.c b/C1z/transpose-mpi.c index a68a4ebbb..86eed7fa6 100644 --- a/C1z/transpose-mpi.c +++ b/C1z/transpose-mpi.c @@ -134,7 +134,6 @@ int main(int argc, char * argv[]) double (* const restrict A)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); - double (* const restrict X)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); if (A == NULL || B == NULL || T == NULL) { printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); MPI_Abort(MPI_COMM_WORLD,99); @@ -163,24 +162,11 @@ int main(int argc, char * argv[]) 
MPI_COMM_WORLD); for (int r=0; r Date: Mon, 17 Jul 2023 17:22:25 +0200 Subject: [PATCH 303/325] add transpose numpy shmem rma with get --- PYTHON/transpose-numpy-shmem-rma-get.py | 236 ++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 PYTHON/transpose-numpy-shmem-rma-get.py diff --git a/PYTHON/transpose-numpy-shmem-rma-get.py b/PYTHON/transpose-numpy-shmem-rma-get.py new file mode 100644 index 000000000..e2f452b66 --- /dev/null +++ b/PYTHON/transpose-numpy-shmem-rma-get.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: transpose +# +# PURPOSE: This program measures the time for the transpose of a +# column-major stored matrix into a row-major stored matrix. +# +# USAGE: Program input is the matrix order and the number of times to +# repeat the operation: +# +# transpose <# iterations> +# +# The output consists of diagnostics to make sure the +# transpose worked and timing statistics. +# +# HISTORY: Written by Rob Van der Wijngaart, February 2009. +# Converted to Python by Jeff Hammond, February 2016. +# +# ******************************************************************* + +# Layout nomenclature +# ------------------- +# +# - Each rank owns one block of columns (Colblock) of the overall +# matrix to be transposed, as well as of the transposed matrix. +# - Colblock is stored contiguously in the memory of the rank. +# The stored format is column major, which means that matrix +# elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) +# are "order" words apart +# - Colblock is logically composed of #ranks Blocks, but a Block is +# not stored contiguously in memory. Conceptually, the Block is +# the unit of data that gets communicated between ranks. Block i of +# rank j is locally transposed and gathered into a buffer called Work, +# which is sent to rank i, where it is scattered into Block j of the +# transposed matrix. 
+# - When tiling is applied to reduce TLB misses, each block gets +# accessed by tiles. +# - The original and transposed matrices are called A and B +# +# +-----------------------------------------------------------------+ +# | | | | | +# | Colblock | | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | Block | | | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | Overall Matrix | +# | | | | | +# | | | | | +# | | | | | +# | ------------------------------- | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# | | | | | +# +-----------------------------------------------------------------+ + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer +from shmem4py import shmem +import numpy +import time + +def main(): + + me = shmem.my_pe() + np = shmem.n_pes() + + # ******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python SHMEM/Numpy Matrix transpose: B = A^T') + + if len(sys.argv) != 3: + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: ./transpose <# iterations> ") + sys.exit() + + iterations = int(sys.argv[1]) + if iterations < 1: + if (me==0): + print("ERROR: iterations must be >= 1") + sys.exit() + + order = int(sys.argv[2]) + if order < 1: + if (me==0): + print("ERROR: order must be >= 1") + sys.exit() + + if order % np != 0: + if (me==0): + print(f"ERROR: matrix order ({order}) should be divisible by # procs ({np})") + sys.exit() + + block_order = int(order / np) + + if (me==0): + print('Number of ranks = ', np) + print('Number of iterations = ', iterations) + print('Matrix order = ', order) + + shmem.barrier_all() + + # ******************************************************************** + # ** Allocate space for the input and transpose matrix + # ******************************************************************** + + #LA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype='d') + #A = shmem.full((order,block_order),LA) + A = shmem.zeros((order,block_order)) + B = shmem.zeros((order,block_order)) + T = shmem.zeros((np,block_order,block_order)) + send_flag = shmem.ones(np, dtype='i') + recv_flag = shmem.zeros(np, dtype='i') + + TA = numpy.fromfunction(lambda i,j: me * block_order + i*order + j, (order,block_order), dtype=numpy.double) + A[:,:] = TA[:,:] + + for k in range(0,iterations+1): + + if k<1: + shmem.barrier_all() + t0 = timer() + + for phase in range(0,np): + recv_from = (me + phase) % np + send_to = (me - phase + np) % np + + lo = block_order * send_to + hi = block_order * (send_to+1) + + shmem.wait_until(send_flag[send_to:send_to+1], shmem.CMP.EQ, 1) + send_flag[send_to] = 0 + + shmem.put(T[phase], A[lo : hi,:], send_to) + shmem.fence() + + shmem.atomic_inc(recv_flag[phase:phase+1], send_to) + shmem.wait_until(recv_flag[phase:phase+1], shmem.CMP.EQ, k+1) + + lo = block_order * recv_from + hi = block_order * (recv_from+1) + B[lo:hi,:] += T[phase].T + + shmem.put(send_flag[me:me+1], numpy.array([1],dtype='i'), recv_from) + + + A += 1.0 + + t1 = timer() + shmem.barrier_all() + trans_time = t1 - t0 + + shmem.free(A) + shmem.free(T) + + # ******************************************************************** + # ** Analyze and 
output results. + # ******************************************************************** + + # allgather is non-scalable but was easier to debug + F = shmem.zeros((np,order,block_order)) + shmem.fcollect(F,B) + G = numpy.concatenate(F,axis=1) + #if (me==0): + # print(G) + H = numpy.fromfunction(lambda i,j: ((iterations/2.0)+(order*j+i))*(iterations+1.0), (order,order), dtype='d') + abserr = numpy.linalg.norm(numpy.reshape(G-H,order*order),ord=1) + + shmem.free(B) + shmem.free(F) + + epsilon=1.e-8 + nbytes = 2 * order**2 * 8 # 8 is not sizeof(double) in bytes, but allows for comparison to C etc. + if abserr < epsilon: + if (me==0): + print('Solution validates') + avgtime = trans_time/iterations + print('Rate (MB/s): ',1.e-6*nbytes/avgtime, ' Avg time (s): ', avgtime) + else: + if (me==0): + print('error ',abserr, ' exceeds threshold ',epsilon) + print("ERROR: solution did not validate") + + +if __name__ == '__main__': + main() From 1433c93857123b7e73033c517ca7e230aba85bcc Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Mon, 17 Jul 2023 20:18:17 +0200 Subject: [PATCH 304/325] alltoall works --- MPI1/Transpose/transpose-a2a.c | 74 +++++++++------------------------- 1 file changed, 19 insertions(+), 55 deletions(-) diff --git a/MPI1/Transpose/transpose-a2a.c b/MPI1/Transpose/transpose-a2a.c index eba905402..7b90a4cfc 100644 --- a/MPI1/Transpose/transpose-a2a.c +++ b/MPI1/Transpose/transpose-a2a.c @@ -120,10 +120,6 @@ o The original and transposed matrices are called A and B #include #include -#define A(i,j) A_p[(i+istart)+order*(j)] -#define B(i,j) B_p[(i+istart)+order*(j)] -#define T(i,j) T_p[(i+istart)+order*(j)] - int main(int argc, char ** argv) { long Block_order; /* number of columns owned by rank */ @@ -134,10 +130,9 @@ int main(int argc, char ** argv) int my_ID; /* rank */ int root=0; /* rank of root */ int iterations; /* number of times to do the transpose */ - int i, j, istart; /* dummies */ + int i, j; /* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ - int colstart; /* starting column for owning rank */ int error; /* error flag */ double * RESTRICT A_p; /* original matrix column block */ double * RESTRICT B_p; /* transposed matrix column block */ @@ -202,7 +197,7 @@ int main(int argc, char ** argv) printf("Number of ranks = %d\n", Num_procs); printf("Matrix order = %ld\n", order); printf("Number of iterations = %d\n", iterations); - printf("Blocking messages\n"); + printf("Alltoall\n"); } /* Broadcast input data to all ranks */ @@ -219,7 +214,6 @@ int main(int argc, char ** argv) *********************************************************************/ Block_order = order/Num_procs; - colstart = Block_order * my_ID; Colblock_size = order * Block_order; /********************************************************************* @@ -244,17 +238,14 @@ int main(int argc, char ** argv) bail_out(error); /* Fill the original column matrix */ - istart = 0; - for (j=0;j Date: Tue, 18 Jul 2023 10:23:51 +0300 Subject: [PATCH 305/325] rename to a2a --- C1z/Makefile | 2 +- C1z/{transpose-mpi.c => transpose-a2a-mpi.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename C1z/{transpose-mpi.c => transpose-a2a-mpi.c} (100%) diff --git a/C1z/Makefile b/C1z/Makefile index f4f705fe5..4e47dbc94 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -60,7 +60,7 @@ openmp: nstream-openmp \ tasks: p2p-tasks-openmp p2p-tasks-2d-openmp -mpi: nstream-mpi transpose-mpi +mpi: nstream-mpi transpose-a2a-mpi petsc: nstream-petsc transpose-petsc diff --git 
a/C1z/transpose-mpi.c b/C1z/transpose-a2a-mpi.c similarity index 100% rename from C1z/transpose-mpi.c rename to C1z/transpose-a2a-mpi.c From 75e1810b498a197ab731310108571f8fbc92fe34 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 18 Jul 2023 10:24:32 +0300 Subject: [PATCH 306/325] add p2p version --- C1z/Makefile | 2 +- C1z/transpose-p2p-mpi.c | 227 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 C1z/transpose-p2p-mpi.c diff --git a/C1z/Makefile b/C1z/Makefile index 4e47dbc94..e15fc4e05 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -60,7 +60,7 @@ openmp: nstream-openmp \ tasks: p2p-tasks-openmp p2p-tasks-2d-openmp -mpi: nstream-mpi transpose-a2a-mpi +mpi: nstream-mpi transpose-a2a-mpi transpose-p2p-mpi petsc: nstream-petsc transpose-petsc diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c new file mode 100644 index 000000000..86eed7fa6 --- /dev/null +++ b/C1z/transpose-p2p-mpi.c @@ -0,0 +1,227 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2023, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// C11-ification by Jeff Hammond, June 2017. 
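The pairing used in this p2p version, sending the block destined for rank (me + r) % np in phase r while receiving from rank (me - r + np) % np, gives every rank exactly one send and one matching receive per phase: the rank being sent to is, in that same phase, receiving from me, so the phases stay matched without extra synchronization. A compact mpi4py sketch of one full exchange, borrowing the Sendrecv call and block accumulation visible in transpose-numpy-mpi-p2p.py earlier in this series; sizes are illustrative:

    from mpi4py import MPI
    import numpy

    comm = MPI.COMM_WORLD
    me = comm.Get_rank()
    np = comm.Get_size()

    bo = 8                                  # block order (illustrative)
    order = bo * np                         # order chosen divisible by np
    A = numpy.fromfunction(lambda i, j: me*bo + i*order + j, (order, bo), dtype='d')
    B = numpy.zeros((order, bo))
    T = numpy.zeros((bo, bo))

    for phase in range(np):
        to  = (me + phase) % np
        frm = (me - phase + np) % np
        # send the block destined for `to` while receiving my block from `frm`
        comm.Sendrecv(sendbuf=A[bo*to:bo*(to+1), :], dest=to,    sendtag=phase,
                      recvbuf=T,                     source=frm, recvtag=phase)
        B[bo*frm:bo*(frm+1), :] += T.T

Run under mpiexec with any number of ranks; phase 0 is a Sendrecv to self, which MPI pairs internally, so the loop needs no special case for the local block.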
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include + +int main(int argc, char * argv[]) +{ + const int requested = MPI_THREAD_SERIALIZED; + int provided; + MPI_Init_thread(&argc, &argv, requested, &provided); + if (provided < requested) MPI_Abort(MPI_COMM_WORLD,provided); + + int me, np; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + MPI_Comm_size(MPI_COMM_WORLD, &np); + + if (me==0) { + printf("Parallel Research Kernels version %d\n", PRKVERSION ); + printf("C11/MPI Matrix transpose: B = A^T\n"); + } + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + if (me==0) printf("Usage: <# iterations> [tile size]\n"); + MPI_Finalize(); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + if (me==0) printf("ERROR: iterations must be >= 1\n"); + MPI_Finalize(); + return 1; + } + + // order of a the matrix + int order = atoi(argv[2]); + if (order <= 0) { + if (me==0) printf("ERROR: Matrix Order must be greater than 0\n"); + MPI_Finalize(); + return 1; + } + else if (order % np != 0) { + if (me==0) printf("ERROR: Matrix Order %d must be evenly divisible by np=%d\n", order, np); + MPI_Finalize(); + return 1; + } + + const int block_order = order / np; + if (block_order > floor(sqrt(INT_MAX))) { + if (me==0) printf("ERROR: block_order too large - overflow risk\n"); + MPI_Finalize(); + return 1; + } + const int bo2 = block_order * block_order; + + // default tile size for tiling of local transpose + int tile_size = (argc>3) ? atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + + if (me==0) { + printf("Number of processes = %d\n", np); + printf("Number of iterations = %d\n", iterations); + printf("Matrix order = %d\n", order); + printf("Tile size = %d\n", tile_size); + } + fflush(stdout); + MPI_Barrier(MPI_COMM_WORLD); + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + const size_t bytes = (size_t)order * (size_t)block_order * sizeof(double); + double (* const restrict A)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + if (A == NULL || B == NULL || T == NULL) { + printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); + MPI_Abort(MPI_COMM_WORLD,99); + } + + for (int i=0; i Date: Tue, 18 Jul 2023 10:35:59 +0300 Subject: [PATCH 307/325] step 1 in p2p --- C1z/transpose-p2p-mpi.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c index 86eed7fa6..1a1fac379 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -157,9 +157,15 @@ int main(int argc, char * argv[]) } // B += A^T - MPI_Alltoall(A, bo2, MPI_DOUBLE, - T, bo2, MPI_DOUBLE, - MPI_COMM_WORLD); + //MPI_Alltoall(A, bo2, MPI_DOUBLE, T, bo2, MPI_DOUBLE, MPI_COMM_WORLD); + for (int r=0; r Date: Tue, 18 Jul 2023 10:36:45 +0300 Subject: [PATCH 308/325] step 2 in p2p --- C1z/transpose-p2p-mpi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/C1z/transpose-p2p-mpi.c 
b/C1z/transpose-p2p-mpi.c index 1a1fac379..ee201a233 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -164,9 +164,7 @@ int main(int argc, char * argv[]) MPI_Sendrecv(&A[to*block_order][0],bo2,MPI_DOUBLE,to,r, &T[from*block_order][0],bo2,MPI_DOUBLE,from,r, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - } - for (int r=0; r Date: Tue, 18 Jul 2023 10:38:03 +0300 Subject: [PATCH 309/325] step 3 in p2p --- C1z/transpose-p2p-mpi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c index ee201a233..f5cc8683b 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -133,7 +133,7 @@ int main(int argc, char * argv[]) const size_t bytes = (size_t)order * (size_t)block_order * sizeof(double); double (* const restrict A)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); - double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); if (A == NULL || B == NULL || T == NULL) { printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); MPI_Abort(MPI_COMM_WORLD,99); @@ -162,7 +162,7 @@ int main(int argc, char * argv[]) const int to = (me + r) % np; const int from = (me - r + np) % np; MPI_Sendrecv(&A[to*block_order][0],bo2,MPI_DOUBLE,to,r, - &T[from*block_order][0],bo2,MPI_DOUBLE,from,r, + T,bo2,MPI_DOUBLE,from,r, MPI_COMM_WORLD, MPI_STATUS_IGNORE); const int lo = block_order * r; @@ -170,7 +170,7 @@ int main(int argc, char * argv[]) // B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) for (int i=0; i Date: Tue, 18 Jul 2023 10:56:28 +0300 Subject: [PATCH 310/325] this works, but is dumb --- C1z/transpose-a2a-mpi.c | 13 +++++++++++++ C1z/transpose-p2p-mpi.c | 24 ++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/C1z/transpose-a2a-mpi.c b/C1z/transpose-a2a-mpi.c index 86eed7fa6..1e6cca9a4 100644 --- a/C1z/transpose-a2a-mpi.c +++ b/C1z/transpose-a2a-mpi.c @@ -147,6 +147,15 @@ int main(int argc, char * argv[]) } MPI_Barrier(MPI_COMM_WORLD); +#if 0 + //int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype * newtype) + MPI_Datatype stride_dt; + MPI_Datatype trans_dt; + MPI_Type_vector(block_order, 1, block_order, MPI_DOUBLE, &stride_dt); + MPI_Type_vector(stride_dt, 1, 1, stride_dt, &trans_dt); + MPI_Type_commit(&trans_dt); +#endif + double t0=0.0, t1; for (int iter = 0; iter<=iterations; iter++) { @@ -182,6 +191,10 @@ int main(int argc, char * argv[]) const double trans_time = t1 - t0; //if (me==0) printf("trans_time=%lf\n", trans_time); +#if 0 + MPI_Type_free(&trans_dt); +#endif + ////////////////////////////////////////////////////////////////////// // Analyze and output results ////////////////////////////////////////////////////////////////////// diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c index f5cc8683b..ff65add81 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -133,7 +133,7 @@ int main(int argc, char * argv[]) const size_t bytes = (size_t)order * (size_t)block_order * sizeof(double); double (* const restrict A)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); - double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); + double (* const restrict 
T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); if (A == NULL || B == NULL || T == NULL) { printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); MPI_Abort(MPI_COMM_WORLD,99); @@ -147,6 +147,15 @@ int main(int argc, char * argv[]) } MPI_Barrier(MPI_COMM_WORLD); +#if 0 + //int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype * newtype) + MPI_Datatype stride_dt; + MPI_Datatype trans_dt; + MPI_Type_vector(block_order, 1, block_order, MPI_DOUBLE, &stride_dt); + MPI_Type_vector(stride_dt, 1, 1, stride_dt, &trans_dt); + MPI_Type_commit(&trans_dt); +#endif + double t0=0.0, t1; for (int iter = 0; iter<=iterations; iter++) { @@ -161,16 +170,19 @@ int main(int argc, char * argv[]) for (int r=0; r Date: Tue, 18 Jul 2023 10:57:10 +0300 Subject: [PATCH 311/325] this works, but is dumb --- C1z/transpose-p2p-mpi.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c index ff65add81..5d01ccdb3 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -172,17 +172,18 @@ int main(int argc, char * argv[]) const int from = (me - r + np) % np; printf("%d: r=%d to=%d, from=%d\n", me, r, to, from); MPI_Sendrecv(&A[to*block_order][0],bo2,MPI_DOUBLE,to,r, - //T,bo2,MPI_DOUBLE,from,r, - &T[from*block_order][0],bo2,MPI_DOUBLE,from,r, + T,bo2,MPI_DOUBLE,from,r, + //&T[from*block_order][0],bo2,MPI_DOUBLE,from,r, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - } - for (int r=0; r Date: Tue, 18 Jul 2023 10:58:04 +0300 Subject: [PATCH 312/325] this works - fixed the dumb --- C1z/transpose-p2p-mpi.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c index 5d01ccdb3..c9a8c53e8 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -133,7 +133,7 @@ int main(int argc, char * argv[]) const size_t bytes = (size_t)order * (size_t)block_order * sizeof(double); double (* const restrict A)[block_order] = (double (*)[block_order]) prk_malloc(bytes); double (* const restrict B)[block_order] = (double (*)[block_order]) prk_malloc(bytes); - double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes); + double (* const restrict T)[block_order] = (double (*)[block_order]) prk_malloc(bytes/np); if (A == NULL || B == NULL || T == NULL) { printf("Error allocating space; A=%p B=%p T=%p\n",A,B,T); MPI_Abort(MPI_COMM_WORLD,99); @@ -173,16 +173,12 @@ int main(int argc, char * argv[]) printf("%d: r=%d to=%d, from=%d\n", me, r, to, from); MPI_Sendrecv(&A[to*block_order][0],bo2,MPI_DOUBLE,to,r, T,bo2,MPI_DOUBLE,from,r, - //&T[from*block_order][0],bo2,MPI_DOUBLE,from,r, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - //} - //for (int r=0; r Date: Tue, 18 Jul 2023 11:17:20 +0300 Subject: [PATCH 313/325] added datatypes but they suck --- C1z/transpose-a2a-mpi.c | 24 +++++++++++++++++++----- C1z/transpose-p2p-mpi.c | 26 ++++++++++++++++++++------ 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/C1z/transpose-a2a-mpi.c b/C1z/transpose-a2a-mpi.c index 1e6cca9a4..52b5dcd4a 100644 --- a/C1z/transpose-a2a-mpi.c +++ b/C1z/transpose-a2a-mpi.c @@ -147,12 +147,17 @@ int main(int argc, char * argv[]) } MPI_Barrier(MPI_COMM_WORLD); -#if 0 - //int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype * newtype) +// Datatypes are slower +//#define USE_DATATYPES +#ifdef USE_DATATYPES MPI_Datatype stride_dt; - MPI_Datatype trans_dt; + //int MPI_Type_vector(int count, int 
blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype * newtype) MPI_Type_vector(block_order, 1, block_order, MPI_DOUBLE, &stride_dt); - MPI_Type_vector(stride_dt, 1, 1, stride_dt, &trans_dt); + int dsize; + MPI_Type_size(MPI_DOUBLE,&dsize); + MPI_Datatype trans_dt; + //int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, MPI_Datatype * newtype) + MPI_Type_hvector(block_order, 1, dsize, stride_dt, &trans_dt); MPI_Type_commit(&trans_dt); #endif @@ -167,7 +172,11 @@ int main(int argc, char * argv[]) // B += A^T MPI_Alltoall(A, bo2, MPI_DOUBLE, +#ifdef USE_DATATYPES + T, 1, trans_dt, +#else T, bo2, MPI_DOUBLE, +#endif MPI_COMM_WORLD); for (int r=0; r Date: Tue, 18 Jul 2023 11:30:14 +0300 Subject: [PATCH 314/325] added datatypes but they suck --- C1z/transpose-p2p-mpi.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/C1z/transpose-p2p-mpi.c b/C1z/transpose-p2p-mpi.c index 166b4858e..eedeae980 100644 --- a/C1z/transpose-p2p-mpi.c +++ b/C1z/transpose-p2p-mpi.c @@ -148,8 +148,10 @@ int main(int argc, char * argv[]) MPI_Barrier(MPI_COMM_WORLD); // Datatypes are slower -//#define USE_DATATYPES -#ifdef USE_DATATYPES +// define only 1 of these +//#define USE_SEND_DATATYPES +//#define USE_RECV_DATATYPES +#if defined(USE_SEND_DATATYPES) || defined(USE_RECV_DATATYPES) MPI_Datatype stride_dt; //int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, MPI_Datatype * newtype) MPI_Type_vector(block_order, 1, block_order, MPI_DOUBLE, &stride_dt); @@ -159,6 +161,9 @@ int main(int argc, char * argv[]) //int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, MPI_Datatype * newtype) MPI_Type_hvector(block_order, 1, dsize, stride_dt, &trans_dt); MPI_Type_commit(&trans_dt); +#endif +#if defined(USE_SEND_DATATYPES) && defined(USE_RECV_DATATYPES) +#error You can define USE_SEND_DATATYPES or USE_RECV_DATATYPES but not both! #endif double t0=0.0, t1; @@ -176,19 +181,25 @@ int main(int argc, char * argv[]) const int to = (me + r) % np; const int from = (me - r + np) % np; //printf("%d: r=%d to=%d, from=%d\n", me, r, to, from); - MPI_Sendrecv(&A[to*block_order][0],bo2,MPI_DOUBLE,to,r, -#ifdef USE_DATATYPES - T, 1, trans_dt,from,r, + MPI_Sendrecv(&A[to*block_order][0], +#ifdef USE_SEND_DATATYPES + 1,trans_dt, +#else + bo2,MPI_DOUBLE, +#endif + to,r,T, +#ifdef USE_RECV_DATATYPES + 1,trans_dt, #else - T,bo2,MPI_DOUBLE,from,r, + bo2,MPI_DOUBLE, #endif - MPI_COMM_WORLD, MPI_STATUS_IGNORE); + from,r,MPI_COMM_WORLD, MPI_STATUS_IGNORE); const int lo = block_order * from; // B(:,lo:hi) = B(:,lo:hi) + transpose(T(:,lo:hi)) for (int i=0; i Date: Tue, 18 Jul 2023 12:38:46 +0300 Subject: [PATCH 315/325] cosmetic changes --- FORTRAN/transpose-p2p-mpi.F90 | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/FORTRAN/transpose-p2p-mpi.F90 b/FORTRAN/transpose-p2p-mpi.F90 index 52a37a443..4d3f16874 100644 --- a/FORTRAN/transpose-p2p-mpi.F90 +++ b/FORTRAN/transpose-p2p-mpi.F90 @@ -63,7 +63,7 @@ program main integer :: err ! problem definition integer(kind=INT32) :: iterations - integer(kind=INT32) :: order, block_order + integer(kind=INT32) :: order, block_order, bo2 real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix real(kind=REAL64), allocatable :: T(:,:) ! 
temporary to hold tile @@ -102,6 +102,7 @@ program main call MPI_Bcast(order, 1, MPI_INTEGER4, 0, MPI_COMM_WORLD) block_order = int(order / np) + bo2 = block_order * block_order call MPI_Barrier(MPI_COMM_WORLD) @@ -137,15 +138,13 @@ program main lo = block_order * send_to + 1 hi = block_order * (send_to+1) - call MPI_Sendrecv(A(:,lo:hi), block_order*block_order, MPI_DOUBLE_PRECISION, & - send_to,q, & - T,block_order*block_order, MPI_DOUBLE_PRECISION, & - recv_from, q, MPI_COMM_WORLD, MPI_STATUS_IGNORE) + call MPI_Sendrecv(A(:,lo:hi), bo2, MPI_DOUBLE_PRECISION, send_to, q, & + T, bo2, MPI_DOUBLE_PRECISION, recv_from, q, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE) + lo = block_order * recv_from + 1 hi = block_order * (recv_from+1) B(:,lo:hi) = B(:,lo:hi) + transpose(T) - - end do ! A += 1 A = A + one From f9adef3bb66310e4ac4e45c8a473764bc7fe8811 Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Tue, 18 Jul 2023 21:26:05 +0300 Subject: [PATCH 316/325] perhaps not the fastest, but faster numba --- PYTHON/p2p-numba-mpi.py | 8 +++----- PYTHON/p2p-numba-shmem.py | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/PYTHON/p2p-numba-mpi.py b/PYTHON/p2p-numba-mpi.py index 24d8a85bc..3cfe44c5e 100755 --- a/PYTHON/p2p-numba-mpi.py +++ b/PYTHON/p2p-numba-mpi.py @@ -64,10 +64,9 @@ from numba import jit @jit(nopython=True) -def iterate_over_grid(grid, i1, i2, j1, j2): +def iterate_over_grid(grid, i1, i2, j): for i in range(i1,i2): - for j in range(j1,j2): - grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] + grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] @jit(nopython=True) def iterate_over_grid_grp(grid, g, i1, i2, j1, j2): @@ -160,9 +159,8 @@ def main(): if me > 0: comm.Recv(grid[start-1,j:j+1], source=me-1, tag=j) - iterate_over_grid(grid, start, end+1, 1, n) + iterate_over_grid(grid, start, end+1, j) - for j in range(1,n): # if I am not on the right boundary, send data to my right neighbor if me < np-1: comm.Send(grid[end,j:j+1], dest=me+1, tag=j) diff --git a/PYTHON/p2p-numba-shmem.py b/PYTHON/p2p-numba-shmem.py index 7058bf016..21a68fd9e 100755 --- a/PYTHON/p2p-numba-shmem.py +++ b/PYTHON/p2p-numba-shmem.py @@ -65,10 +65,9 @@ from numba import jit @jit(nopython=True) -def iterate_over_grid(grid, i1, i2, j1, j2): +def iterate_over_grid(grid, i1, i2, j): for i in range(i1,i2): - for j in range(j1,j2): - grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] + grid[i,j] = grid[i-1,j] + grid[i,j-1] - grid[i-1,j-1] def main(): me = shmem.my_pe() @@ -167,9 +166,8 @@ def main(): shmem.wait_until(flag_left[j:j+1], shmem.CMP.EQ, true) grid[start[me]-1,j] = dst[j] - iterate_over_grid(grid, start[me], end[me]+1, 1, n) + iterate_over_grid(grid, start[me], end[me]+1, j) - for j in range(1,n): if me != np-1: src[j] = grid[end[me],j] From 2ed3324db3d8e3df63a52932238adaaebc21367f Mon Sep 17 00:00:00 2001 From: Marcin Rogowski Date: Tue, 18 Jul 2023 22:46:47 +0200 Subject: [PATCH 317/325] add SHMEM C transpose with alltoall --- MPI1/Transpose/transpose-a2a.c | 3 +- SHMEM/Transpose/transpose_a2a.c | 353 ++++++++++++++++++++++++++++++++ 2 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 SHMEM/Transpose/transpose_a2a.c diff --git a/MPI1/Transpose/transpose-a2a.c b/MPI1/Transpose/transpose-a2a.c index 7b90a4cfc..905d1e729 100644 --- a/MPI1/Transpose/transpose-a2a.c +++ b/MPI1/Transpose/transpose-a2a.c @@ -1,5 +1,6 @@ /* Copyright (c) 2013, Intel Corporation +Copyright (c) 2023 Redistribution and use in source and binary forms, with or without modification, are permitted 
provided that the following conditions @@ -136,7 +137,7 @@ int main(int argc, char ** argv) int error; /* error flag */ double * RESTRICT A_p; /* original matrix column block */ double * RESTRICT B_p; /* transposed matrix column block */ - double * RESTRICT T_p; /* original matrix column block */ + double * RESTRICT T_p; double abserr, /* absolute error */ abserr_tot; /* aggregate absolute error */ double epsilon = 1.e-8; /* error tolerance */ diff --git a/SHMEM/Transpose/transpose_a2a.c b/SHMEM/Transpose/transpose_a2a.c new file mode 100644 index 000000000..3d4fc1be7 --- /dev/null +++ b/SHMEM/Transpose/transpose_a2a.c @@ -0,0 +1,353 @@ +/* +Copyright (c) 2013, Intel Corporation +Copyright (c) 2023 + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. +* Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +/******************************************************************* + +NAME: transpose + +PURPOSE: This program tests the efficiency with which a square matrix + can be transposed and stored in another matrix. The matrices + are distributed identically. + +USAGE: Program inputs are the matrix order, the number of times to + repeat the operation, and the communication mode + + transpose <# iterations> [tile size] + + An optional parameter specifies the tile size used to divide the + individual matrix blocks for improved cache and TLB performance. + + The output consists of diagnostics to make sure the + transpose worked and timing statistics. + +FUNCTIONS CALLED: + + Other than SHMEM or standard C functions, the following + functions are used in this program: + + wtime() Portable wall-timer interface. + bail_out() Determine global error and exit if nonzero. + +HISTORY: Written by Tom St. John, July 2015. + Rob vdW: Fixed race condition on synchronization flags, August 2015 + Marcin Rogowski: shmem_alltoall, July 2023 + +*******************************************************************/ + +/****************************************************************** + Layout nomenclature + ------------------- + +o Each rank owns one block of columns (Colblock) of the overall + matrix to be transposed, as well as of the transposed matrix. 
+o Colblock is stored contiguously in the memory of the rank. + The stored format is column major, which means that matrix + elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1) + are "order" words apart +o Colblock is logically composed of #ranks Blocks, but a Block is + not stored contiguously in memory. Conceptually, the Block is + the unit of data that gets communicated between ranks. Block i of + rank j is locally transposed and gathered into a buffer called Work, + which is sent to rank i, where it is scattered into Block j of the + transposed matrix. +o The original and transposed matrices are called A and B + + ----------------------------------------------------------------- +| | | | | +| Colblock | | | | +| | | | | +| | | | | +| | | | | +| ------------------------------- | +| | | | | +| | Block | | | +| | | | | +| | | | | +| | | | | +| ------------------------------- | +| | | | | +| | | | Overall Matrix | +| | | | | +| | | | | +| | | | | +| ------------------------------- | +| | | | | +| | | | | +| | | | | +| | | | | +| | | | | + -----------------------------------------------------------------*/ + +#include +#include + +#include + +int main(int argc, char ** argv) +{ + long Block_order; /* number of columns owned by rank */ + int Block_size; /* size of a single block */ + int Colblock_size; /* size of column block */ + int Num_procs; /* number of ranks */ + int order; /* order of overall matrix */ + int send_to, recv_from; /* ranks with which to communicate */ + long bytes; /* combined size of matrices */ + int my_ID; /* rank */ + int root=0; /* rank of root */ + int iterations; /* number of times to do the transpose */ + long i, j, it, jt, istart;/* dummies */ + int iter; /* index of iteration */ + int phase; /* phase inside staged communication */ + int colstart; /* starting column for owning rank */ + int error; /* error flag */ + double * RESTRICT A_p; /* original matrix column block */ + double * RESTRICT B_p; /* transposed matrix column block */ + double * RESTRICT T_p; + double epsilon = 1.e-8; /* error tolerance */ + double avgtime; /* timing parameters */ + long *pSync_bcast; /* work space for collectives */ + long *pSync_reduce; /* work space for collectives */ + double *pWrk; /* work space for SHMEM collectives */ + double *local_trans_time, + *trans_time; /* timing parameters */ + double *abserr, + *abserr_tot; /* local and aggregate error */ +#if !BARRIER_SYNCH + int *recv_flag; /* synchronization flags: data received */ + int *send_flag; /* synchronization flags: receiver ready */ +#endif + int *arguments; /* command line arguments */ + +/********************************************************************* +** Initialize the SHMEM environment +*********************************************************************/ + + prk_shmem_init(); + my_ID=prk_shmem_my_pe(); + Num_procs=prk_shmem_n_pes(); + + if (my_ID == root) { + printf("Parallel Research Kernels version %s\n", PRKVERSION); + printf("SHMEM matrix transpose: B = A^T\n"); + } + +// initialize sync variables for error checks + pSync_bcast = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); + pSync_reduce = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); + pWrk = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE); + local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); + trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); + 
arguments = (int *) prk_shmem_align(prk_get_alignment(),3*sizeof(int)); + abserr = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double)); + if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time || + !trans_time || !arguments || !abserr) { + printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID); + error = 1; + goto ENDOFTESTS; + } + + for(i=0;i \n", + *argv); + error = 1; goto ENDOFTESTS; + } + + iterations = atoi(*++argv); + arguments[0]=iterations; + if(iterations < 1){ + printf("ERROR: iterations must be >= 1 : %d \n",iterations); + error = 1; goto ENDOFTESTS; + } + + order = atoi(*++argv); + arguments[1]=order; + if (order < Num_procs) { + printf("ERROR: matrix order %d should at least # procs %d\n", + order, Num_procs); + error = 1; goto ENDOFTESTS; + } + if (order%Num_procs) { + printf("ERROR: matrix order %d should be divisible by # procs %d\n", + order, Num_procs); + error = 1; goto ENDOFTESTS; + } + + ENDOFTESTS:; + } + bail_out(error); + + if (my_ID == root) { + printf("Number of ranks = %d\n", Num_procs); + printf("Matrix order = %d\n", order); + printf("Number of iterations = %d\n", iterations); + } + + /* Broadcast input data to all ranks */ + shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast); + shmem_barrier_all(); + + iterations=arguments[0]; + order=arguments[1]; + + shmem_barrier_all(); + prk_shmem_free(arguments); + + bytes = 2 * sizeof(double) * order * order; + +/********************************************************************* +** The matrix is broken up into column blocks that are mapped one to a +** rank. Each column block is made up of Num_procs smaller square +** blocks of order block_order. +*********************************************************************/ + + Block_order = order/Num_procs; + colstart = Block_order * my_ID; + Colblock_size = order * Block_order; + Block_size = Block_order * Block_order; + +/********************************************************************* +** Create the column block of the test matrix, the row block of the +** transposed matrix, and workspace (workspace only if #procs>1) +*********************************************************************/ + A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); + T_p = (double *)prk_malloc(Colblock_size*sizeof(double)); + if (A_p == NULL){ + printf(" Error allocating space for original matrix on node %d\n",my_ID); + error = 1; + } + bail_out(error); + + B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); + if (B_p == NULL){ + printf(" Error allocating space for transpose matrix on node %d\n",my_ID); + error = 1; + } + bail_out(error); + + /* Fill the original column matrices */ + for (i=0;i Date: Tue, 15 Aug 2023 08:04:49 -0400 Subject: [PATCH 318/325] update for MI 200 series etc --- common/make.defs.hip | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/make.defs.hip b/common/make.defs.hip index 46b9977b5..974a70e87 100644 --- a/common/make.defs.hip +++ b/common/make.defs.hip @@ -1,13 +1,13 @@ # # This file shows the HIP toolchain options for PRKs. # -ROCM_PATH=/opt/rocm +ROCM_PATH=/opt/rocm-5.6.0 # # Base compilers and language options # VERSION= # C99 is required in some implementations. -CC=${ROCM_PATH}/llvm/bin/clang -std=gnu11 -pthread +CC=${ROCM_PATH}/llvm/bin/clang -std=gnu11 -pthread -lm #EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. 
FC=${ROCM_PATH}/llvm/bin/flang -DAOMP #-std=f2008 -cpp @@ -24,7 +24,7 @@ DEFAULT_OPT_FLAGS=-g -O3 -ffast-math -mtune=native OPENMPFLAG=-fopenmp OPENMPSIMDFLAG=-fopenmp-simd #GCC#OFFLOADFLAG=-foffload=amdgcn-amdhsa="-march=fiji" -OFFLOADFLAG=-fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 +OFFLOADFLAG=-fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a OFFLOADFLAG+=-DGPU_SCHEDULE="schedule(static,1)" # makes ~10x diff with AMD Flang 12 on MI-100 OPENACCFLAG=-fopenacc $(OFFLOADFLAG) # @@ -44,7 +44,7 @@ SYCLFLAG=-std=c++17 -O3 SYCLFLAG+=-DHIPSYCL # CPU platform SYCLFLAG+=--hipsycl-platform=rocm -SYCLFLAG+=--hipsycl-gpu-arch=gfx908 +SYCLFLAG+=--hipsycl-gpu-arch=gfx90a SYCLFLAG+=-Wl,-rpath=/opt/rocm/llvm/lib # #CELERITYDIR=${SYCLDIR} From e242b86cb4f18942471a999d1f0317d05517c0b5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Aug 2023 08:05:12 -0400 Subject: [PATCH 319/325] xgemm-hipblas and related --- Cxx11/Makefile | 2 +- Cxx11/prk_hip.h | 2 +- Cxx11/xgemm-hipblas.cc | 268 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 Cxx11/xgemm-hipblas.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ebf78591c..14a90b345 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -126,7 +126,7 @@ hip: nstream-hip transpose-hip stencil-hip nstream-managed-hip hipstl: nstream-hipstl -hipblas: nstream-hipblas sgemm-hipblas dgemm-hipblas transpose-hipblas +hipblas: nstream-hipblas sgemm-hipblas dgemm-hipblas xgemm-hipblas transpose-hipblas thrust: nstream-host-thrust nstream-device-thrust \ transpose-host-thrust transpose-device-thrust diff --git a/Cxx11/prk_hip.h b/Cxx11/prk_hip.h index f2d78005b..713eee92c 100644 --- a/Cxx11/prk_hip.h +++ b/Cxx11/prk_hip.h @@ -10,7 +10,7 @@ #include #include -#include +#include #ifdef HIP_THRUST #include diff --git a/Cxx11/xgemm-hipblas.cc b/Cxx11/xgemm-hipblas.cc new file mode 100644 index 000000000..7d2efa52d --- /dev/null +++ b/Cxx11/xgemm-hipblas.cc @@ -0,0 +1,268 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2023, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: gemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_hip.h" + +prk::HIP::info info; + +template +__global__ void init(int order, T * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i +__global__ void init(int order, T * A, T * B, T * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i +void prk_gemm(const hipblasHandle_t & h, + const int order, const TC alpha, const TC beta, + const TAB * A, const TAB * B, TC * C) +{ + std::cerr << "No valid template match for type T" << std::endl; + std::abort(); +} + +template <> +void prk_gemm(const hipblasHandle_t & h, + const int order, const __half alpha, const __half beta, + const __half * A, const __half * B, __half * C) +{ + prk::HIP::check( hipblasHgemm(h, + HIPBLAS_OP_N, HIPBLAS_OP_N, + order, order, order, + &alpha, + A, order, + B, order, + &beta, + C, order) ); +} + +template <> +void prk_gemm(const hipblasHandle_t & h, + const int order, const float alpha, const float beta, + const float * A, const float * B, float * C) +{ + prk::HIP::check( hipblasSgemm(h, + HIPBLAS_OP_N, HIPBLAS_OP_N, + order, order, order, + &alpha, + A, order, + B, order, + &beta, + C, order) ); +} + +template <> +void prk_gemm(const hipblasHandle_t & h, + const int order, const double alpha, const double beta, + const double * A, const double * B, double * C) +{ + prk::HIP::check( hipblasDgemm(h, + HIPBLAS_OP_N, HIPBLAS_OP_N, + order, order, order, + &alpha, + A, order, + B, order, + &beta, + C, order) ); +} + +template +void run(const hipblasHandle_t & h, int iterations, int order) +{ + double gemm_time{0}; + + const size_t nelems = (size_t)order * (size_t)order; + auto h_c = prk::HIP::malloc_host( nelems); + + const int tile_size = 32; + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + info.checkDims(dimBlock, dimGrid); + + auto d_a = prk::HIP::malloc_device(nelems); + auto d_b = prk::HIP::malloc_device(nelems); + auto d_c = prk::HIP::malloc_device(nelems); + 
init<<>>(order, d_a, d_b, d_c); + prk::HIP::sync(); + { + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) gemm_time = prk::wtime(); + + const T alpha{1}; + const T beta{1}; + + prk_gemm(h, order, alpha, beta, d_a, d_b, d_c); + prk::HIP::sync(); + } + gemm_time = prk::wtime() - gemm_time; + } + // copy output back to host + prk::HIP::copyD2H(h_c, d_c, nelems); + + prk::HIP::free(d_a); + prk::HIP::free(d_b); + prk::HIP::free(d_c); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double forder = static_cast(order); + const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1); + double checksum{0}; + for (int i=0; i "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > prk::get_max_matrix_size()) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + //info.print(); + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup CUBLAS environment + ////////////////////////////////////////////////////////////////////// + + hipblasHandle_t h; + prk::HIP::check( hipblasCreate(&h) ); + run<__half>(h, iterations, order); + run(h, iterations, order); + run(h, iterations, order); + prk::HIP::check( hipblasDestroy(h) ); + + return 0; +} From cf6412ec382a41ff1c2ed3baac64795cbd9a217c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Aug 2023 08:08:59 -0400 Subject: [PATCH 320/325] half precision header --- Cxx11/prk_hip.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cxx11/prk_hip.h b/Cxx11/prk_hip.h index 713eee92c..3dad4da58 100644 --- a/Cxx11/prk_hip.h +++ b/Cxx11/prk_hip.h @@ -10,6 +10,10 @@ #include #include + +// half-precision for HIPBLAS +#include + #include #ifdef HIP_THRUST From 929f43a4c4f4cc6f3a78094185dcd4c317e5c4af Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 15 Aug 2023 08:33:34 -0400 Subject: [PATCH 321/325] bfloat16 is awful to use --- Cxx11/xgemm-hipblas.cc | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/Cxx11/xgemm-hipblas.cc b/Cxx11/xgemm-hipblas.cc index 7d2efa52d..14121f543 100644 --- a/Cxx11/xgemm-hipblas.cc +++ b/Cxx11/xgemm-hipblas.cc @@ -62,6 +62,7 @@ prk::HIP::info info; +#if 0 template __global__ void init(int order, T * C) { @@ -73,6 +74,20 @@ __global__ void init(int order, T * C) } } +template <> +__global__ void init(int order, hipblasBfloat16 * A, hipblasBfloat16 * B, hipblasBfloat16 * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i __global__ void init(int order, T * A, T * B, T * C) { @@ -97,8 +112,8 @@ void prk_gemm(const hipblasHandle_t & h, template <> void prk_gemm(const hipblasHandle_t & h, - const int order, const __half alpha, const __half beta, - const __half * A, const __half * B, __half * C) + const int order, const hipblasHalf alpha, const hipblasHalf beta, + const hipblasHalf * A, const hipblasHalf * B, hipblasHalf * C) { prk::HIP::check( hipblasHgemm(h, HIPBLAS_OP_N, HIPBLAS_OP_N, @@ -200,7 +215,7 @@ void 
run(const hipblasHandle_t & h, int iterations, int order) auto nflops = 2.0 * prk::pow(forder,3); auto is_fp64 = (typeid(T) == typeid(double)); auto is_fp32 = (typeid(T) == typeid(float)); - auto is_fp16 = (typeid(T) == typeid(__half)); + auto is_fp16 = (typeid(T) == typeid(hipblasHalf)); auto pname = (is_fp64 ? "FP64" : (is_fp32 ? "FP32" : (is_fp16 ? "FP16" : "Unknown FP type"))); @@ -218,7 +233,7 @@ void run(const hipblasHandle_t & h, int iterations, int order) int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/CUBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; + std::cout << "C++11/HIPBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -259,7 +274,8 @@ int main(int argc, char * argv[]) hipblasHandle_t h; prk::HIP::check( hipblasCreate(&h) ); - run<__half>(h, iterations, order); + run(h, iterations, order); + //run(h, iterations, order); run(h, iterations, order); run(h, iterations, order); prk::HIP::check( hipblasDestroy(h) ); From f2c1e1e82bd21973884665b0102c90a6758b45c9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 16 Aug 2023 04:57:55 -0700 Subject: [PATCH 322/325] add DPC++ recipe for A100 --- common/make.defs.nvhpc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index fa4b59e8b..f040194b4 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -43,6 +43,16 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +# +SYCLDIR=${HOME}/DPCPP/build +SYCLCXX=${SYCLDIR}/bin/clang++ +SYCLFLAG=-std=c++20 -O3 +SYCLFLAG+=-DDPCPP # important +SYCLFLAG+=-fsycl #-fsycl-unnamed-lambda +SYCLFLAG+=-fsycl-targets=nvptx64-nvidia-cuda +SYCLFLAG+=-L${SYCLDIR}/lib -lsycl -Wl,-rpath=${SYCLDIR}/lib +# # # Parallel STL, Boost, etc. # From f6607c0ab763db99cd06c1d008d3fbdad0bb41da Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 1 Sep 2023 14:26:17 +0300 Subject: [PATCH 323/325] update --- common/make.defs.cuda | 10 +++++----- common/make.defs.nvhpc | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index ec460964c..6c573d51e 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -38,7 +38,7 @@ OPENACCFLAG=-fopenacc # # OpenCL flags # -OPENCLDIR=/usr/local/cuda-11.2/targets/x86_64-linux +OPENCLDIR=/usr/local/cuda-12.1/targets/x86_64-linux OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -lOpenCL #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces @@ -126,8 +126,8 @@ THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # # CBLAS for C++ DGEMM # -#BLASFLAG= -#CBLASFLAG= +BLASFLAG=-L/usr/lib/x86_64-linux-gnu/blis-openmp -lblis +CBLASFLAG=${BLASFLAG} # # CUDA flags # @@ -136,9 +136,9 @@ THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # Linux w/ NVIDIA CUDA # NVCC never supports the latest GCC. # Use appropriate arch or code is compiled to ancient features. 
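# [Editor's note] The hunk below moves the toolkit path to CUDA 12.1 and the
# target architecture from sm_70 (Volta) to sm_89 (Ada-generation GPUs such as
# the RTX 40-series and L4/L40); adjust --gpu-architecture to match the local device.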
-NVCC=/usr/local/cuda-11.2/bin/nvcc +NVCC=/usr/local/cuda-12.1/bin/nvcc CUDAFLAGS=-g -O3 -std=c++11 -CUDAFLAGS+=--gpu-architecture=sm_70 +CUDAFLAGS+=--gpu-architecture=sm_89 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' diff --git a/common/make.defs.nvhpc b/common/make.defs.nvhpc index f040194b4..cb1899895 100644 --- a/common/make.defs.nvhpc +++ b/common/make.defs.nvhpc @@ -84,12 +84,12 @@ CBLASFLAG=${BLASFLAG} NVCC=${NVHPC_CBIN}nvcc CUDAFLAGS=-g -O3 -std=c++17 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_80 +CUDAFLAGS+=--gpu-architecture=sm_89 #CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' -CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/11.5/targets/$$(uname -m)-linux/include -CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/11.5/targets/$$(uname -m)-linux/lib +CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.1/targets/$$(uname -m)-linux/include +CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.1/targets/$$(uname -m)-linux/lib # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 # heavy hammer: CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED From c7d6861a1e1677352c7dc75b8d73232f798d2198 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 20 Oct 2023 17:19:43 +0300 Subject: [PATCH 324/325] more elegant printing of rate --- Cxx11/prk_util.h | 17 +++++++++++++++++ Cxx11/xgemm-cublas.cu | 4 +--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 93a037f78..08bfe2f91 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -408,6 +408,23 @@ namespace prk { return __builtin_pow(x,n); } + template + void print_flop_rate_time(T name, double rate, double time) + { + const auto d = std::log10(rate); + const int shifts[6] = { 15, 12, 9, 6, 3, 0 }; + const char prefix[6] = { 'P', 'T', 'G', 'M', 'K', ' ' }; + for ( int r=0; r<6; r++ ) { + const auto shift = shifts[r]; + if (d > shift) { + std::cout << name + << " Rate (" << prefix[r] << "F/s): " << std::pow(1.0,-shift) * rate + << " Avg time (s): " << time << std::endl; + break; + } + } + } + } // namespace prk #endif /* PRK_UTIL_H */ diff --git a/Cxx11/xgemm-cublas.cu b/Cxx11/xgemm-cublas.cu index eff45c634..cfba85af1 100644 --- a/Cxx11/xgemm-cublas.cu +++ b/Cxx11/xgemm-cublas.cu @@ -204,9 +204,7 @@ void run(const cublasHandle_t & h, int iterations, int order) auto pname = (is_fp64 ? "FP64" : (is_fp32 ? "FP32" : (is_fp16 ? "FP16" : "Unknown FP type"))); - std::cout << pname - << " Rate (MF/s): " << 1.0e-6 * nflops/avgtime - << " Avg time (s): " << avgtime << std::endl; + prk::print_flop_rate_time(pname, nflops/avgtime, avgtime); } else { std::cout << "Reference checksum = " << reference << "\n" << "Residuum = " << residuum << std::endl; From 3ba090863d663fd5e3fda7697bbae3cc24236dc9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 13 Nov 2023 08:45:03 -0800 Subject: [PATCH 325/325] fix pow --- Cxx11/prk_util.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 08bfe2f91..aec795d92 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -413,12 +413,13 @@ namespace prk { { const auto d = std::log10(rate); const int shifts[6] = { 15, 12, 9, 6, 3, 0 }; + const double scales[6] = { 1.e-15, 1.e-12, 1.e-9, 1.e-6, 1.e-3, 1. 
}; const char prefix[6] = { 'P', 'T', 'G', 'M', 'K', ' ' }; for ( int r=0; r<6; r++ ) { const auto shift = shifts[r]; if (d > shift) { std::cout << name - << " Rate (" << prefix[r] << "F/s): " << std::pow(1.0,-shift) * rate + << " Rate (" << prefix[r] << "F/s): " << scales[r] * rate << " Avg time (s): " << time << std::endl; break; }
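        // [Editor's note] Worked example of the scaling above, assuming a measured
        // rate of 3.2e12 flop/s: log10(rate) ~ 12.5 exceeds the shift 12, so the
        // 'T' prefix is selected and scales[1]*rate prints as "Rate (TF/s): 3.2".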