diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h
index 8a9b147..652805f 100644
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -36,6 +36,8 @@ Author: paboyle
 #define strong_inline     __attribute__((always_inline)) inline
 #define UNROLL  _Pragma("unroll")
 
+// Compile-time partial-unroll factor used by accelerator_for / accelerator_forNB (OpenMP target path).
+#define OMP_UROLL_FACT 4
 //////////////////////////////////////////////////////////////////////////////////
 // New primitives; explicit host thread calls, and accelerator data parallel calls
 //////////////////////////////////////////////////////////////////////////////////
@@ -132,13 +134,15 @@ extern uint32_t gpu_threads;
 #define accelerator_for(iterator,num,nsimd, ... ) \
   { \
     uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \
     _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
+    _Pragma("omp unroll partial(OMP_UROLL_FACT)") \
     naked_for(iterator, num, { __VA_ARGS__ }); \
   }
 #define accelerator_forNB(iterator,num,nsimd, ... ) \
   { \
     uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \
     _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
+    _Pragma("omp unroll partial(OMP_UROLL_FACT)") \
     naked_for(iterator, num, { __VA_ARGS__ }); \
   }
 
diff --git a/Makefile b/Makefile
index 6d7ee28..e77bb49 100644
--- a/Makefile
+++ b/Makefile
@@ -11,9 +11,10 @@ MAIN=Benchmark_su3
 #CXX=pgc++
 #CXXFLAGS=-fast --c++14 -acc -Mnollvm -Minfo=accel -ta=tesla:cc70,managed -Mlarge_arrays --no_exceptions
 
+## Add flags for forcing no compiler-automated loop unroll
 ##Clang
 CXX=clang++
-CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart
+CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll
 CXXFLAGS += -DOMPTARGET
 CXXFLAGS +=-DOMPTARGET_MANAGED
 #CXXFLAGS += -DVECTOR_LOOPS
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 3eaaab2..f932f0f 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -31,6 +31,15 @@ Author: Peter Boyle
 using namespace std;
 using namespace Grid;
 
+
+#define TILE_SZ 4
+
+#define UNROLL_FACTOR 2
+
+#define TILE
+#define UNROLL
+//#define OMP_TILE
+//#define OMP_UNROLL
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -132,8 +141,33 @@ int main (int argc, char ** argv)
       for(int64_t i=0;i