diff --git a/.gitignore b/.gitignore index bf1f8a9..097b117 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -* !.gitignore diff --git a/CREDITS b/CREDITS index ddbc72c..77cf852 100644 --- a/CREDITS +++ b/CREDITS @@ -1,6 +1,6 @@ This software is an outcome of the PhD thesis of Matthias Rottmann, University of Wuppertal. -Code Designers: Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. +Code Designers: Matthias Rottmann, Simone Bacchio, Artur Strebel, Simon Heybrock, Bjoern Leder. Contributions by: Andreas Frommer, Karsten Kahl, Stefan Krieg, Kalman Szabo, Wolfgang Soeldner, Holger Arndt, Peter Georg. diff --git a/Makefile b/Makefile index 6abdff9..ae45ecc 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ CC = mpiicc # --- CFLAGS ----------------------------------------- -CFLAGS_gnu = -std=gnu99 -Wall -pedantic -fopenmp -O3 -ffast-math -msse4.2 -CFLAGS_intel = -std=gnu99 -Wall -pedantic -qopenmp -O3 -xHOST +CFLAGS_gnu = -std=gnu99 -Wall -pedantic -O3 -ffast-math -msse4.2 -fopenmp +CFLAGS_intel = -std=gnu99 -Wall -pedantic -O3 -xHOST -qopenmp CFLAGS = $(CFLAGS_intel) # --- DO NOT CHANGE ----------------------------------- @@ -45,8 +45,8 @@ LIMELIB= -L$(LIMEDIR)/lib -llime # -DPARAMOUTPUT -DTRACK_RES -DFGMRES_RESTEST -DPROFILING # -DSINGLE_ALLREDUCE_ARNOLDI # -DCOARSE_RES -DSCHWARZ_RES -DTESTVECTOR_ANALYSIS -DDEBUG -OPT_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) $(H5FLAGS) -DOPENMP -DSSE -DPARAMOUTPUT -DTRACK_RES -DEVEL_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) -DOPENMP -DSSE -DDEBUG -DPARAMOUTPUT -DTRACK_RES -DFGMRES_RESTEST -DPROFILING -DCOARSE_RES -DSCHWARZ_RES -DTESTVECTOR_ANALYSIS +OPT_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) $(H5FLAGS) -DPARAMOUTPUT -DTRACK_RES -DSSE -DOPENMP +DEVEL_VERSION_FLAGS = $(CFLAGS) $(LIMEFLAGS) -DDEBUG -DPARAMOUTPUT -DTRACK_RES -DFGMRES_RESTEST -DPROFILING -DCOARSE_RES -DSCHWARZ_RES -DTESTVECTOR_ANALYSIS -DSSE -DOPENMP all: execs library exec-tests @@ -92,10 +92,10 @@ $(INCDIR)/%: $(SRCDIR)/% cp $(SRCDIR)/`basename $@` $@ $(BUILDDIR)/%.o: $(GSRCDIR)/%.c $(SRCDIR)/*.h - $(CC) $(CFLAGS) $(OPT_VERSION_FLAGS) -c $< -o $@ + $(CC) $(OPT_VERSION_FLAGS) -c $< -o $@ $(BUILDDIR)/%_devel.o: $(GSRCDIR)/%.c $(SRCDIR)/*.h - $(CC) -g $(CFLAGS) $(DEVEL_VERSION_FLAGS) -c $< -o $@ + $(CC) -g $(DEVEL_VERSION_FLAGS) -c $< -o $@ $(GSRCDIR)/%.h: $(SRCDIR)/%.h $(firstword $(MAKEFILE_LIST)) cp $< $@ diff --git a/NEWS b/NEWS index 5efb65f..6fe3f55 100644 --- a/NEWS +++ b/NEWS @@ -1,9 +1,25 @@ -Version v1606-sbacchio/master: - -Twisted mass fermions operator for Nf=2 - -Support of different shift on even and odd sites + + +Version v1610 - TM Nf=2+1+1: + + - Twisted mass fermions operator for Nf=1+1 + - Support of different shifts on even and odd sites -Personalized version of the library + - The following parameters have been added to the input file: + -- //TODO + + +Version v1606 - TM Nf=2: + + - Twisted mass fermions operator for Nf=2 + - A different TM shift can be applied on even and odd sites + - Personalized version of the library + - The following parameters have been added to the input file: + -- //TODO + Version v1606 - first release: + This is the first release of the DDalphaAMG solver library. For an overview of its features included, please consult the user documentation in /doc.
\ No newline at end of file diff --git a/README b/README index 94e2604..a4d9e8e 100644 --- a/README +++ b/README @@ -22,6 +22,6 @@ INSTALL: HOWTO: - After having compiled the the user documentation via + After having compiled the user documentation via "make documentation" please consult the compiled PDF in /doc for further information. diff --git a/sample.ini b/sample.ini index a8c64e6..3729bed 100644 --- a/sample.ini +++ b/sample.ini @@ -33,48 +33,52 @@ right hand side: 2 | 2 - twisted boundary cond. | | (M_PI,M_PI,M_PI,M_PI)*t.b.c. | |----------------------------------------------| -boundary conditions: 2 +boundary conditions: 1 twisted boundary conditions: 1 0 0 0 number of levels: 3 -number of openmp threads: 2 +number of openmp threads: 1 |--- depth 0 ----------------------------------| d0 global lattice: 8 8 8 8 // global lattice size -d0 local lattice: 4 8 8 8 // lattice size on each process - +d0 local lattice: 8 8 4 4 // lattice size on each process // nproc = prod(global lattice)/prod(local lattice) -d0 block lattice: 2 2 2 2 // Schwarz blocks +d0 block lattice: 4 4 4 4 // Schwarz blocks + d0 block lattice: 2 2 2 2 // Schwarz blocks d0 post smooth iter: 2 // number of Schwarz cycles per MG iteration d0 block iter: 4 d0 test vectors: 24 // number of test vectors used to construct the interpolation -d0 setup iter: 4 // number of bootstrap setup iteration (excluding the initial step) +d0 setup iter: 3 // number of bootstrap setup iteration (excluding the initial step) -d2 mu factor: 4.0 -d3 mu factor: 8.0 +d1 mu factor: 5.0 + d2 mu factor: 5.0 #wilson_param // parameters for the inverter -#the following OR kappa: ... -m0: -0.5 -csw: 1.0 -#the following OR 2KappaMu: ... -mu: 0.5 -mu odd shift: 0.0 -mu even shift: 0.0 +m0: -0.4 +csw: 1.6 +mu: 0.005 +setup mu: 0.001 + mu odd shift: 0.0 + mu even shift: 0.0 +epsbar: 0.11 + epsbar odd shift: 0.0 + epsbar even shift: 0.0 addDownPropagator: 1 tolerance for relative residual: 1E-10 -iterations between restarts: 50 // should be increased for ill-conditioned cnfgs -maximum of restarts: 20 // should be increased for ill-conditioned cnfgs -coarse grid tolerance: 5E-2 -coarse grid iterations: 100 // should be increased for ill-conditioned cnfgs -coarse grid restarts: 5 // should be increased for ill-conditioned cnfgs +iterations between restarts: 20 +maximum of restarts: 50 +coarse grid tolerance: 1E-2 +coarse grid iterations: 25 +coarse grid restarts: 20 #general_param print mode: 1 method: 2 -mixed precision: 1 +interpolation: 2 +mixed precision: 2 randomize test vectors: 0 // initialize random number generator with time(0) ? 0=no/1=yes +odd even preconditioning: 1 // for further information, please read the user documentation in doc/ // developers version of an input file in sample_devel.ini diff --git a/src/DDalphaAMG.h b/src/DDalphaAMG.h index 5f50b32..a9556c3 100644 --- a/src/DDalphaAMG.h +++ b/src/DDalphaAMG.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * Copyright (C) 2016, Simone Bacchio. * * This file is part of the DDalphaAMG solver library. 
* @@ -145,8 +145,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Solve squared operator performing two inversions: @@ -154,8 +163,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve_squared( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve_squared( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet_squared( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet_squared( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Solve squared operator against the odd compoments performing two inversions: @@ -163,8 +181,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve_squared_odd( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve_squared_odd( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet_squared_odd( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet_squared_odd( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Solve squared operator against the even compoments performing two inversions: @@ -172,8 +199,17 @@ ** mg_status.success = 0: not converged, 1: converged ** mg_status.info = final residual **/ - void DDalphaAMG_solve_squared_even( double *vector_out, double *vector_in, double tol, - DDalphaAMG_status *mg_status ); + void DDalphaAMG_solve_squared_even( double *vector_out, double *vector_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_doublet_squared_even( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ); + + void DDalphaAMG_solve_ms_doublet_squared_even( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ); /** ** Optional - Apply the operator: @@ -181,7 +217,10 @@ ** mg_status.success = 1 **/ void DDalphaAMG_apply_operator( double *vector_out, double *vector_in, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); + + void 
DDalphaAMG_apply_operator_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ); /** ** Optional - Apply a preconditioner step: @@ -190,7 +229,9 @@ ** mg_status.info = residual after preconditioning **/ void DDalphaAMG_preconditioner( double *vector_out, double *vector_in, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); + void DDalphaAMG_preconditioner_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ); /* * Concluding the following functions have to be call for freeing the memory and finalizing @@ -228,11 +269,11 @@ ** -> mg_params.conf_index_fct = NULL, mg_params.vector_index_fct = NULL; **/ void DDalphaAMG_read_configuration( double *gauge_field, char *filename, int format, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); void DDalphaAMG_read_vector( double *vector_in, char *filename, int format, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); void DDalphaAMG_write_vector( double *vector_out, char *filename, int format, - DDalphaAMG_status *mg_status ); + DDalphaAMG_status *mg_status ); /** ** Extra - Define vector with constant or random components @@ -414,23 +455,39 @@ ** Hopping parameter **/ double kappa; - + /** ** Twisted mass parameter and shifts on even/odd sites **/ double mu; double mu_odd_shift; double mu_even_shift; - + /** - ** Twisted mass factor for the preconditioner on each level. + ** Twisted mass factor for the preconditioner on each level, l. ** Default 6 on the coarsest level ** - ** -> mu_o[l] = (mu + mu_odd_shift) * mu_factor - ** -> mu_e[l] = (mu + mu_even_shift) * mu_factor + ** -> mu_o[l] = (mu + mu_odd_shift) * mu_factor[l] + ** -> mu_e[l] = (mu + mu_even_shift) * mu_factor[l] **/ double mu_factor[MAX_MG_LEVELS]; + /** + ** Twisted mass doublet parameter and shifts on even/odd sites + **/ + double epsbar; + double epsbar_ig5_odd_shift; + double epsbar_ig5_even_shift; + + /** + ** Twisted mass doublet factor for the preconditioner on each level, l. + ** Default 6 on the coarsest level + ** + ** -> epsbar_o[l] = ( epsbar + i * gamma_5 * epsbar_ig5_odd_shift ) * epsbar_factor[l] + ** -> epsbar_e[l] = ( epsbar + i * gamma_5 * epsbar_ig5_even_shift ) * epsbar_factor[l] + **/ + double epsbar_factor[MAX_MG_LEVELS]; + /** ** Function returning the index of a element at the corresponding ** position (t,z,y,x are local position w.r.t the process ). diff --git a/src/DDalphaAMG_interface.c b/src/DDalphaAMG_interface.c index 03680d4..65f80a2 100644 --- a/src/DDalphaAMG_interface.c +++ b/src/DDalphaAMG_interface.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * Copyright (C) 2016, Simone Bacchio. * * This file is part of the DDalphaAMG solver library. 
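[Editor's note on the new parameter fields added to DDalphaAMG.h above: a minimal usage sketch of how a caller might set the twisted-mass and doublet terms and their per-level factors. It assumes the DDalphaAMG_get_parameters / DDalphaAMG_update_parameters pair used later in this patch and the field names introduced above; the numerical values and the choice of level index are illustrative only.]

  #include "DDalphaAMG.h"

  /* Sketch: enable the twisted-mass and non-degenerate doublet terms and
   * rescale the preconditioner shift on the coarsest of three levels.
   * Values are placeholders, not recommendations.                         */
  static void tune_tm_doublet( void )
  {
    DDalphaAMG_parameters params;
    DDalphaAMG_status status;

    DDalphaAMG_get_parameters( &params );

    params.mu            = 0.005;  /* mu_e/o[l] = (mu + shift) * mu_factor[l] */
    params.mu_even_shift = 0.0;
    params.mu_odd_shift  = 0.0;
    params.mu_factor[2]  = 6.0;    /* default 6 on the coarsest level         */

    params.epsbar                = 0.11;  /* doublet term                     */
    params.epsbar_ig5_even_shift = 0.0;
    params.epsbar_ig5_odd_shift  = 0.0;
    params.epsbar_factor[2]      = 6.0;

    DDalphaAMG_update_parameters( &params, &status );
    /* status.success == 1: parameters updated, == 2: a new setup was run     */
  }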
* @@ -101,7 +101,7 @@ void DDalphaAMG_initialize( DDalphaAMG_init *mg_init, DDalphaAMG_parameters *mg_ threading[i] = NULL; MALLOC( threading[i], struct Thread, 1); } -#pragma omp parallel num_threads(g.num_openmp_processes) + THREADED(g.num_openmp_processes) setup_threading(threading[omp_get_thread_num()], commonthreaddata, &l); g.conf_flag = 0; @@ -128,22 +128,26 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ // int method; if ( mg_params->method != g.method ) { + g.method = mg_params->method; if( g.setup_flag ) { //TODO: test which cases work and what to do for making the other working warning0("Change of method parameter after setup not guaranteed\n"); } - g.method = mg_params->method; } // int interpolation; if ( g.interpolation != mg_params->interpolation ) { - //TODO: test if it always works g.interpolation = mg_params->interpolation; + if( g.setup_flag ) { + //TODO: test which cases work and what to do for making the other working + warning0("Change of interpolation parameter after setup not guaranteed\n"); + } } // int mixed_precision; if ( mg_params->mixed_precision != g.mixed_precision ) { -#ifndef INIT_ONE_PREC + g.mixed_precision = mg_params->mixed_precision; +#ifndef INIT_ONE_PREC //change between 1 and 2 allowed if( g.setup_flag && mg_params->mixed_precision * g.mixed_precision == 0 ) { warning0("Change from mixed_precision==0 to !=0 (or viceversa) needs a new setup.\n"); re_setup++; @@ -152,45 +156,48 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ warning0("Change of mixed_precision needs a new setup.\n"); re_setup++; #endif - g.mixed_precision = mg_params->mixed_precision; } // int block_lattice[MAX_MG_LEVELS][4]; for ( i=0; iblock_lattice[i][j]) { - if (g.setup_flag) - re_setup++; - g.block_lattice[i][j] = mg_params->block_lattice[i][j]; - // TODO: add check + g.block_lattice[i][j] = mg_params->block_lattice[i][j]; + parameter_update(&l); + if (g.setup_flag) { + warning0("Change of block_lattice needs a new setup.\n"); + re_setup++; + } } - + // int mg_basis_vectors[MAX_MG_LEVELS-1]; l_tmp=&l; for ( i=0; img_basis_vectors[i] != g.num_eig_vect[i] ) { + g.num_eig_vect[i] = mg_params->mg_basis_vectors[i]; + if( i==0 ) + parameter_update(&l); if( g.setup_flag ) { - if ( mg_params->mg_basis_vectors[i] < g.num_eig_vect[i] ) - re_projs++; //TODO: check if it works - else - re_setup++; + if ( mg_params->mg_basis_vectors[i] < g.num_eig_vect[i] ) + re_projs++; //TODO: check if this works + else { //TODO just compute the extra vectors + warning0("Increasing mg_basis_vectors needs a new setup.\n"); + re_setup++; + } } - g.num_eig_vect[i] = mg_params->mg_basis_vectors[i]; - if( g.setup_flag || i==0 ) - l_tmp->num_eig_vect = mg_params->mg_basis_vectors[i]; } if( g.setup_flag ) l_tmp = l_tmp->next_level; } - + // int setup_iterations[MAX_MG_LEVELS]; l_tmp=&l; for ( i=0; isetup_iterations[i] != g.setup_iter[i] ) { g.setup_iter[i] = mg_params->setup_iterations[i]; if( (g.setup_flag && i>0) || (!g.setup_flag && i==0) ) - //after setup, l.setup_iter[i] is used as a counter for total number of setup iters - l_tmp->setup_iter = mg_params->setup_iterations[i]; + //after setup, l.setup_iter[i] is used as a counter for total number of setup iters + l_tmp->setup_iter = mg_params->setup_iterations[i]; } if( g.setup_flag ) l_tmp = l_tmp->next_level; @@ -212,22 +219,22 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ if (l_tmp->level > 0) { // double kcycle_tolerance; if ( 
mg_params->kcycle_tolerance != g.kcycle_tol ) { - g.kcycle_tol = mg_params->kcycle_tolerance; - if( g.setup_flag || i==0 ) { - if ( g.mixed_precision ) - l_tmp->p_float.tol = g.kcycle_tol; - else - l_tmp->p_float.tol = g.kcycle_tol; - } + g.kcycle_tol = mg_params->kcycle_tolerance; + if( g.setup_flag || i==0 ) { + if ( g.mixed_precision ) + l_tmp->p_float.tol = g.kcycle_tol; + else + l_tmp->p_float.tol = g.kcycle_tol; + } } } else { // double coarse_tolerance; if ( mg_params->coarse_tolerance != g.coarse_tol ){ - g.coarse_tol = mg_params->coarse_tolerance; - if (g.setup_flag && g.mixed_precision ) - l_tmp->p_float.tol = g.coarse_tol; - else if(g.setup_flag) - l_tmp->p_float.tol = g.coarse_tol; + g.coarse_tol = mg_params->coarse_tolerance; + if (g.setup_flag && g.mixed_precision ) + l_tmp->p_float.tol = g.coarse_tol; + else if(g.setup_flag) + l_tmp->p_float.tol = g.coarse_tol; } } @@ -236,32 +243,74 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ else break; } - + // double kappa; m0 = 1./(2.*mg_params->kappa)-4.; - if( creal(l.dirac_shift)!= m0 ){ + if( g.m0 != m0 ){ + g.m0 = m0; + THREADED(threading[0]->n_core) + if ( g.setup_flag ) + m0_update( g.m0, &l, threading[omp_get_thread_num()] ); + else if ( g.conf_flag ) + m0_update_double( g.m0, &(g.op_double), &l, threading[omp_get_thread_num()] ); re_dirac++; } - + // double mu; // double mu_odd_shift; // double mu_even_shift; // double mu_factor[MAX_MG_LEVELS]; #ifdef HAVE_TM - if( mg_params->mu != g.tm_mu || mg_params->mu_odd_shift != g.tm_mu_odd_shift || - mg_params->mu_even_shift != g.tm_mu_even_shift){ - g.setup_tm_mu = mg_params->mu; - g.tm_mu = mg_params->mu; - g.tm_mu_even_shift = mg_params->mu_even_shift; - g.tm_mu_odd_shift = mg_params->mu_odd_shift; + int update_mu = 0; + for ( i=0; imu_factor[i] != g.mu_factor[i] ) { + g.mu_factor[i] = mg_params->mu_factor[i]; + update_mu = 1; + } + + if( update_mu || mg_params->mu != g.mu || mg_params->mu_odd_shift != g.mu_odd_shift || + mg_params->mu_even_shift != g.mu_even_shift ){ + g.setup_mu = mg_params->mu; + g.mu = mg_params->mu; + g.mu_even_shift = mg_params->mu_even_shift; + g.mu_odd_shift = mg_params->mu_odd_shift; + THREADED(threading[0]->n_core) + if ( g.setup_flag ) + tm_term_update( g.mu, &l, threading[omp_get_thread_num()] ); + else if ( g.conf_flag ) + tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, &(g.op_double), &l, threading[omp_get_thread_num()] ); re_dirac++; } +#else + if ( mg_params->mu != 0 || mg_params->mu_odd_shift != 0 || mg_params->mu_even_shift != 0 ) + warning0("Parameters mu, mu_odd_shift, mu_even_shift not supported without defining HAVE_TM flag."); +#endif + +#ifdef HAVE_TM1p1 + int update_eps = 0; + for ( i=0; imu_factor[i] != g.tm_mu_factor[i] ) { - g.tm_mu_factor[i] = mg_params->mu_factor[i]; - re_dirac++; - } + if (mg_params->epsbar_factor[i] != g.epsbar_factor[i] ) { + g.epsbar_factor[i] = mg_params->epsbar_factor[i]; + update_eps = 1; + } + + if( update_eps || mg_params->epsbar != g.epsbar || mg_params->epsbar_ig5_odd_shift != g.epsbar_ig5_odd_shift || mg_params->epsbar_ig5_even_shift != g.epsbar_ig5_even_shift ){ + g.epsbar = mg_params->epsbar; + g.epsbar_ig5_even_shift = mg_params->epsbar_ig5_even_shift; + g.epsbar_ig5_odd_shift = mg_params->epsbar_ig5_odd_shift; + THREADED(threading[0]->n_core) + if ( g.setup_flag ) + epsbar_term_update( &l, threading[omp_get_thread_num()] ); + else if ( g.conf_flag ) + epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, &(g.op_double), 
&l, threading[omp_get_thread_num()] ); + re_dirac++; + } + +#else + if ( mg_params->epsbar != 0 || mg_params->epsbar_ig5_odd_shift != 0 || mg_params->epsbar_ig5_even_shift != 0 ) + warning0("Parameters epsbar, epsbar_odd_shift, epsbar_even_shift not supported without defining HAVE_TM1p1 flag."); #endif // int (*conf_index_fct)(int t, int z, int y, int x, int mu); @@ -271,78 +320,27 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ // int print; g.print = mg_params->print; - + // UPDATING - if ( re_setup && g.setup_flag ){ - if ( re_dirac ) { - if( creal(l.dirac_shift)!= m0 ) -#pragma omp parallel num_threads(threading[0]->n_core) - shift_update_double( &(g.op_double), m0, &l, threading[omp_get_thread_num()] ); -#ifdef HAVE_TM - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; -#pragma omp parallel num_threads(threading[0]->n_core) - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); -#endif - } - l.dirac_shift = m0; + if ( re_setup && g.setup_flag ){ // destroy and repeate setup DDalphaAMG_setup( mg_status ); // TODO handle status - } else if ( re_projs && g.setup_flag ) { - if ( re_dirac ) { -#pragma omp parallel num_threads(threading[0]->n_core) - if( creal(l.dirac_shift)!= m0 ) { - shift_update_double( &(g.op_double), m0, &l, threading[omp_get_thread_num()] ); - shift_update_float( &(g.op_float), m0, &l, threading[omp_get_thread_num()] ); - if(l.s_double.op.clover != NULL) - shift_update_double( &(l.s_double.op), m0, &l, threading[omp_get_thread_num()] ); - if ( l.s_float.op.clover != NULL ) - shift_update_float( &(l.s_float.op), m0, &l, threading[omp_get_thread_num()] ); - } -#ifdef HAVE_TM - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; -#pragma omp parallel num_threads(threading[0]->n_core) - { - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); - tm_term_float_setup( g.op_float.tm_term, g.op_float.odd_proj, &l, threading[omp_get_thread_num()] ); - if(l.s_double.op.tm_term != NULL) - tm_term_double_setup( l.s_double.op.tm_term, l.s_double.op.odd_proj, &l, threading[omp_get_thread_num()] ); - if ( l.s_float.op.tm_term != NULL ) - tm_term_float_setup( l.s_float.op.tm_term, l.s_float.op.odd_proj, &l, threading[omp_get_thread_num()] ); - } -#endif - } - l.dirac_shift = m0; + } else if ( re_projs && g.setup_flag ) { //project again the operators if ( g.mixed_precision ) -#pragma omp parallel num_threads(threading[0]->n_core) - re_setup_float( &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) + re_setup_float( &l, threading[omp_get_thread_num()] ); else -#pragma omp parallel num_threads(threading[0]->n_core) - re_setup_double( &l, threading[omp_get_thread_num()] ); - - } else if ( (re_dirac && g.conf_flag) || re_projs || re_setup ) { - if (g.setup_flag ) -#pragma omp parallel num_threads(threading[0]->n_core) - optimized_shift_update( m0, &l, threading[omp_get_thread_num()]); - else { - if( creal(l.dirac_shift)!= m0 ) -#pragma omp parallel num_threads(threading[0]->n_core) - shift_update_double( &(g.op_double), m0, &l, threading[omp_get_thread_num()] ); -#ifdef HAVE_TM - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; -#pragma omp parallel num_threads(threading[0]->n_core) - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); 
-#endif - } + THREADED(threading[0]->n_core) + re_setup_double( &l, threading[omp_get_thread_num()] ); + + } else if ( re_dirac && g.setup_flag ) { //update just the oddeven and vecorized operators + THREADED(threading[0]->n_core) + finalize_operator_update( &l, threading[omp_get_thread_num()]); } + DDalphaAMG_get_parameters( mg_params ); - + t1 = MPI_Wtime(); mg_status->success = 1+re_setup;// 1: OK, 2: re_setup done @@ -353,7 +351,7 @@ void DDalphaAMG_update_parameters( DDalphaAMG_parameters *mg_params, DDalphaAMG_ } void DDalphaAMG_change_mu_sign( DDalphaAMG_status *mg_status ) { - + double t0, t1; t0 = MPI_Wtime(); g.coarse_time = 0; @@ -362,33 +360,35 @@ void DDalphaAMG_change_mu_sign( DDalphaAMG_status *mg_status ) { mg_status->success = 0; mg_status->info = 0; - g.tm_mu *= -1; - g.tm_mu_even_shift *= -1; - g.tm_mu_odd_shift *= -1; +#ifdef HAVE_TM + g.mu *= -1; + g.mu_even_shift *= -1; + g.mu_odd_shift *= -1; if (g.conf_flag && !g.setup_flag ) { - l.tm_shift = g.tm_mu; - l.tm_even_shift = g.tm_mu_even_shift; - l.tm_odd_shift = g.tm_mu_odd_shift; - -#pragma omp parallel num_threads(threading[0]->n_core) - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, &l, threading[omp_get_thread_num()]); + THREADED(threading[0]->n_core) + tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, &(g.op_double), &l, threading[omp_get_thread_num()]); } else if (g.conf_flag && g.setup_flag ) -#pragma omp parallel num_threads(threading[0]->n_core) - optimized_shift_update( l.dirac_shift, &l, threading[omp_get_thread_num()]); - + THREADED(threading[0]->n_core) { + tm_term_update( g.mu, &l, threading[omp_get_thread_num()] ); + finalize_operator_update( &l, threading[omp_get_thread_num()] ); + } + mg_status->info = g.mu; +#else + warning0("DDalphaAMG_change_mu_sign called without the flag HAVE_TM enabled. 
Doing nothing.\n"); + mg_status->info = 0; +#endif + t1 = MPI_Wtime(); mg_status->success = 1;// 1: OK, 2: re_setup done mg_status->time = t1-t0; - mg_status->info = g.tm_mu; mg_status->coarse_time = g.coarse_time; } - void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_status ) { int t, z, y, x, mu, i, j, k; @@ -407,28 +407,28 @@ void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_st if ( g.print > 0 ) printf0("%s\n", CLIFFORD_BASIS ); if ( g.bc == _ANTIPERIODIC ) printf0("antiperiodic in time"); else if ( g.bc == _TWISTED ) printf0("twisted (%.2f, %.2f, %.2f, %.2f)", g.twisted_bc[0], - g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); + g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); else printf0("periodic in time"); printf0(" boundary conditions \n"); SU3_storage_alloc( &U, &l ); - + if(g.bc == _ANTIPERIODIC && onb[T] ) { phase[Z] = 1; phase[Y] = 1; phase[X] = 1; for ( t=1, i=0, k=0; t 0 ) printf0("Configuration stored...\n"); - + compute_clover_term( U, &l ); // calculate the plaquette @@ -482,7 +482,7 @@ void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_st SU3_storage_free( &U, &l ); //END: dirac_setup - + mg_status->success = 1; g.conf_flag = 1; mg_status->info = g.plaq; @@ -492,7 +492,7 @@ void DDalphaAMG_set_configuration( double *gauge_field, DDalphaAMG_status *mg_st schwarz_double_setup( &(l.s_double), &(g.op_double), &l ); if(l.s_float.op.clover != NULL) schwarz_float_setup( &(l.s_float), &(g.op_double), &l ); -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if ( g.mixed_precision ) operator_updates_float( &l, threading[omp_get_thread_num()] ); else @@ -522,7 +522,7 @@ void DDalphaAMG_setup( DDalphaAMG_status * mg_status ) { if(g.conf_flag == 1) { if ( g.setup_flag ) method_free( &l ); -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) { method_setup( NULL, &l, threading[omp_get_thread_num()] ); method_update( g.setup_iter[0], &l, threading[omp_get_thread_num()] ); @@ -552,7 +552,7 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) { mg_status->success = 0; mg_status->info = 0; -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) method_update( iterations, &l, threading[omp_get_thread_num()] ); // method_update( iterations, &l, no_threading ); @@ -571,13 +571,101 @@ void DDalphaAMG_update_setup( int iterations, DDalphaAMG_status * mg_status ) { } } +static inline void vector_copy( vector_double vector_out, vector_double vector_in ) +{ + THREADED(threading[0]->n_core) { + int start = threading[omp_get_thread_num()]->start_index[0], + end = threading[omp_get_thread_num()]->end_index[0]; + vector_double_copy( vector_out, vector_in, start, end, &l ); + } +} + +static inline void solver( ) +{ + THREADED(threading[0]->n_core) + if ( g.method == -1 ) { + cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); + } else if ( g.mixed_precision == 2 ) { + fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); + } else { + fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); + } +} + +static inline void correct_guess( vector_double guess, vector_double solution, vector_double solution2, + double even_dshift, double odd_dshift ) +{ + // guess = D^{-1}*rhs - i*dshift*D^{-2}*rhs + THREADED(threading[0]->n_core) { + int start = threading[omp_get_thread_num()]->start_index[0], + end = threading[omp_get_thread_num()]->end_index[0]; + if( odd_dshift == 0 
|| even_dshift == 0 || even_dshift == odd_dshift ) { + double dshift = ( odd_dshift == 0 ) ? even_dshift:odd_dshift; + printf0("correcting with dshift %le\n", dshift); + vector_double_scale( guess, solution2, -I*dshift, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()]); + vector_double_plus( guess, guess, solution, start, end, &l ); + } else + vector_double_copy( guess, solution, start, end, &l ); + } +} + +static inline void change_epsbar_shift_sign( ) { + +#ifdef HAVE_TM1p1 + if ( g.epsbar_ig5_even_shift !=0 || g.epsbar_ig5_odd_shift !=0 ) { + g.epsbar_ig5_even_shift *= -1; + g.epsbar_ig5_odd_shift *= -1; + + if (g.conf_flag && !g.setup_flag ) { + + THREADED(threading[0]->n_core) { + epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, &(g.op_double), + &l, threading[omp_get_thread_num()]); + } + } else if (g.conf_flag && g.setup_flag ) + THREADED(threading[0]->n_core) { + epsbar_term_update( &l, threading[omp_get_thread_num()] ); + finalize_operator_update( &l, threading[omp_get_thread_num()] ); + } + } +#else + warning0("change_epsbar_shift_sign called without the flag HAVE_TM1p1 enabled. Doing nothing.\n"); +#endif +} enum {_SOLVE, _SOLVE_SQ, _SOLVE_SQ_ODD, _SOLVE_SQ_EVEN, _PRECOND, _OPERATOR}; -void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status *mg_status, int _TYPE ) { + +// NOTE RESIDUAL +// +// The _SOLVE_SQ invert the squared operator in two inversion. +// One has to be careful to return a solution with the right residual. +// +// We have: +// D^2 = Dd D +// +// Dd D x = b direct solution +// Dd y = b first step +// D x = y second step +// +// r1 = Dd y - b +// r2 = D x - y +// r = Dd D x - b = Dd r2 + r1 +// +// |r| < tol --> |r| < |Dd| |r2| + |r1| < tol +// +// For the residual we do +// |r1| < tol/2 +// |r2| < (tol - |r1|)/|Dd| using |Dd|=8 since is |Dd|<8 +// +// With relative residual we have +// |r1|/|b| < tol/2 +// |r2|/|y| < (tol - |r1|)/|Dd|*|b|/|y| + +static inline void DDalphaAMG_driver( double *vector1_out, double *vector1_in, double *vector2_out, double *vector2_in, double tol, DDalphaAMG_status *mg_status, int _TYPE ) { int t, z, y, x, i, j, k, mu, *ll = l.local_lattice, *gl=l.global_lattice, sl[4], precision_changed; - complex_double twisted_bc, tmp; - double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp; + complex_double twisted_bc, tmp1, tmp2; + double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); vector_double vb, rhs = p->b; vector_double vx, sol = p->x; @@ -590,63 +678,97 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status g.coarse_iter_count = 0; mg_status->success = 0; mg_status->info = 0; + + ASSERT(vector1_out!=NULL); + ASSERT(vector1_in!=NULL); +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) { + ASSERT(vector2_out!=NULL); + ASSERT(vector2_in!=NULL); + } +#endif + + if(g.mixed_precision!=2) + g.p.tol = tol; + else + g.p_MP.dp.tol = tol; for (i=0; i<4; i++) sl[i] = ll[i]*g.my_coords[i]; - /* - #ifndef INIT_ONE_PREC - if ( g.mixed_precision==2 || vector_index_fct!=NULL || g.bc==_TWISTED) - #else - if ( vector_index_fct!=NULL || g.bc==_TWISTED) - #endif - */ + for (t=0, j=0; t vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin=vtmp; + vtmp=cabs(rhs[j+6]); + if(vtmp > vmax) + vmax=vtmp; + if( vtmp > 
EPS_double && vtmp < vmin ) + vmin=vtmp; + } + } +#endif + if(mu%2) + j+=6; + } + } else +#endif + for ( mu=0; mu<4; mu++ ) + for ( k=0; k<3; k++, j++ ) { #ifndef BASIS4 - rhs[j] = ((complex_double)vector_in[i+2*(k+3*mu)] + I*(complex_double)vector_in[i+2*(k+3*mu)+1]) * twisted_bc; + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - rhs[j] = ((complex_double)vector_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; #endif - + #ifndef INIT_ONE_PREC - if(g.mixed_precision==2) { - vtmp=cabs(rhs[j]); - if(vtmp > vmax) - vmax=vtmp; - if( vtmp > EPS_double && vtmp < vmin ) - vmin=vtmp; - } - } + if(g.mixed_precision==2) { + vtmp=cabs(rhs[j]); + if(vtmp > vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin=vtmp; + } + } #endif - } + } } } } - - /* - else { - p->b = (vector_double) vector_in; - p->x = (vector_double) vector_out; - } - */ - + #ifndef INIT_ONE_PREC double gvmin, gvmax; @@ -667,112 +789,148 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status p->b = g.p_MP.dp.b; p->x = g.p_MP.dp.x; p->tol = g.p_MP.dp.tol; - } else precision_changed=0; + } else precision_changed = 0; #endif - + switch(_TYPE) { case _SOLVE : -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + solver( ); break; case _SOLVE_SQ : -#pragma omp parallel num_threads(threading[0]->n_core) - { - // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs - gamma5_double(rhs, rhs, &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol/2.; + solver( ); - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + else +#endif + gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol-g.norm_res)*nrhs/nrhs2/8.; + solver( ); - gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); - } - DDalphaAMG_change_mu_sign( &tmp_status ); -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( 
g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); break; case _SOLVE_SQ_ODD : -#pragma omp parallel num_threads(threading[0]->n_core) - { - // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs - vector_double_gamma5_set_even_to_zero(rhs, rhs, &l, threading[omp_get_thread_num()]); - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } - vector_double_gamma5_set_even_to_zero(rhs, sol, &l, threading[omp_get_thread_num()]); - } - DDalphaAMG_change_mu_sign( &tmp_status ); -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol/2.; + solver( ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); break; case _SOLVE_SQ_EVEN : -#pragma omp parallel num_threads(threading[0]->n_core) - { - // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs - vector_double_gamma5_set_odd_to_zero(rhs, rhs, &l, threading[omp_get_thread_num()]); - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } - vector_double_gamma5_set_odd_to_zero(rhs, sol, &l, threading[omp_get_thread_num()]); - } - DDalphaAMG_change_mu_sign( &tmp_status ); -#pragma omp parallel num_threads(threading[0]->n_core) - if ( g.method == -1 ) { - cgn_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } else if ( g.mixed_precision == 2 ) { - fgmres_MP( &(g.p_MP), &l, threading[omp_get_thread_num()] ); - } else { - fgmres_double( &(g.p), &l, threading[omp_get_thread_num()] ); - } + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = 
(D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol/2.; + solver( ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + // DDalphaAMG_change_mu_sign( &tmp_status ); warning0("sign of mu changed during the inversion of squared operator\n"); break; case _PRECOND : -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) preconditioner( sol, NULL, rhs, _NO_RES, &l, threading[omp_get_thread_num()] ); break; case _OPERATOR : -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if ( g.mixed_precision == 2 ) { apply_operator_double( sol, rhs, &(g.p_MP.dp), &l, threading[omp_get_thread_num()] ); } else { @@ -786,52 +944,538 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status break; } - /* + for (t=0, j=0; tb = vb; + p->x = vx; + } +#endif + + if ( g.norm_res <= tol || _TYPE == _OPERATOR || _TYPE == _PRECOND ) + mg_status->success = 1; + mg_status->info = g.norm_res; + t1 = MPI_Wtime(); + mg_status->time = t1-t0; + mg_status->coarse_time = g.coarse_time; + mg_status->iter_count = g.iter_count; + mg_status->coarse_iter_count = g.coarse_iter_count; + +} + +static inline void DDalphaAMG_ms_driver( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status, int _TYPE ) +{ + int t, z, y, x, i, j, k, n, mu, *ll = l.local_lattice, *gl=l.global_lattice, sl[4], precision_changed; + complex_double twisted_bc, tmp1, tmp2; + double phase[4] = {_COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO, _COMPLEX_double_ZERO}, + vmin=1, vmax=EPS_float, vtmp, nrhs, nrhs2; + gmres_double_struct *p = g.mixed_precision==2?&(g.p_MP.dp):&(g.p); + vector_double vb, rhs = p->b; + vector_double vx, sol = p->x; + vector_double source = NULL, solution = NULL, solution2 = NULL; + DDalphaAMG_status tmp_status; + + double t0, t1; + t0 = MPI_Wtime(); + g.coarse_time = 0; + g.iter_count = 0; + g.coarse_iter_count = 0; + mg_status->success = 0; + mg_status->info = 0; + + ASSERT(vector1_out!=NULL); + ASSERT(vector1_in!=NULL); + ASSERT(tol!=NULL); +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) { + ASSERT(vector2_out!=NULL); + ASSERT(vector2_in!=NULL); + } +#endif + + if(g.mixed_precision!=2) + g.p.tol = tol[0]; + else + g.p_MP.dp.tol = tol[0]; + + for (i=0; i<4; i++) + sl[i] = ll[i]*g.my_coords[i]; + + for (t=0, j=0; t vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin=vtmp; + vtmp=cabs(rhs[j+6]); + if(vtmp > vmax) + vmax=vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + 
vmin=vtmp; + } + } +#endif + if(mu%2) + j+=6; + } + } else +#endif + for ( mu=0; mu<4; mu++ ) + for ( k=0; k<3; k++, j++ ) { +#ifndef BASIS4 + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*mu)] + I*(complex_double)vector1_in[i+2*(k+3*mu)+1]) * twisted_bc; #else - if ( vector_index_fct!=NULL || g.bc==_TWISTED) + rhs[j] = ((complex_double)vector1_in[i+2*(k+3*(3-mu))] + I*(complex_double)vector1_in[i+2*(k+3*(3-mu))+1]) * twisted_bc; +#endif + +#ifndef INIT_ONE_PREC + if( g.mixed_precision == 2 ) { + vtmp = cabs(rhs[j]); + if(vtmp > vmax) + vmax = vtmp; + if( vtmp > EPS_double && vtmp < vmin ) + vmin = vtmp; + } + } +#endif + } + } + } + } + +#ifndef INIT_ONE_PREC + + double gvmin, gvmax; + if( g.mixed_precision == 2 ) { + MPI_Allreduce(&vmin, &gvmin, 1, MPI_DOUBLE, MPI_MIN, g.comm_cart); + MPI_Allreduce(&vmax, &gvmax, 1, MPI_DOUBLE, MPI_MAX, g.comm_cart); + } + + //switching to double precision on the fine level + if(g.mixed_precision==2 && gvmin/gvmaxb; + vx = p->x; + p->b = g.p_MP.dp.b; + p->x = g.p_MP.dp.x; + p->tol = g.p_MP.dp.tol; + } else precision_changed = 0; +#endif + + if ( n_shifts > 0 ) { + ASSERT( even_shifts != NULL ); + ASSERT( odd_shifts != NULL ); + } + if ( n_shifts > 1 ) { + MALLOC( source, complex_double, l.inner_vector_size ); + MALLOC( solution, complex_double, l.inner_vector_size ); + if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) + MALLOC( solution2, complex_double, l.inner_vector_size ); + } + + for ( n = 0; n < n_shifts; n++ ) { + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) { + if( g.epsbar_ig5_even_shift != even_shifts[n] || g.epsbar_ig5_odd_shift != odd_shifts[n] ) { + g.epsbar_ig5_even_shift = even_shifts[n]; + g.epsbar_ig5_odd_shift = odd_shifts[n]; + THREADED(threading[0]->n_core) + epsbar_term_update( &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) + finalize_operator_update( &l, threading[omp_get_thread_num()]); + } + } else +#endif + { +#ifdef HAVE_TM + if( g.mu_even_shift != even_shifts[n] || g.mu_odd_shift != odd_shifts[n] ) { + g.mu_even_shift = even_shifts[n]; + g.mu_odd_shift = odd_shifts[n]; + THREADED(threading[0]->n_core) + tm_term_update( g.mu, &l, threading[omp_get_thread_num()] ); + THREADED(threading[0]->n_core) + finalize_operator_update( &l, threading[omp_get_thread_num()]); + } +#endif + } + + p->tol = tol[n]; + + switch(_TYPE) { + + case _SOLVE : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) + vector_copy( source, rhs ); + + solver( ); + break; + + + case _SOLVE_SQ : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) { + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_double( rhs, rhs, &l, threading[omp_get_thread_num()] ); + vector_copy( source, rhs ); + } + + if( n ) + correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol[n]/2.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution, sol ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + else +#endif + 
gamma5_double(rhs, sol, &l, threading[omp_get_thread_num()] ); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + if( n ) + vector_copy( sol, solution2 ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution2, sol ); + + // DDalphaAMG_change_mu_sign( &tmp_status ); + warning0("sign of mu changed during the inversion of squared operator\n"); + break; + + + case _SOLVE_SQ_ODD : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) { + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else #endif - */ + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_even_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + vector_copy( source, rhs ); + } + + if( n ) + correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol[n]/2.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution, sol ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_even_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + if( n ) + vector_copy( sol, solution2 ); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution2, sol ); + + // DDalphaAMG_change_mu_sign( &tmp_status ); + warning0("sign of mu changed during the inversion of squared operator\n"); + break; + + + case _SOLVE_SQ_EVEN : + if ( n ) { + vector_copy( rhs, source ); + p->initial_guess_zero = 0; + } else if ( n_shifts > 1 ) { + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + // sol = (D_h^{-1})*g5*tau1*(D_h^{-1})*g5*tau1*rhs + tau1_gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + else +#endif + // sol = (D_d^{-1})*g5*(D_u^{-1})*g5*rhs + gamma5_set_odd_to_zero_double(rhs, rhs, &l, threading[omp_get_thread_num()]); + + vector_copy( source, rhs ); + } + + if( n ) + correct_guess( sol, solution, solution2, even_shifts[n]-even_shifts[n-1], odd_shifts[n]-odd_shifts[n-1]); + + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = tol[n]/2.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution, sol ); + + THREADED(threading[0]->n_core) +#ifdef HAVE_TM1p1 + if(g.n_flavours==2) + tau1_gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + else +#endif + gamma5_set_odd_to_zero_double(rhs, sol, &l, threading[omp_get_thread_num()]); + +#ifdef HAVE_TM1p1 + 
if(g.n_flavours==2) + change_epsbar_shift_sign( ); + else +#endif + DDalphaAMG_change_mu_sign( &tmp_status ); + + if( n ) + vector_copy( sol, solution2 ); + // read NOTE RESIDUAL + THREADED(threading[0]->n_core) + nrhs2 = global_norm_double( rhs, p->v_start, p->v_end, &l, threading[omp_get_thread_num()] ); + p->tol = (tol[n]-g.norm_res)*nrhs/nrhs2/8.; + solver( ); + if ( n < n_shifts-1 ) + vector_copy( solution2, sol ); + + // DDalphaAMG_change_mu_sign( &tmp_status ); + warning0("sign of mu changed during the inversion of squared operator\n"); + break; + + + case _PRECOND : + THREADED(threading[0]->n_core) + preconditioner( sol, NULL, rhs, _NO_RES, &l, threading[omp_get_thread_num()] ); + break; + + + case _OPERATOR : + THREADED(threading[0]->n_core) + if ( g.mixed_precision == 2 ) { + apply_operator_double( sol, rhs, &(g.p_MP.dp), &l, threading[omp_get_thread_num()] ); + } else { + apply_operator_double( sol, rhs, &(g.p), &l, threading[omp_get_thread_num()] ); + } + break; + + + default : + warning0("_TYPE not found in DDalphaAMG_driver. Returing vector in as vector out."); + sol=rhs; + break; + } + for (t=0, j=0; tb = rhs; - p->x = sol; + + } + + p->initial_guess_zero = 1; + if ( n_shifts > 0 ) { + FREE( source, complex_double, l.inner_vector_size ); + FREE( solution, complex_double, l.inner_vector_size ); + if( _TYPE == _SOLVE_SQ || _TYPE == _SOLVE_SQ_ODD || _TYPE == _SOLVE_SQ_EVEN ) + FREE( solution2, complex_double, l.inner_vector_size ); } - */ + #ifndef INIT_ONE_PREC if (precision_changed) { g.mixed_precision=2; @@ -840,95 +1484,152 @@ void DDalphaAMG_driver( double *vector_out, double *vector_in, DDalphaAMG_status p->x = vx; } #endif - + + if ( g.norm_res <= p->tol || _TYPE == _OPERATOR || _TYPE == _PRECOND ) + mg_status->success = 1; mg_status->info = g.norm_res; t1 = MPI_Wtime(); mg_status->time = t1-t0; mg_status->coarse_time = g.coarse_time; mg_status->iter_count = g.iter_count; mg_status->coarse_iter_count = g.coarse_iter_count; - + +} + +static inline void set_n_flavours( int n) { + +#ifdef HAVE_TM1p1 + THREADED(threading[0]->n_core) + data_layout_n_flavours( n, &l, threading[omp_get_thread_num()] ); +#else + if( n==2 ) + error0("For DDalphaAMG_solve_doublet_*, HAVE_TM1p1 flag required\n"); +#endif + } void DDalphaAMG_solve( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { - - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE ); + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, mg_status, _SOLVE ); +} - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE ); + set_n_flavours( 1 ); +} +void DDalphaAMG_solve_ms_doublet( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE ); + set_n_flavours( 1 ); } void DDalphaAMG_solve_squared( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, 
mg_status, _SOLVE_SQ ); +} - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE_SQ ); - - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet_squared( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE_SQ ); + set_n_flavours( 1 ); +} +void DDalphaAMG_solve_ms_doublet_squared( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE_SQ ); + set_n_flavours( 1 ); } void DDalphaAMG_solve_squared_odd( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, mg_status, _SOLVE_SQ_ODD ); +} - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE_SQ_ODD ); - - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet_squared_odd( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE_SQ_ODD ); + set_n_flavours( 1 ); +} + +void DDalphaAMG_solve_ms_doublet_squared_odd( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE_SQ_ODD ); + set_n_flavours( 1 ); } void DDalphaAMG_solve_squared_even( double *vector_out, double *vector_in, double tol, DDalphaAMG_status *mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, tol, mg_status, _SOLVE_SQ_EVEN ); +} - if(g.mixed_precision!=2) { - g.p.tol = tol; - } - else { - g.p_MP.dp.tol = tol; - } - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _SOLVE_SQ_EVEN ); - - if ( g.norm_res <= tol ) - mg_status->success = 1; +void DDalphaAMG_solve_doublet_squared_even( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, + double tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, tol, mg_status, _SOLVE_SQ_EVEN ); + set_n_flavours( 1 ); } +void DDalphaAMG_solve_ms_doublet_squared_even( double **vector1_out, double *vector1_in, + double **vector2_out, double *vector2_in, + double *even_shifts, double *odd_shifts, int n_shifts, + double *tol, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_ms_driver( vector1_out, vector1_in, vector2_out, vector2_in, even_shifts, odd_shifts, n_shifts, + tol, mg_status, _SOLVE_SQ_EVEN ); + set_n_flavours( 1 ); +} void DDalphaAMG_apply_operator( double *vector_out, double *vector_in, DDalphaAMG_status *mg_status ) { - - DDalphaAMG_driver( vector_out, vector_in, mg_status, _OPERATOR ); - - 
mg_status->success = 1; + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, 0, mg_status, _OPERATOR ); +} + +void DDalphaAMG_apply_operator_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, 0, mg_status, _OPERATOR ); + set_n_flavours( 1 ); } void DDalphaAMG_preconditioner( double *vector_out, double *vector_in, DDalphaAMG_status * mg_status ) { + DDalphaAMG_driver( vector_out, vector_in, NULL, NULL, 0, mg_status, _PRECOND ); +} - DDalphaAMG_driver( vector_out, vector_in, mg_status, _PRECOND ); - - mg_status->success = 1; +void DDalphaAMG_preconditioner_doublet( double *vector1_out, double *vector1_in, + double *vector2_out, double *vector2_in, DDalphaAMG_status *mg_status ) +{ + set_n_flavours( 2 ); + DDalphaAMG_driver( vector1_out, vector1_in, vector2_out, vector2_in, 0, mg_status, _PRECOND ); + set_n_flavours( 1 ); } void DDalphaAMG_free( void ) { @@ -949,7 +1650,7 @@ void DDalphaAMG_finalize( void ) { if (g.setup_flag) method_free( &l ); method_finalize( &l ); - + } MPI_Comm DDalphaAMG_get_communicator( void ){ @@ -989,11 +1690,14 @@ void DDalphaAMG_write_vector( double *vector_out, char *filename, int format, DD void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if(vector!=NULL){ - int start, end; - compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define( (vector_double) vector, re+I*im, start, end, &l ); + if ( re && im ) + vector_double_define( (vector_double) vector, re+I*im, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else if ( re ) + vector_double_define_real( (vector_double) vector, re, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else + vector_double_define_zero( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); } else { warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); @@ -1002,14 +1706,12 @@ void DDalphaAMG_define_vector_const( double *vector, double re, double im ) { void DDalphaAMG_define_vector_rand( double *vector ) { -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) if(vector!=NULL){ - int start, end; - compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_define_random( (vector_double) vector, start, end, &l ); + vector_double_define_random( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); } else { - warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); + warning0("Vector NULL when calling DDalphaAMG_define_vector_rand!"); } } @@ -1017,29 +1719,23 @@ void DDalphaAMG_define_vector_rand( double *vector ) { double DDalphaAMG_vector_norm( double *vector ) { double norm = 0; -#pragma omp parallel num_threads(threading[0]->n_core) - if(vector!=NULL){ - int start, end; - norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); - } - else { - warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); - } + if(vector!=NULL) + THREADED(threading[0]->n_core) + norm = global_norm_double( (vector_double) vector, 0, l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else + warning0("Vector NULL when calling 
DDalphaAMG_define_vector_norm!"); return norm; } void DDalphaAMG_vector_saxpy( double *vector_out, double a, double *x, double *y ) { - #pragma omp parallel num_threads(threading[0]->n_core) - if(vector_out!=NULL && x!=NULL && y!=NULL){ - int start, end; - compute_core_start_end( 0, l.inner_vector_size, &start, &end, &l, threading[omp_get_thread_num()]); - vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, start, end, &l ); - } - else { - warning0("Vector NULL when calling DDalphaAMG_define_vector_const!"); - } + if(vector_out!=NULL && x!=NULL && y!=NULL) + THREADED(threading[0]->n_core) + vector_double_saxpy( (vector_double) vector_out, (vector_double) x, (vector_double) y, a, 0, + l.inner_vector_size, &l, threading[omp_get_thread_num()] ); + else + warning0("Vector NULL when calling DDalphaAMG_define_vector_saxpy!"); } @@ -1049,7 +1745,7 @@ void DDalphaAMG_test_routine( DDalphaAMG_status *mg_status ) { t0 = MPI_Wtime(); printf00("\n"); -#pragma omp parallel num_threads(threading[0]->n_core) + THREADED(threading[0]->n_core) test_routine( &l, threading[omp_get_thread_num()]); if (g.test < 1e-5) @@ -1073,14 +1769,29 @@ void DDalphaAMG_get_parameters( DDalphaAMG_parameters *mg_params ){ mg_params->mixed_precision = g.mixed_precision; mg_params->kcycle_tolerance = g.kcycle_tol; mg_params->coarse_tolerance = g.coarse_tol; + mg_params->smoother_iterations = g.post_smooth_iter[0]; mg_params->conf_index_fct = conf_index_fct; mg_params->vector_index_fct = vector_index_fct; - mg_params->kappa = 0.5/(l.real_shift + 4.); - mg_params->mu = g.tm_mu; - mg_params->mu_odd_shift = g.tm_mu_odd_shift; - mg_params->mu_even_shift = g.tm_mu_even_shift; + mg_params->kappa = 0.5/(g.m0 + 4.); +#ifdef HAVE_TM + mg_params->mu = g.mu; + mg_params->mu_odd_shift = g.mu_odd_shift; + mg_params->mu_even_shift = g.mu_even_shift; +#else + mg_params->mu = 0; + mg_params->mu_odd_shift = 0; + mg_params->mu_even_shift = 0; +#endif +#ifdef HAVE_TM1p1 + mg_params->epsbar = g.epsbar; + mg_params->epsbar_ig5_odd_shift = g.epsbar_ig5_odd_shift; + mg_params->epsbar_ig5_even_shift = g.epsbar_ig5_even_shift; +#else + mg_params->epsbar = 0; + mg_params->epsbar_ig5_odd_shift = 0; + mg_params->epsbar_ig5_even_shift = 0; +#endif mg_params->print = g.print; - mg_params->smoother_iterations = g.post_smooth_iter[0]; for( i=0; img_basis_vectors[i] = g.num_eig_vect[i]; mg_params->setup_iterations[i] = g.setup_iter[i]; - mg_params->mu_factor[i] = g.tm_mu_factor[i]; +#ifdef HAVE_TM + mg_params->mu_factor[i] = g.mu_factor[i]; +#else + mg_params->mu_factor[i] = 1; +#endif +#ifdef HAVE_TM1p1 + mg_params->epsbar_factor[i] = g.epsbar_factor[i]; +#else + mg_params->epsbar_factor[i] = 1; +#endif } } diff --git a/src/blas_vectorized.h b/src/blas_vectorized.h deleted file mode 100644 index 2afa928..0000000 --- a/src/blas_vectorized.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef BLAS_VECTORIZED_H -#define BLAS_VECTORIZED_H - -// BLAS naming convention: LDA = leading dimension of A -#ifdef SSE -#include "sse_blas_vectorized.h" -#endif - -// C=A*B+C -static inline void cgemv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgemv( N, A, lda, B, C ); -#endif -} - -// C=-A*B+C -static inline void cgenmv(const int N, const OPERATOR_TYPE_float *A, int lda, const float *B, float *C) -{ -#ifdef SSE - sse_cgenmv( N, A, lda, B, C ); -#endif -} - - -static inline void cgem_inverse(const int N, OPERATOR_TYPE_float *A_inverse, OPERATOR_TYPE_float *A, int lda) -{ -#ifdef SSE - sse_cgem_inverse( N, A_inverse, A, lda ); -#endif -} - -#endif // BLAS_VECTORIZED_H diff --git a/src/clifford.h b/src/clifford.h index 6521566..683b062 100644 --- a/src/clifford.h +++ b/src/clifford.h @@ -653,14 +653,27 @@ #endif #endif -#ifdef SSE static const int gamma_co[4][4] = { {GAMMA_T_SPIN0_CO, GAMMA_T_SPIN1_CO, GAMMA_T_SPIN2_CO, GAMMA_T_SPIN3_CO}, {GAMMA_Z_SPIN0_CO, GAMMA_Z_SPIN1_CO, GAMMA_Z_SPIN2_CO, GAMMA_Z_SPIN3_CO}, {GAMMA_Y_SPIN0_CO, GAMMA_Y_SPIN1_CO, GAMMA_Y_SPIN2_CO, GAMMA_Y_SPIN3_CO}, {GAMMA_X_SPIN0_CO, GAMMA_X_SPIN1_CO, GAMMA_X_SPIN2_CO, GAMMA_X_SPIN3_CO}}; -static const double complex gamma_val[4][4] = { +#ifdef HAVE_TM1p1 +static const int gamma_doublet_offset[4][4] = { + {GAMMA_T_SPIN0_CO/2, GAMMA_T_SPIN1_CO/2, GAMMA_T_SPIN2_CO/2, GAMMA_T_SPIN3_CO/2}, + {GAMMA_Z_SPIN0_CO/2, GAMMA_Z_SPIN1_CO/2, GAMMA_Z_SPIN2_CO/2, GAMMA_Z_SPIN3_CO/2}, + {GAMMA_Y_SPIN0_CO/2, GAMMA_Y_SPIN1_CO/2, GAMMA_Y_SPIN2_CO/2, GAMMA_Y_SPIN3_CO/2}, + {GAMMA_X_SPIN0_CO/2, GAMMA_X_SPIN1_CO/2, GAMMA_X_SPIN2_CO/2, GAMMA_X_SPIN3_CO/2}}; +#endif + +static const complex_double gamma_val_double[4][4] = { + {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, + {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, + {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, + {GAMMA_X_SPIN0_VAL, GAMMA_X_SPIN1_VAL, GAMMA_X_SPIN2_VAL, GAMMA_X_SPIN3_VAL}}; + +static const complex_float gamma_val_float[4][4] = { {GAMMA_T_SPIN0_VAL, GAMMA_T_SPIN1_VAL, GAMMA_T_SPIN2_VAL, GAMMA_T_SPIN3_VAL}, {GAMMA_Z_SPIN0_VAL, GAMMA_Z_SPIN1_VAL, GAMMA_Z_SPIN2_VAL, GAMMA_Z_SPIN3_VAL}, {GAMMA_Y_SPIN0_VAL, GAMMA_Y_SPIN1_VAL, GAMMA_Y_SPIN2_VAL, GAMMA_Y_SPIN3_VAL}, @@ -683,6 +696,5 @@ static const int gamma_im_sign[4][4] = { {GAMMA_Z_SPIN0_IM_SIGN,GAMMA_Z_SPIN1_IM_SIGN,GAMMA_Z_SPIN2_IM_SIGN,GAMMA_Z_SPIN3_IM_SIGN}, {GAMMA_Y_SPIN0_IM_SIGN,GAMMA_Y_SPIN1_IM_SIGN,GAMMA_Y_SPIN2_IM_SIGN,GAMMA_Y_SPIN3_IM_SIGN}, {GAMMA_X_SPIN0_IM_SIGN,GAMMA_X_SPIN1_IM_SIGN,GAMMA_X_SPIN2_IM_SIGN,GAMMA_X_SPIN3_IM_SIGN}}; -#endif #endif diff --git a/src/coarse_coupling_generic.c b/src/coarse_coupling_generic.c new file mode 100644 index 0000000..12bd8e5 --- /dev/null +++ b/src/coarse_coupling_generic.c @@ -0,0 +1,1369 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. 
+ * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#include "main.h" + +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION +void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ) { + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) + + double t0, t1; + t0 = MPI_Wtime(); + + int mu, j, n = l->num_eig_vect, num_aggregates = l->is_PRECISION.num_agg, + aggregate_sites = l->num_inner_lattice_sites / num_aggregates, + clover_site_size = (l->num_eig_vect*(l->num_eig_vect*2+1)), + block_site_size = (l->num_eig_vect*(l->num_eig_vect+1)), + D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions + fine_components = l->num_lattice_site_var; + + + + START_LOCKED_MASTER(threading) + operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); + END_LOCKED_MASTER(threading) + SYNC_HYPERTHREADS(threading) + + // each thread loops overs its aggregates and then over internal d.o.f. + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; + for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; + for ( j=0; jnext_level->op_PRECISION.odd_proj[j+a*block_site_size] = _COMPLEX_PRECISION_ZERO; + } + + complex_PRECISION *mpi_buffer = NULL; + START_MASTER(threading) + MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); + END_MASTER(threading) + + int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; + + // set up table for direction flags + int *flags = direction_flags; + if(l->depth == 0) { + // even sites + for(int t=0; t < l->block_lattice[T]; t++) { + for(int z=0; z < l->block_lattice[Z]; z++) { + for(int y=0; y < l->block_lattice[Y]; y++) { + for(int x=0; x < l->block_lattice[X]; x++) { + if((x+y+z+t)%2 == 0) { + flags[2*X+0] = (x == 0)?0:1; + flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; + flags[2*Y+0] = (y == 0)?0:1; + flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; + flags[2*Z+0] = (z == 0)?0:1; + flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; + flags[2*T+0] = (t == 0)?0:1; + flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; + flags += 8; + } + } + } + } + } + // odd sites + for(int t=0; t < l->block_lattice[T]; t++) { + for(int z=0; z < l->block_lattice[Z]; z++) { + for(int y=0; y < l->block_lattice[Y]; y++) { + for(int x=0; x < l->block_lattice[X]; x++) { + if((x+y+z+t)%2 == 1) { + flags[2*X+0] = (x == 0)?0:1; + flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; + flags[2*Y+0] = (y == 0)?0:1; + flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; + flags[2*Z+0] = (z == 0)?0:1; + flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; + flags[2*T+0] = 
(t == 0)?0:1; + flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; + flags += 8; + } + } + } + } + } + } else { + for(int t=0; t < l->block_lattice[T]; t++) { + for(int z=0; z < l->block_lattice[Z]; z++) { + for(int y=0; y < l->block_lattice[Y]; y++) { + for(int x=0; x < l->block_lattice[X]; x++) { + flags[2*X+0] = (x == 0)?0:1; + flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; + flags[2*Y+0] = (y == 0)?0:1; + flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; + flags[2*Z+0] = (z == 0)?0:1; + flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; + flags[2*T+0] = (t == 0)?0:1; + flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; + flags += 8; + } + } + } + } + } + + complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); + complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); + complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); + + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + + // new aggregate is starting, zero out tmp + for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) + tmp[i] = 0.0; + + for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { + if(l->depth == 0) { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site, + direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); + } else { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site, + direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); + } + set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); + } + + // aggregate is done, finalize + set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); + + } + + + SYNC_HYPERTHREADS(threading) + START_LOCKED_MASTER(threading) + // neighbors + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { + for ( mu=0; mu<4; mu++ ) { + // determine start of buffer for this mu + int start = 0; + for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; + + // update ghost cells of V[i] + negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, + SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); + } + for ( mu=0; mu<4; mu++ ) { + // finish updating ghostcells of V[i] + negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); + } + } + END_LOCKED_MASTER(threading) + SYNC_HYPERTHREADS(threading) + + + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + + // new aggregate is starting, zero out tmp + for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) + tmp[i] = 0.0; + + for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { + for ( mu=0; mu<4; mu++ ) { + if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) + continue; + + if(l->depth == 0) + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + 
d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); + else + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); + set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); + } + } + + // aggregate is done, finalize + for ( mu=0; mu<4; mu++ ) + set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); + } + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) + + for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { + + // new aggregate is starting, zero out tmp + for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) + tmp[i] = 0.0; + + for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { + if(l->depth == 0) { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + diagonal_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site ); + } else { + for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) + coarse_aggregate_block_diagonal_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, + operator+c*l->vector_size, &(l->s_PRECISION), l, site ); + } + set_coarse_block_diagonal_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); + } + + // aggregate is done, finalize + set_coarse_block_diagonal_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); + } + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) + + coarse_operator_PRECISION_setup_finalize( l, threading ); + + START_MASTER(threading) + FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); + + t1 = MPI_Wtime(); + if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); + END_MASTER(threading) + + SYNC_HYPERTHREADS(threading) + SYNC_CORES(threading) +} +#endif + + +void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, + complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { + + int k, m, k1, k2, num_eig_vect = l->next_level->num_lattice_site_var/2, + offset = l->num_lattice_site_var/2; + PRECISION *spin_0_1_pt; + PRECISION *spin_2_3_pt; + PRECISION *interpolation_data; + + int component_offset = SIMD_LENGTH_PRECISION; + int fine_components = l->num_lattice_site_var; + + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_PRECISION*site); + + // A + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im = mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + 
(2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // D + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_2_3 is the same for all k => broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + + // index k used for vectorization + for ( k=0; kvector_size + fine_components*component_offset*site); + + // B + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + } +} + + +void set_coarse_block_diagonal_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, + complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { + + int k, m, k1, k2, num_eig_vect = l->next_level->num_parent_eig_vect, + offset = l->num_parent_eig_vect; + PRECISION *spin_0_1_pt; + PRECISION *spin_2_3_pt; + PRECISION *interpolation_data; + + int component_offset = SIMD_LENGTH_PRECISION; + int fine_components = l->num_lattice_site_var; + + // U(x) = [ A 0 , A=A*, D=D* + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_PRECISION*site); + + // A + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im 
= mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // D + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_2_3 is the same for all k => broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + } +} + +void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { + + int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, + num_eig_vect = l->next_level->num_lattice_site_var/2, + aggregate_size = l->inner_vector_size / num_aggregates, + clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; + int t1, t2; + + config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; + + // just an abbreviation + int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; + int fine_components = l->num_lattice_site_var; + + int aggregate = (fine_components*site)/aggregate_size; + clover_pt = clover + aggregate*clover_site_size; + + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; nnext_level->num_lattice_site_var/2, + offset = l->num_lattice_site_var/2; + + PRECISION *spin_0_1_pt; + PRECISION *spin_2_3_pt; + PRECISION *interpolation_data; + + int component_offset = SIMD_LENGTH_PRECISION; + int fine_components = l->num_lattice_site_var; + + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D, each column wise + for ( int n=0; nvector_size + fine_components*component_offset*site); + + k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + + // A + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im = mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = 
mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // C + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_0_1 is the same for all k => broadcast + mm_PRECISION spin_0_1_re = mm_set1_PRECISION(spin_0_1_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_0_1_im = mm_set1_PRECISION(spin_0_1_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + + k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_PRECISION; + + // B + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=0; m broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + + // D + buffer_re = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION); + buffer_im = mm_load_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION); + for ( m=offset; m<2*offset; m++ ) { + // spin_2_3 is the same for all k => broadcast + mm_PRECISION spin_2_3_re = mm_set1_PRECISION(spin_2_3_pt[(2*m+0)*component_offset]); + mm_PRECISION spin_2_3_im = mm_set1_PRECISION(spin_2_3_pt[(2*m+1)*component_offset]); + mm_PRECISION interpolation_data_re = mm_load_PRECISION(interpolation_data + (2*m+0)*component_offset); + mm_PRECISION interpolation_data_im = mm_load_PRECISION(interpolation_data + (2*m+1)*component_offset); + + cfmadd_conj_PRECISION(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); + } + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_re); + mm_store_PRECISION((PRECISION *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_PRECISION, buffer_im); + } + } +} + + +void 
set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, + const int n_rhs, complex_PRECISION *tmp ) { + + int k, k1, k2, num_eig_vect = l->next_level->num_lattice_site_var/2, + D_link_size = num_eig_vect*num_eig_vect*4; + int t1, t2; + + config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; + + // just an abbreviation + int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; + int fine_components = l->num_lattice_site_var; + + int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); + D_pt = D + (4*aggregate+mu)*D_link_size; + + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D, each column wise + for ( int n=0; nis_PRECISION.num_agg, + num_eig_vect = l->next_level->num_parent_eig_vect, + aggregate_size = l->inner_vector_size / num_aggregates, + block_site_size = (l->next_level->num_parent_eig_vect*(l->next_level->num_parent_eig_vect+1)); + int t1, t2; + + config_PRECISION block_pt, block = l->next_level->op_PRECISION.odd_proj; + + // just an abbreviation + int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; + int fine_components = l->num_lattice_site_var; + + int aggregate = (fine_components*site)/aggregate_size; + block_pt = block + aggregate*block_site_size; + + // U(x) = [ A 0 , A=A*, D=D* + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling + for ( int n=0; n i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); + out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); + // C = -B^dagger + out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); + out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); + } + // zero + for(int j=2*vecs; j i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); + out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); + } + // zero + for(int j=2*vecs; j i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // A + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_column+jp]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] = sign*cimag(clover[offset_to_column+jp]); + // B + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 1*vecs] = creal(clover[offset_to_B + i*vecs+j]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 1*vecs] = cimag(clover[offset_to_B + i*vecs+j]); + // C = -B^dagger + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 3*vecs] = -creal(clover[offset_to_B + j*vecs+i]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 3*vecs] = cimag(clover[offset_to_B + j*vecs+i]); + // D + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] = 
creal(clover[offset_to_D + offset_to_column+jp]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); + // 0 + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 1*vecs] = + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 0*vecs] = + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] = + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] = 0.0; + } + // zero + for(int j=4*vecs; j i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // E + out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); + out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); + // F + out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_F + offset_to_column+jp]); + out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_F + offset_to_column+jp]); + } + } + tm_term += 2*offset_to_F; + // out_tmp is an alias for the actual output + out_tmp += 2*column_offset*2*vecs; + } +#endif +} + +void add_tm_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION tm_term, OPERATOR_TYPE_PRECISION *clover_vectorized, + int num_aggregates, int num_eig_vect) { +#ifdef HAVE_TM + int vecs = num_eig_vect; + // in vectorized layout clover is stored column wise, but not split into ABCD + // each column is padded, such that next column can also start at 64B boundary + int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + // offset between blocks in clover + int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal + + PRECISION *out_tmp = clover_vectorized; + + // we add/sub the tm term to cloverD_vectorized + // A0B0 E000 0000 + // 0A0B + 0000 - 0E00 + // C0D0 00F0 0000 + // 0C0D 0000 000F + // 0000 0000 0000 + // (column wise, size of zeros such that columns length is multiple of 64B) + + // 4 directions + for ( int a=0; a i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // E + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(tm_term[offset_to_column+jp]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 0*vecs] += cimag(tm_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 1*vecs] -= sign*creal(tm_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 1*vecs] -= cimag(tm_term[offset_to_column+jp]); + // F + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(tm_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 2*vecs] += cimag(tm_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 3*vecs] -= 
sign*creal(tm_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 3*vecs] -= cimag(tm_term[offset_to_F+offset_to_column+jp]); + } + } + tm_term += 2*offset_to_F; + // out_tmp is an alias for the actual output + out_tmp += 2*4*vecs*column_offset; + } +#endif +} + +void add_epsbar_term_to_doublet_vectorized_layout_PRECISION(config_PRECISION eps_term, OPERATOR_TYPE_PRECISION *clover_vectorized, + int num_aggregates, int num_eig_vect) { +#ifdef HAVE_TM1p1 + int vecs = num_eig_vect; + // in vectorized layout clover is stored column wise, but not split into ABCD + // each column is padded, such that next column can also start at 64B boundary + int column_offset = SIMD_LENGTH_PRECISION*((4*vecs+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + // offset between blocks in clover + int offset_to_F = (vecs*vecs+vecs)/2; // upper triangle of A including diagonal + + PRECISION *out_tmp = clover_vectorized; + + // we add the eps term to cloverD_vectorized + // A0B0 0E00 + // 0A0B + E000 + // C0D0 000F + // 0C0D 00F0 + // 0000 0000 + // (column wise, size of zeros such that columns length is multiple of 64B) + + // 4 directions + for ( int a=0; a i) { + ip = j; + jp = i; + sign = -1.0; + } + int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal + // E + out_tmp[(2*(i+0*vecs)+0)*column_offset + j + 1*vecs] += sign*creal(eps_term[offset_to_column+jp]); + out_tmp[(2*(i+0*vecs)+1)*column_offset + j + 1*vecs] += cimag(eps_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+0)*column_offset + j + 0*vecs] += sign*creal(eps_term[offset_to_column+jp]); + out_tmp[(2*(i+1*vecs)+1)*column_offset + j + 0*vecs] += cimag(eps_term[offset_to_column+jp]); + // F + out_tmp[(2*(i+2*vecs)+0)*column_offset + j + 3*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+2*vecs)+1)*column_offset + j + 3*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+0)*column_offset + j + 2*vecs] += sign*creal(eps_term[offset_to_F+offset_to_column+jp]); + out_tmp[(2*(i+3*vecs)+1)*column_offset + j + 2*vecs] += cimag(eps_term[offset_to_F+offset_to_column+jp]); + } + } + eps_term += 2*offset_to_F; + // out_tmp is an alias for the actual output + out_tmp += 2*4*vecs*column_offset; + } +#endif +} + +void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, + level_struct *l, int site, int *direction_flags ) { + + int offset = SIMD_LENGTH_PRECISION; + int site_offset = l->num_lattice_site_var*offset; + int index_bw; + int index_fw; + int *neighbor = s->op.neighbor_table; + int *backward_neighbor = s->op.backward_neighbor_table; + complex_PRECISION *phi_pt; + config_PRECISION D_pt; + config_PRECISION D = s->op.D; + int n = l->num_lattice_site_var; + int D_site_offset = 4*n*n; + int D_link_offset = n*n; + int clover_offset = (n*(n+1))/2*site; + + coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l ); + + for(int mu=0; mu<4; mu++) { + index_fw = neighbor[5*site+1 + mu]; + index_bw = backward_neighbor[5*site+1 + mu]; + + // from backward + if ( direction_flags[2*mu+0] == 1 ) { + D_pt = D + D_site_offset*index_bw + D_link_offset*mu; + phi_pt = phi + site_offset*index_bw; + coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); + } + + // from forward + if ( direction_flags[2*mu+1] == 1 ) { + D_pt = D + D_site_offset*site 
+ D_link_offset*mu; + phi_pt = phi + site_offset*index_fw; + coarse_spinwise_pn_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l, -1 ); + } + } +} + +void coarse_aggregate_block_diagonal_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, + level_struct *l, int site ) { + + int offset = SIMD_LENGTH_PRECISION; + int site_offset = l->num_lattice_site_var*offset; + int n = l->num_parent_eig_vect; + int block_offset = (n*(n+1))*site; + config_PRECISION block = s->op.odd_proj+block_offset; + int num_eig_vect = l->num_parent_eig_vect; + int block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + complex_PRECISION *eta[2] = {eta1, eta2}; + phi += site_offset*site; + + // U(x) = [ A 0 , A=A*, D=D* + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + + mm_PRECISION block_re; + mm_PRECISION block_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + + // zero output matrices + mm_PRECISION zero = mm_setzero_PRECISION(); + for(int s=0; s<2; s++) { + for(int i=0; ieta1) or 2and3 (->eta2) + eta[1] += num_eig_vect*offset; + for(int s=0; s<2; s++) { + // A and D: column major hermitian, stored as upper triangular + for(int i=0; inum_lattice_site_var*offset; + int index_fw; + int *neighbor = s->op.neighbor_table; + complex_PRECISION *phi_pt; + config_PRECISION D_pt; + config_PRECISION D = s->op.D; + int n = l->num_lattice_site_var; + int D_site_offset = 4*n*n; + int D_link_offset = n*n; + + vector_PRECISION_define_zero( eta1, 0, n*offset, l, no_threading ); + vector_PRECISION_define_zero( eta2, 0, n*offset, l, no_threading ); + + // requires the positive boundaries of phi to be communicated before + index_fw = neighbor[5*site+1 + mu]; + D_pt = D + D_site_offset*site + D_link_offset*mu; + phi_pt = phi + site_offset*index_fw; + coarse_spinwise_pn_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l, +1 ); +} + + +void coarse_spinwise_site_self_couplings_PRECISION_vectorized( + complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) { + + int num_eig_vect = l->num_lattice_site_var/2; + int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; + complex_PRECISION *eta[2] = {eta1, eta2}; + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + + mm_PRECISION clover_re; + mm_PRECISION clover_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + + // zero output matrices + mm_PRECISION zero = mm_setzero_PRECISION(); + for(int s=0; s<2; s++) { + for(int i=0; ieta1) or 2and3 (->eta2) + eta[1] += num_eig_vect*elements; + for(int s=0; s<2; s++) { + // A and D: column major hermitian, stored as upper triangular + for(int i=0; inum_lattice_site_var; +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + int num_eig_vect = l->num_parent_eig_vect, + clover_size = (2*num_eig_vect*num_eig_vect+num_eig_vect), + block_size = (num_eig_vect*num_eig_vect+num_eig_vect); + + coarse_self_couplings_clover_PRECISION( eta+start*vector_size, phi+start*vector_size, + op->clover+start*clover_size, (end-start)*vector_size, l ); +#ifdef HAVE_TM // tm_term + if (op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) + coarse_add_anti_block_diagonal_PRECISION( eta+start*vector_size, phi+start*vector_size, + op->tm_term+start*block_size, (end-start)*vector_size, 
l ); +#endif +#ifdef HAVE_TM1p1 //eps_term + if ( g.n_flavours == 2 && + ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) + coarse_add_doublet_coupling_PRECISION( eta+start*vector_size, phi+start*vector_size, + op->epsbar_term+start*block_size, (end-start)*vector_size, l ); +#endif + +#else + + int lda = SIMD_LENGTH_PRECISION*((vector_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); +#ifdef HAVE_TM1p1 + OPERATOR_TYPE_PRECISION *clover = (g.n_flavours == 2) ? op->clover_doublet_vectorized:op->clover_vectorized; +#else + OPERATOR_TYPE_PRECISION *clover = op->clover_vectorized; +#endif + for(int i=start; inum_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + // note: minus sign of D = self_coupling - hopping_term is added here + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + // A + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//1 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // C + eta += num_eig_vect;//2 + phi -= num_eig_vect;//0 + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//1 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // B + eta -= 3*num_eig_vect;//0 + phi += num_eig_vect;//2 + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//3 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // D + eta += num_eig_vect;//2 + phi -= num_eig_vect;//2 + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//3 + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + } else { +#endif + // A + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // C + eta += num_eig_vect; + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // B + phi += num_eig_vect; + eta -= num_eig_vect; + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // D + eta += num_eig_vect; + D += num_eig_vect2; + pnmv_PRECISION( eta, D, phi, num_eig_vect, -sign ); +#ifdef HAVE_TM1p1 + } +#endif + } + + static inline void coarse_pn_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION D, const int sign, + level_struct *l ) { + + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + // note: minus sign of D = self_coupling - hopping_term is added here + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + // A* + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//1 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // -C* + eta -= num_eig_vect;//0 + phi += num_eig_vect;//2 + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + eta += num_eig_vect;//1 + phi += num_eig_vect;//3 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // -B* + eta += num_eig_vect;//2 + phi -= 3*num_eig_vect;//0 + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//1 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // D* + eta -= num_eig_vect;//2 + phi += num_eig_vect;//2 + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, 
num_eig_vect, -sign ); + eta += num_eig_vect;//3 + phi += num_eig_vect;//3 + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + } else { +#endif + // A* + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); + // -C* + phi += num_eig_vect; + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // -B* + eta += num_eig_vect; + phi -= num_eig_vect; + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, sign ); + // D* + phi += num_eig_vect; + D += num_eig_vect2; + pnmvh_PRECISION( eta, D, phi, num_eig_vect, -sign ); +#ifdef HAVE_TM1p1 + } +#endif + } + + static inline void coarse_pn_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, const int sign, level_struct *l ) { +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int nv = l->num_parent_eig_vect; + int lda = 2*SIMD_LENGTH_PRECISION*((nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + switch (sign) { + case -1: + cgemv_padded_PRECISION( 2*nv, D, lda, nv, (float *)phi, (float *)eta); + break; + case +1: + default: + cgenmv_padded_PRECISION( 2*nv, D, lda, nv, (float *)phi, (float *)eta); + break; + } +#endif + } + + static inline void coarse_pn_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, const int amount, const int sign, level_struct *l, struct Thread *threading ) { + + START_NO_HYPERTHREADS(threading) + + int mu, i, num_site_var=l->num_lattice_site_var, + num_eig_vect = l->num_parent_eig_vect, + num_lattice_sites, start, end, core_start, core_end, + plus_dir_param, minus_dir_param; + vector_PRECISION in_pt, out_pt; + +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int num_link_var = SQUARE(2*num_eig_vect), + num_4link_var = 4*num_link_var; + config_PRECISION D_pt; + // dagger applied by functions daggered_hopp below + config_PRECISION D = op->D, D_dagger = op->D; +#else + int column_offset = 2*SIMD_LENGTH_PRECISION*((num_eig_vect+SIMD_LENGTH_PRECISION-1)/ + SIMD_LENGTH_PRECISION), + num_link_var = 2*2*num_eig_vect*column_offset, + num_4link_var = 4*num_link_var; + OPERATOR_TYPE_PRECISION *D_pt; + // dagger applied in D_dagger + OPERATOR_TYPE_PRECISION *D = op->D_vectorized, *D_dagger = op->D_transformed_vectorized; +#endif + +#ifndef COMM_HIDING_COARSEOP + int communicate = ( l->num_processes > 1 && op->c.comm ) ? 1:0; + int *neighbor_fw = op->neighbor_table; + int *neighbor_bw = op->backward_neighbor_table; +#else + int communicate = ( op->c.comm ) ? 
1:0; + int *neighbor_fw = op->neighbor_table; +#endif + + switch (amount) { + case _EVEN_SITES: + minus_dir_param = _ODD_SITES; + plus_dir_param = _EVEN_SITES; + break; + case _ODD_SITES: + minus_dir_param = _EVEN_SITES; + plus_dir_param = _ODD_SITES; + break; + case _FULL_SYSTEM: + default: + minus_dir_param = _FULL_SYSTEM; + plus_dir_param = _FULL_SYSTEM; + break; + } + + // assumptions (1) self coupling has already been performed + // OR (2) "out" is initialized with zeros + set_boundary_PRECISION( out, 0, l, threading ); + + // communicate in -mu direction + MASTER(threading) + if ( communicate ) + for ( mu=0; mu<4; mu++ ) + ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + SYNC_CORES(threading); + + switch (amount) { + case _EVEN_SITES: + start = op->num_even_sites; + num_lattice_sites = op->num_odd_sites; + break; + case _ODD_SITES: + start = 0; + num_lattice_sites = op->num_even_sites; + break; + case _FULL_SYSTEM: + default: + start=0; + num_lattice_sites=l->num_inner_lattice_sites; + break; + } + end = start + num_lattice_sites; + compute_core_start_end_custom( start, end, &core_start, &core_end, l, threading, 1 ); + +#ifndef COMM_HIDING_COARSEOP + // prepare for sending to fw: compute hopping terms into forward boundary buffer + if ( communicate ) + for ( i=core_start; inum_inner_lattice_sites) //num_lattice_sites? + continue; + out_pt = out + num_site_var*neighbor_fw[5*i+1+mu]; +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + coarse_pn_daggered_hopp_PRECISION( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#else + coarse_pn_hopp_PRECISION_vectorized( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#endif + } + } +#else + // compute U_mu^dagger coupling + for ( mu=0; mu<4; mu++ ) { + for ( i=core_start; ic), plus_dir_param, l ); + } + for ( mu=0; mu<4; mu++ ) { + // wait for -mu direction + ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); + } + END_LOCKED_MASTER(threading); + } + else + SYNC_CORES(threading); + + switch (amount) { + case _EVEN_SITES: + start = 0; + num_lattice_sites = op->num_even_sites; + break; + case _ODD_SITES: + start = op->num_even_sites; + num_lattice_sites = op->num_odd_sites; + break; + case _FULL_SYSTEM: + default: + start=0; + num_lattice_sites=l->num_inner_lattice_sites; + break; + } + end = start + num_lattice_sites; + compute_core_start_end_custom( start, end, &core_start, &core_end, l, threading, 1 ); + +#ifndef COMM_HIDING_COARSEOP + for ( i=core_start; i= l->num_inner_lattice_sites) + continue; + in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; + D_pt = D_dagger + num_4link_var*neighbor_bw[5*i+1+mu] + mu*num_link_var; +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + coarse_pn_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, sign, l ); +#else + coarse_pn_hopp_PRECISION_vectorized( out_pt, in_pt, D_pt, sign, l ); +#endif + } + + // compute U_mu couplings + D_pt = D + num_4link_var*neighbor_fw[5*i]; + for( mu=0; mu<4; mu++ ) { + in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + coarse_pn_hopp_PRECISION( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#else + coarse_pn_hopp_PRECISION_vectorized( out_pt, in_pt, D_pt+mu*num_link_var, sign, l ); +#endif + } + } +#else + // compute U_mu couplings + for ( i=core_start; ic), plus_dir_param, l ); + } + END_LOCKED_MASTER(threading); + } + else + SYNC_CORES(threading); + + END_NO_HYPERTHREADS(threading); + } + +static inline void coarse_spinwise_pn_hopp_PRECISION_vectorized( complex_PRECISION *eta1, 
complex_PRECISION *eta2, complex_PRECISION *phi, config_PRECISION D, int elements, level_struct *l, const int sign ) { + + int num_eig_vect = l->num_lattice_site_var/2; + int num_eig_vect2 = num_eig_vect*num_eig_vect; + complex_PRECISION *eta[2] = {eta1, eta2}; + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + + mm_PRECISION D_re; + mm_PRECISION D_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) + for(int s=0; s<2; s++) { + // t is the row of the input matrix (in 2x2 block form) + for(int t=0; t<2; t++) { + for(int i=0; inum_lattice_site_var/2; + int num_eig_vect2 = num_eig_vect*num_eig_vect; + complex_PRECISION *eta[2] = {eta1, eta2}; + // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* + // C D ] -B* D* ] + // storage order: A, C, B, D + // note: minus sign of D = self_coupling - hopping_term is added here + + mm_PRECISION D_re; + mm_PRECISION D_im; + mm_PRECISION in_re; + mm_PRECISION in_im; + mm_PRECISION out_re; + mm_PRECISION out_im; + // A* + for(int i=0; inum_lattice_site_var/2, n2 = l->num_lattice_site_var; - - // set the matrix up + // + // output = [ A+E B + // C D+F ] LU decomposed + + register int i, j, k, n = l->num_parent_eig_vect, n2 = 2*n; + config_PRECISION clover = op->clover + n*(n2+1)*index; // A for ( j=0; jtm_term + n*(n+1)*index; + if (op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) { + // E for ( j=0; jnum_parent_eig_vect, n2 = 2*n, n3 = 3*n, n4 = 4*n; + // set the matrix up + // 0 + for ( j=0; jclover + n*(n2+1)*index; + // A + for ( j=0; jtm_term + n*(n+1)*index; + if (op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) { + // E + for ( j=0; jepsbar_term + n*(n+1)*index; + // G + for ( j=n; jnum_even_sites, &start, &end, l, threading, 1 ); + // even sites + coarse_self_couplings_PRECISION( y, x, op, start, end, l ); } - -void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { - int n1 = op->num_even_sites; - int start; - int end; - compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); - // even sites -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int offset = l->num_lattice_site_var; - coarse_self_couplings_PRECISION( y+start*offset, x+start*offset, op->clover+start*(offset*offset+offset)/2, (end-start)*offset, l ); -#ifdef HAVE_TM // tm_term - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( y+start*offset, x+start*offset, op->tm_term+start*(offset*offset/2+offset)/2, (end-start)*offset, l ); -#endif + int start, end; +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + int num_site_var=l->num_lattice_site_var, + oo_inv_size = SQUARE(num_site_var); +#ifdef HAVE_TM1p1 + config_PRECISION sc = (g.n_flavours==2) ? 
op->clover_doublet_oo_inv:op->clover_oo_inv; #else - coarse_self_couplings_PRECISION_vectorized( y, x, op->clover_vectorized, start, end, l ); + config_PRECISION sc = op->clover_oo_inv; #endif -} + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); + + x += num_site_var*(op->num_even_sites+start); + y += num_site_var*(op->num_even_sites+start); + sc += oo_inv_size*start; -void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - - int n1 = op->num_even_sites, n2 = op->num_odd_sites, - offset = l->num_lattice_site_var, ess = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - config_PRECISION sc = op->clover; - int start; - int end; - compute_core_start_end_custom(n1, n1+n2, &start, &end, l, threading, 1); - - x += start*offset; - y += start*offset; - sc += start*ess; - - // odd sites -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int oss = l->num_lattice_site_var*l->num_lattice_site_var; for ( int i=start; i use standard non-vectorized multiplication - if ( l->level == 0 ) { - coarse_self_couplings_PRECISION( y, x, sc, (end-start)*offset, l ); -#ifdef HAVE_TM - int tms = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - config_PRECISION tm = op->tm_term + start*tms; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( y, x, tm, (end-start)*offset, l ); -#endif - } else - coarse_self_couplings_PRECISION_vectorized( y-start*offset, x-start*offset, op->clover_vectorized, start, end, l ); + compute_core_start_end_custom( op->num_even_sites, l->num_inner_lattice_sites, &start, &end, l, threading, 1 ); + coarse_self_couplings_PRECISION( y, x, op, start, end, l ); #endif } - -void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { +void coarse_diag_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l ) { - int n1 = op->num_even_sites, n2 = op->num_odd_sites, start, end; + coarse_diag_ee_PRECISION( y, x, op, l, no_threading ); + coarse_diag_oo_PRECISION( y, x, op, l, no_threading ); +} - compute_core_start_end_custom(n1, n1+n2, &start, &end, l, threading, 1); +void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { - // odd sites -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int offset = l->num_lattice_site_var, ess = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1), - oss = l->num_lattice_site_var*l->num_lattice_site_var; - config_PRECISION sc = op->clover; - x += start*offset; - y += start*offset; - sc += n1*ess + (start-n1)*oss; + int start, end; + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1 ); + // odd sites + int num_site_var = l->num_lattice_site_var, + oo_inv_size = SQUARE(num_site_var); + +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION +#ifdef HAVE_TM1p1 + config_PRECISION sc = (g.n_flavours==2) ? op->clover_doublet_oo_inv:op->clover_oo_inv; +#else + config_PRECISION sc = op->clover_oo_inv; +#endif +#else + int lda = SIMD_LENGTH_PRECISION*((num_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + oo_inv_size = 2*num_site_var*lda; +#ifdef HAVE_TM1p1 + OPERATOR_TYPE_PRECISION *sc = (g.n_flavours==2) ? 
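/* Illustration (stand-alone sketch, not a routine from this patch): the pointer
 * arithmetic above encodes the even-first ordering -- odd site i starts at vector
 * offset ns*(num_even+i), and its precomputed dense block (clover_oo_inv) at
 * i*ns*ns.  A sketch of the per-site block application this addressing feeds,
 * assuming row-major blocks; names are placeholders. */
#include <complex.h>
#include <stddef.h>

static void diag_oo_inv_sketch( float _Complex *y, const float _Complex *x,
                                const float _Complex *oo_inv,
                                int num_even, int num_odd, int ns ) {
  for ( int i=0; i<num_odd; i++ ) {
    float _Complex *y_pt = y + (size_t)ns*(num_even+i);        /* odd site i   */
    const float _Complex *x_pt = x + (size_t)ns*(num_even+i);
    const float _Complex *blk  = oo_inv + (size_t)i*ns*ns;     /* its block    */
    for ( int r=0; r<ns; r++ ) {
      y_pt[r] = 0;
      for ( int c=0; c<ns; c++ )
        y_pt[r] += blk[r*ns+c] * x_pt[c];                      /* y = A^{-1} x */
    }
  }
}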
op->clover_doublet_oo_inv_vectorized:op->clover_oo_inv_vectorized; +#else + OPERATOR_TYPE_PRECISION *sc = op->clover_oo_inv_vectorized; +#endif +#endif + + x += num_site_var*(op->num_even_sites+start); + y += num_site_var*(op->num_even_sites+start); + sc += oo_inv_size*start; + for ( int i=start; iclover_vectorized, start, end, l ); + for(int j=0; jnum_inner_lattice_sites, sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1), - nc_size = SQUARE(l->num_lattice_site_var), - t, z, y, x; +void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread *threading ) { + operator_PRECISION_struct *op = &(l->oe_op_PRECISION); - config_PRECISION sc_in = in->clover, nc_in = in->D, Aee = NULL, Aoo = NULL; - int *le = l->local_lattice; - int oe_offset = op->oe_offset; - -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION - int lu_dec_size = SQUARE(l->num_lattice_site_var); -#endif - - Aee = op->clover; - Aoo = op->clover + op->num_even_sites*sc_size; -#ifdef HAVE_TM - int jt=0, kt=0, tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - config_PRECISION tm_in = in->tm_term, TMee = NULL, TMoo = NULL; - TMee = op->tm_term; - TMoo = op->tm_term + op->num_even_sites*tm_size; -#endif - - START_LOCKED_MASTER(threading) - // self coupling - if ( reorder ) { - int k=0, index, *it = in->index_table, *dt = in->table_dim; - j=0; - for ( t=0; tnum_even_sites*sc_size; - for ( i=0; inum_even_sites*tm_size; -#endif - j = op->num_odd_sites; - for ( i=0; inum_parent_eig_vect, start, end; + + coarse_operator_PRECISION_set_self_couplings( op, l, threading ); + compute_core_start_end_custom( 0, op->num_odd_sites, &start, &end, l, threading, 1); + +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + + int size = SQUARE(2*nv); + for( int i=start; iclover_oo_inv+i*size, op, op->num_even_sites+i, l ); + +#ifdef HAVE_TM1p1 + int size_doublet = SQUARE(4*nv); + for( int i=start; iclover_doublet_oo_inv+i*size_doublet, op, + op->num_even_sites+i, l ); #endif - sc_in += sc_size; Aoo += lu_dec_size; - } + #else - for ( i=op->num_even_sites*sc_size; iindex_table, *dt = in->table_dim, site_size=4*nc_size; - config_PRECISION oAe=op->D, eAo=(op->D)+site_size*op->num_even_sites; - j=0; - for ( t=0; tD[i] = nc_in[i]; - } - END_LOCKED_MASTER(threading) -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int start; - int end; - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*l->num_lattice_site_var*column_offset; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int size_v = 2*2*nv*column_offset; + 
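/* Illustration (stand-alone arithmetic check, not from this patch): the vectorized
 * branch above pads the site-block dimension 2*nv up to a multiple of the SIMD width
 * before sizing clover_oo_inv_vectorized; the leading factor 2 accounts for the
 * complex entries.  The concrete numbers (SIMD_LEN = 8, nv = 10) are arbitrary
 * examples. */
#include <stdio.h>

int main( void ) {
  const int SIMD_LEN = 8;                              /* assumed SIMD width        */
  int nv   = 10;                                       /* example eigenvector count */
  int rows = 2*nv;                                     /* site-block dimension      */
  int column_offset = SIMD_LEN*((rows + SIMD_LEN - 1)/SIMD_LEN);  /* padded: 24     */
  int size_v = 2*rows*column_offset;                   /* per-site footprint: 960   */
  printf( "rows=%d padded=%d per-site=%d\n", rows, column_offset, size_v );
  return 0;
}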
for( int i=start; iclover_oo_inv_vectorized + i*size_v, + op->clover_vectorized + (op->num_even_sites+i)*size_v, column_offset ); + +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int size_doublet_v = 2*4*nv*column_doublet_offset; + for( int i=start; iclover_doublet_oo_inv_vectorized + i*size_doublet_v, + op->clover_doublet_vectorized + (op->num_even_sites+i)*size_doublet_v, column_doublet_offset ); #endif - SYNC_CORES(threading) - - compute_core_start_end_custom(op->num_even_sites, n, &start, &end, l, threading, 1); - OPERATOR_TYPE_PRECISION tmp[offset_v] __attribute__((aligned(64))); - for(int a=start; aclover_vectorized + a*offset_v)[i]; - cgem_inverse(l->num_lattice_site_var, op->clover_vectorized + a*offset_v, tmp, column_offset); - } - SYNC_CORES(threading) #endif } -void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l ) { +void coarse_oddeven_PRECISION_set_couplings( level_struct *l, struct Thread *threading ) { + + coarse_oddeven_PRECISION_set_self_couplings( l, threading ); + coarse_operator_PRECISION_set_neighbor_couplings( &(l->oe_op_PRECISION), l, threading ); + +} + +void coarse_oddeven_alloc_PRECISION( level_struct *l ) { - int n=l->num_inner_lattice_sites, oe_offset=0, mu, nu, - lu_dec_size = SQUARE(l->num_lattice_site_var), - nc_size = SQUARE(l->num_lattice_site_var), bs, **bt = NULL, - *eot = NULL, *nt = NULL, *tt = NULL, t, z, y, x, le[4], N[4]; + int nv = l->num_parent_eig_vect, + oe_offset=0, mu, **bt = NULL, + *eot = NULL, *nt = NULL, *tt = NULL, t, z, y, x, le[4], N[4]; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); + operator_PRECISION_alloc( op, _ODDEVEN, l ); + + // buffers + MALLOC( op->buffer, complex_PRECISION*, 2 ); + op->buffer[0] = NULL; +#ifdef HAVE_TM1p1 + MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); + op->buffer[1] = op->buffer[0] + 2*l->vector_size; +#else + MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); + op->buffer[1] = op->buffer[0] + l->vector_size; +#endif + for ( mu=0; mu<4; mu++ ) { le[mu] = l->local_lattice[mu]; N[mu] = le[mu]+1; @@ -434,801 +476,160 @@ void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, op->num_even_sites++; } } + +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - MALLOC( op->D, complex_PRECISION, 4*nc_size*n ); - MALLOC( op->clover, complex_PRECISION, lu_dec_size*n ); -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - MALLOC( l->oe_op_PRECISION.tm_term, complex_PRECISION, tm_size*n ); + MALLOC( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); +#ifdef HAVE_TM1p1 + MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); +#endif + +#else + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites, 4*SIMD_LENGTH_PRECISION ); #endif -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - 
MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n, 64 ); - MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->num_lattice_site_var*column_offset*n, 64 ); + #endif - coarse_oddeven_setup_PRECISION_set_couplings( in, reorder, l, no_threading ); - // define data layout - MALLOC( op->index_table, int, N[T]*N[Z]*N[Y]*N[X] ); eot = op->index_table; - define_eot( eot, N, l ); - + // neighbor table, translation table - MALLOC( op->neighbor_table, int, 5*N[T]*N[Z]*N[Y]*N[X] ); - MALLOC( op->backward_neighbor_table, int, 5*N[T]*N[Z]*N[Y]*N[X] ); - MALLOC( op->translation_table, int, le[T]*le[Z]*le[Y]*le[X] ); nt = op->neighbor_table; tt = op->translation_table; - define_nt_bt_tt( nt, op->backward_neighbor_table, NULL, tt, eot, N, l ); - + // boundary table - for ( mu=0; mu<4; mu++ ) { - bs = 1; - le[mu] = 1; - for ( nu=0; nu<4; nu++ ) - bs *= le[nu]; - - MALLOC( op->c.boundary_table[2*mu], int, bs ); - op->c.boundary_table[2*mu+1] = op->c.boundary_table[2*mu]; - - le[mu] = l->local_lattice[mu]; - } - bt = op->c.boundary_table; define_eo_bt( bt, eot, op->c.num_even_boundary_sites, op->c.num_odd_boundary_sites, op->c.num_boundary_sites, N, l ); - MALLOC( op->buffer, complex_PRECISION*, 2 ); - op->buffer[0] = NULL; - MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); - op->buffer[1] = op->buffer[0] + l->vector_size; - ghost_alloc_PRECISION( 0, &(op->c), l ); + // ghost ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; + + // solver if ( l->level == 0 ) l->p_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; else l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; -} - -void coarse_oddeven_re_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, struct Thread *threading ) { - coarse_oddeven_setup_PRECISION_set_couplings( in, reorder, l, threading ); -} - -void coarse_oddeven_free_PRECISION( level_struct *l ) { - - int mu, nu, nc_size = SQUARE(l->num_lattice_site_var), - *ll = l->local_lattice, n = l->num_inner_lattice_sites, bs; - - ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); - FREE( l->oe_op_PRECISION.D, complex_PRECISION, 4*nc_size*n ); -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n ); - FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n ); - FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->num_lattice_site_var*column_offset*n ); -#endif -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - FREE( l->oe_op_PRECISION.tm_term, complex_PRECISION, tm_size*n ); -#endif - FREE( l->oe_op_PRECISION.clover, complex_PRECISION, nc_size*n ); - FREE( l->oe_op_PRECISION.index_table, int, (ll[T]+1)*(ll[Z]+1)*(ll[Y]+1)*(ll[X]+1) ); - FREE( l->oe_op_PRECISION.neighbor_table, int, 5*(ll[T]+1)*(ll[Z]+1)*(ll[Y]+1)*(ll[X]+1) ); - FREE( l->oe_op_PRECISION.backward_neighbor_table, int, 5*(ll[T]+1)*(ll[Z]+1)*(ll[Y]+1)*(ll[X]+1) ); - FREE( l->oe_op_PRECISION.translation_table, int, ll[T]*ll[Z]*ll[Y]*ll[X] ); - - for ( mu=0; mu<4; mu++ ) { - bs = 1; - for ( nu=0; 
nu<4; nu++ ) - if ( mu != nu ) - bs *= ll[nu]; - - FREE( l->oe_op_PRECISION.c.boundary_table[2*mu], int, bs ); - l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; - } - - FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); - FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); } +void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, + struct Thread *threading ) { -void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { + operator_PRECISION_struct *op = &(l->oe_op_PRECISION); - START_NO_HYPERTHREADS(threading) + START_LOCKED_MASTER(threading) + int ns=l->num_inner_lattice_sites, nv = l->num_parent_eig_vect, i, + D_size = 4*SQUARE(2*nv), + clover_size = (nv)*(nv*2+1), + block_size = (nv)*(nv+1); + config_PRECISION D_in = in->D, + clover_in = in->clover, + odd_proj_in = in->odd_proj; - int mu, i, index, num_site_var=l->num_lattice_site_var, - num_4link_var=4*l->num_lattice_site_var*l->num_lattice_site_var, - num_link_var=l->num_lattice_site_var*l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - config_PRECISION D_pt; + // neighbor couplings + if ( reorder ) { + int t, z, y, x, index, *le = l->local_lattice, oe_offset = op->oe_offset, + *it = in->index_table, *dt = in->table_dim; + config_PRECISION D_oe = op->D, + D_eo = (op->D)+D_size*op->num_even_sites, + clover_ee = op->clover, + clover_oo = (op->clover)+clover_size*op->num_even_sites, + odd_proj_ee = op->odd_proj, + odd_proj_oo = op->odd_proj+block_size*op->num_even_sites; - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, 
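/* Illustration (stand-alone arithmetic check, not from this patch): the per-site
 * footprints declared above distinguish full and packed storage.  D keeps four full
 * (2*nv)x(2*nv) direction blocks; clover_size = nv*(2*nv+1) equals the packed
 * triangle of one Hermitian (2*nv)x(2*nv) matrix; block_size = nv*(nv+1) equals two
 * packed nv x nv triangles (this reading of the packing is an inference). */
#include <assert.h>
#include <stdio.h>

int main( void ) {
  int nv = 24, k = 2*nv;                  /* e.g. 24 test vectors per half-spinor */
  int D_size      = 4*k*k;                /* 4 directions, full blocks            */
  int clover_size = nv*(2*nv+1);          /* packed Hermitian (2nv x 2nv)         */
  int block_size  = nv*(nv+1);            /* two packed nv x nv triangles         */
  assert( clover_size == k*(k+1)/2 );
  assert( block_size  == 2*(nv*(nv+1)/2) );
  printf( "D=%d clover=%d block=%d\n", D_size, clover_size, block_size );
  return 0;
}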
l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); + for ( t=0; tneighbor_table[index+X]; - coarse_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); + } else { + for ( i=0; iD[i] = D_in[i]; + for ( i=0; iclover[i] = clover_in[i]; + for ( i=0; iodd_proj[i] = odd_proj_in[i]; } + } END_LOCKED_MASTER(threading) + + op->m0 = in->m0; - END_NO_HYPERTHREADS(threading) -} - - -void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION -#ifndef COMM_HIDING_COARSEOP - int sign = -1; - coarse_pn_hopping_term_PRECISION_vectorized( out, in, op, amount, l, sign, threading); -#else - coarse_n_hopping_term_PRECISION_vectorized( out, in, op, amount, l, threading ); +#ifdef HAVE_TM + tm_term_PRECISION_setup( in->mu, in->mu_even_shift, in->mu_odd_shift, op, l, threading ); +#endif +#ifdef HAVE_TM1p1 + epsbar_term_PRECISION_setup( in->epsbar, in->epsbar_ig5_even_shift, in->epsbar_ig5_odd_shift, op, l, threading ); #endif - return; -#else - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - num_4link_var=4*l->num_lattice_site_var*l->num_lattice_site_var, - num_link_var=l->num_lattice_site_var*l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - config_PRECISION D_pt; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } + coarse_oddeven_PRECISION_set_couplings( l, threading 
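/* Illustration (stand-alone sketch, not a routine from this patch): the hopping loops
 * above address a neighbor table with a stride of five entries per site -- entry 5*i
 * is the site itself, entries 5*i+1+mu (mu = T,Z,Y,X) its forward neighbors.  A
 * schematic forward sweep built on that layout, with the per-link kernel written out
 * as a plain dense multiply-accumulate (row-major blocks assumed). */
#include <complex.h>
#include <stddef.h>

static void hopping_sweep_sketch( float _Complex *out, const float _Complex *in,
                                  const float _Complex *D, const int *neighbor_table,
                                  int start, int end, int ns ) {
  const int link = ns*ns;                              /* entries per direction block */
  for ( int i=start; i<end; i++ ) {
    float _Complex *out_pt = out + (size_t)ns*neighbor_table[5*i];       /* site      */
    const float _Complex *D_site = D + (size_t)4*link*neighbor_table[5*i];
    for ( int mu=0; mu<4; mu++ ) {
      const float _Complex *in_pt = in + (size_t)ns*neighbor_table[5*i+1+mu]; /* nbr  */
      const float _Complex *D_mu  = D_site + (size_t)mu*link;
      for ( int r=0; r<ns; r++ )
        for ( int c=0; c<ns; c++ )
          out_pt[r] += D_mu[r*ns+c] * in_pt[c];        /* out += D_mu * in(neighbor)   */
    }
  }
}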
); - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 0*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 1*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 2*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index] + 3*num_link_var; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_daggered_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_pt = op->D + num_4link_var*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - - D_pt += num_link_var; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION( out_pt, in_pt, D_pt, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -#endif } -void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - START_NO_HYPERTHREADS(threading) - 
- int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) +void coarse_oddeven_free_PRECISION( level_struct *l ) { + + int nv = l->num_parent_eig_vect, vs = l->vector_size; + operator_PRECISION_struct *op = &(l->oe_op_PRECISION); - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, 
start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - in_pt = in + num_site_var*op->neighbor_table[index+X]; - D_vectorized += vectorized_link_offset; - coarse_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } + operator_PRECISION_free( op, _ODDEVEN, l ); + coarse_operator_PRECISION_free_vectorized( op, l ); - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) +#ifndef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION - END_NO_HYPERTHREADS(threading) + FREE( op->clover_oo_inv, complex_PRECISION, SQUARE(2*nv)*op->num_odd_sites ); +#ifdef HAVE_TM1p1 + FREE( op->clover_doublet_oo_inv, complex_PRECISION, SQUARE(4*nv)*op->num_odd_sites ); #endif -} - - -void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int link_offset = 2*l->num_lattice_site_var*column_offset; - int *neighbor_fw = op->neighbor_table; - int *neighbor_bw = op->backward_neighbor_table; - - int core_start; - int core_end; - - void (*coarse_hopp)(vector_PRECISION eta, vector_PRECISION phi, OPERATOR_TYPE_PRECISION *D, level_struct *l); - if(sign == +1) - coarse_hopp = coarse_hopp_PRECISION_vectorized; - else - coarse_hopp = coarse_n_hopp_PRECISION_vectorized; - - - if ( l->num_processes > 1 && op->c.comm ) { - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // prepare for sending to fw: compute hopping terms into forward boundary buffer - for ( i=core_start; inum_inner_lattice_sites) - continue; - out_pt = out + 
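/* Illustration (stand-alone sketch, not a routine from this patch): the "pn" hopping
 * interface used throughout this file folds the sign of the coupling into a single
 * routine; the variant above realizes it by picking the +D or -D per-link kernel once
 * through a function pointer instead of duplicating the whole site loop for the two
 * signs.  The kernels below are trivial placeholders for "out += D*in" / "out -= D*in". */
#include <complex.h>
#include <stddef.h>

typedef void (*hopp_fn)( float _Complex *out, const float _Complex *in, int ns );

static void add_coupling( float _Complex *out, const float _Complex *in, int ns )
{ for ( int r=0; r<ns; r++ ) out[r] += in[r]; }      /* placeholder: out += D*in */

static void sub_coupling( float _Complex *out, const float _Complex *in, int ns )
{ for ( int r=0; r<ns; r++ ) out[r] -= in[r]; }      /* placeholder: out -= D*in */

static void pn_sweep_sketch( float _Complex *out, const float _Complex *in,
                             int num_sites, int ns, int sign ) {
  hopp_fn hopp = ( sign == +1 ) ? add_coupling : sub_coupling;   /* chosen once */
  for ( int i=0; i<num_sites; i++ )
    hopp( out + (size_t)i*ns, in + (size_t)i*ns, ns );
}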
num_site_var*neighbor_fw[5*i+1+mu]; - in_pt = in + num_site_var*neighbor_fw[5*i]; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // send in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - for ( i=core_start; i= l->num_inner_lattice_sites) - continue; - D_vectorized = op->D_transformed_vectorized + 4*link_offset*neighbor_bw[5*i+1+mu] + mu*link_offset; - in_pt = in + num_site_var*neighbor_bw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - - // compute U_mu couplings - for(int mu=0; mu<4; mu++) { - D_vectorized = op->D_vectorized + 4*link_offset*neighbor_fw[5*i] + mu*link_offset; - in_pt = in + num_site_var*neighbor_fw[5*i+1+mu]; - coarse_hopp( out_pt, in_pt, D_vectorized, l ); - } - } - - - // wait for terms from bw and add them - if ( l->num_processes > 1 && op->c.comm ) { - START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) { - // wait for +mu direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - END_LOCKED_MASTER(threading) - } - else - SYNC_CORES(threading) - END_NO_HYPERTHREADS(threading) +#else + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 2*2*nv*column_offset*op->num_odd_sites ); +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*4*nv*column_doublet_offset*op->num_odd_sites ); #endif -} - - -void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - START_NO_HYPERTHREADS(threading) - - int mu, i, index, num_site_var=l->num_lattice_site_var, - start=0, num_lattice_sites=l->num_inner_lattice_sites, - plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - vector_PRECISION in_pt, out_pt; - - OPERATOR_TYPE_PRECISION *D_vectorized; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - int core_start; - int core_end; - - // assumptions (1) self coupling has already been performed - // OR (2) "out" is initialized with zeros - set_boundary_PRECISION( out, 0, l, threading ); - - if ( amount == _EVEN_SITES ) { - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - START_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // 
communicate in -mu direction - ghost_sendrecv_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_MASTER(threading) - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } else if ( amount == _ODD_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // D is applied in an input-centric way - // this makes threading a bit ugly, is there a better way? - // compute U_mu^dagger coupling - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 0*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 1*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 2*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - SYNC_CORES(threading) - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_transformed_vectorized + 4*vectorized_link_offset*op->neighbor_table[index] + 3*vectorized_link_offset; - index++; - out_pt = out + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // communicate in +mu direction - ghost_sendrecv_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - for ( mu=0; mu<4; mu++ ) { - // wait for -mu direction - ghost_wait_PRECISION( in, mu, -1, &(op->c), minus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - - if ( amount == _EVEN_SITES ) { - start = 0; num_lattice_sites = op->num_even_sites; - } else if ( amount == _ODD_SITES ) { - start = op->num_even_sites, num_lattice_sites = op->num_odd_sites; - } - compute_core_start_end_custom(start, start+num_lattice_sites, &core_start, &core_end, l, threading, 1); - - // compute U_mu couplings - for ( i=core_start; ineighbor_table[index]; - D_vectorized = op->D_vectorized + 4*vectorized_link_offset*op->neighbor_table[index]; - index++; - in_pt = in + num_site_var*op->neighbor_table[index+T]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Z]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+Y]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - - D_vectorized += vectorized_link_offset; - in_pt = in + num_site_var*op->neighbor_table[index+X]; - coarse_n_hopp_PRECISION_vectorized( out_pt, in_pt, D_vectorized, l ); - } - - START_LOCKED_MASTER(threading) - if ( op->c.comm ) { - for ( mu=0; mu<4; mu++ ) { - // wait for +mu 
direction - ghost_wait_PRECISION( out, mu, +1, &(op->c), plus_dir_param, l ); - } - } - END_LOCKED_MASTER(threading) - END_NO_HYPERTHREADS(threading) #endif + +#ifdef HAVE_TM1p1 + FREE( op->buffer[0], complex_PRECISION, 4*vs ); +#else + FREE( op->buffer[0], complex_PRECISION, 2*vs ); +#endif + FREE( op->buffer, complex_PRECISION*, 2 ); } - void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { SYNC_CORES(threading) @@ -1236,14 +637,14 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( p->b, p->x, op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); fgmres_PRECISION( p, l, threading ); // even to odd PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( p->b, p->x, op, _ODD_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); @@ -1251,80 +652,76 @@ void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECIS SYNC_CORES(threading) } - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - // start and end indices for vector functions depending on thread - int start; - int end; - // compute start and end indices for core - // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start, &end, l, threading); - + int start = op->num_even_sites*l->num_lattice_site_var; + int end = l->inner_vector_size; vector_PRECISION *tmp = op->buffer; SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start, end, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp[0], start, end, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, +1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); } void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, 
l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); + int start_even = 0, end_even = op->num_even_sites*l->num_lattice_site_var, + start_odd = end_even, end_odd = l->inner_vector_size; + int thread_start_even, thread_end_even, thread_start_odd, thread_end_odd; + compute_core_start_end_custom( start_even, end_even, &thread_start_even, &thread_end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom( start_odd, end_odd, &thread_start_odd, &thread_end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION tmp = op->buffer[0]; - SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_even, end_even, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp, start_even, end_even, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( p->b, p->b, thread_start_odd, thread_end_odd, l ); SYNC_CORES(threading) coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( p->b, p->b, thread_start_odd, thread_end_odd, l ); PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( tmp, p->x, op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); - coarse_gamma5_PRECISION( tmp, tmp, start_even, end_even, l ); + coarse_gamma5_PRECISION( tmp, tmp, thread_start_even, thread_end_even, l ); SYNC_CORES(threading) - vector_PRECISION_plus( p->b, p->b, tmp, start_even, end_even, l ); + vector_PRECISION_plus( p->b, p->b, tmp, thread_start_even, thread_end_even, l ); fgmres_PRECISION( p, l, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( p->b, p->b, start_odd, end_odd, l ); + coarse_gamma5_PRECISION( p->b, p->b, thread_start_odd, thread_end_odd, l ); SYNC_CORES(threading) coarse_diag_oo_inv_PRECISION( p->x, p->b, op, l, threading ); SYNC_CORES(threading) // even to odd PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); - SYNC_CORES(threading) - coarse_n_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); + vector_PRECISION_define_zero( tmp, start_odd, end_odd, l, threading ); + SYNC_CORES(threading); + coarse_pn_hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( p->b, tmp, op, l, threading ); - vector_PRECISION_plus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_plus( p->x, p->x, p->b, thread_start_odd, thread_end_odd, l ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) @@ -1333,9 +730,10 @@ void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PR void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); + int 
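/* Illustration (stand-alone numeric check, not from this patch): with sites ordered
 * even-first, coarse_solve_odd_even_PRECISION and coarse_apply_schur_complement_PRECISION
 * above implement the usual odd-even reduction of
 *     [ A_ee  D_eo ] [x_e]   [b_e]
 *     [ D_oe  A_oo ] [x_o] = [b_o] ,
 * handing only  S = A_ee - D_eo A_oo^{-1} D_oe  to FGMRES and reconstructing x_o
 * afterwards.  A scalar (1x1 block) stand-in that checks the algebra: */
#include <stdio.h>

int main( void ) {
  double aee = 4.0, aoo = 3.0, deo = 1.0, doe = 2.0, be = 5.0, bo = 7.0;

  double xo  = bo/aoo;                    /* x_o  = A_oo^{-1} b_o      (diag_oo_inv) */
  double rhs = be - deo*xo;               /* b_e := b_e - D_eo x_o     (hopping, -1) */
  double S   = aee - deo/aoo*doe;         /* Schur complement                        */
  double xe  = rhs/S;                     /* "FGMRES" step, here a plain division    */
  xo = (bo - doe*xe)/aoo;                 /* odd back-substitution                   */

  double det = aee*aoo - deo*doe;         /* compare against the direct 2x2 solve    */
  double xe_direct = (  aoo*be - deo*bo )/det;
  double xo_direct = ( -doe*be + aee*bo )/det;
  printf( "schur: (%g, %g)  direct: (%g, %g)\n", xe, xo, xe_direct, xo_direct );
  return 0;                               /* both print (0.8, 1.8)                   */
}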
start_even = 0, end_even = op->num_even_sites*l->num_lattice_site_var, + start_odd = end_even, end_odd = l->inner_vector_size; + int thread_start_even, thread_end_even; + compute_core_start_end_custom( start_even, end_even, &thread_start_even, &thread_end_even, l, threading, l->num_lattice_site_var ); vector_PRECISION *tmp = op->buffer; @@ -1343,20 +741,20 @@ void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_P PROF_PRECISION_START( _SC, threading ); coarse_diag_ee_PRECISION( out, in, op, l, threading ); PROF_PRECISION_STOP( _SC, 0, threading ); - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp[0], start_odd, end_odd, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, +1, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); PROF_PRECISION_START( _SC, threading ); coarse_diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, threading ); PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_n_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, tmp[1], op, _EVEN_SITES, -1, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) - coarse_gamma5_PRECISION( out, out, start_even, end_even, l ); + coarse_gamma5_PRECISION( out, out, thread_start_even, thread_end_even, l ); SYNC_CORES(threading) } @@ -1373,10 +771,10 @@ void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, // transformation part vector_PRECISION_copy( buf1, in, 0, l->inner_vector_size, l ); // even to odd - vector_PRECISION_define( out, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_zero( out, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l, no_threading ); + END_LOCKED_MASTER(threading); - coarse_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, buf1, &(l->oe_op_PRECISION), _ODD_SITES, +1, l, threading ); coarse_diag_oo_inv_PRECISION( buf2, out, &(l->oe_op_PRECISION), l, threading ); START_LOCKED_MASTER(threading) @@ -1398,13 +796,13 @@ void coarse_odd_even_PRECISION_test( vector_PRECISION out, vector_PRECISION in, if ( g.method == 6 ) { START_LOCKED_MASTER(threading) coarse_gamma5_PRECISION( out, out, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); - vector_PRECISION_define( buf1, 0, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); - coarse_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); + vector_PRECISION_define_zero( buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l, no_threading ); + coarse_pn_hopping_term_PRECISION( buf1, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, +1, l, no_threading ); coarse_gamma5_PRECISION( buf1, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); vector_PRECISION_plus( out, out, buf1, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l ); END_LOCKED_MASTER(threading) } else { - coarse_hopping_term_PRECISION( out, buf2, &(l->oe_op_PRECISION), _EVEN_SITES, l, threading ); + coarse_pn_hopping_term_PRECISION( out, buf2, 
&(l->oe_op_PRECISION), _EVEN_SITES, +1, l, threading ); } PUBLIC_FREE( buf1, complex_PRECISION, 2*l->vector_size ); diff --git a/src/coarse_oddeven_generic.h b/src/coarse_oddeven_generic.h index 807632a..2d9e687 100644 --- a/src/coarse_oddeven_generic.h +++ b/src/coarse_oddeven_generic.h @@ -23,37 +23,24 @@ #define COARSE_ODDEVEN_PRECISION_HEADER struct Thread; + + void coarse_oddeven_alloc_PRECISION( level_struct *l ); -#ifndef HAVE_TM - void coarse_selfcoupling_LU_decomposition_PRECISION( config_PRECISION output, config_PRECISION input, level_struct *l ); -#else - void coarse_selfcoupling_LU_decomposition_PRECISION( const config_PRECISION output, config_PRECISION input, config_PRECISION input_anti, level_struct *l ); -#endif - void coarse_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION A, level_struct *l ); - - void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l ); - void coarse_oddeven_re_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, struct Thread *threading ); + void coarse_oddeven_setup_PRECISION( operator_PRECISION_struct *in, int reorder, level_struct *l, + struct Thread *threading ); + void coarse_oddeven_PRECISION_set_self_couplings( level_struct *l, struct Thread *threading ); + void coarse_oddeven_free_PRECISION( level_struct *l ); - void coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - - void coarse_diag_ee_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_diag_oo_inv_PRECISION( vector_PRECISION y, vector_PRECISION x, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - void coarse_pn_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, int sign, struct Thread *threading ); - void coarse_n_hopping_term_PRECISION_vectorized( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ); - - void coarse_odd_even_PRECISION_test( vector_PRECISION c4, vector_PRECISION c1, level_struct *l, struct Thread *threading ); + void 
coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ); + void coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void g5D_coarse_solve_odd_even_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ); + void g5D_coarse_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, + operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_odd_even_PRECISION_test( vector_PRECISION c4, vector_PRECISION c1, + level_struct *l, struct Thread *threading ); #endif diff --git a/src/coarse_operator_generic.c b/src/coarse_operator_generic.c index 98a56fc..9786b5e 100644 --- a/src/coarse_operator_generic.c +++ b/src/coarse_operator_generic.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. * * This file is part of the DDalphaAMG solver library. * @@ -24,34 +24,46 @@ void coarse_operator_PRECISION_alloc( level_struct *l ) { int nd = l->next_level->num_inner_lattice_sites, - k = l->next_level->num_lattice_site_var; + k = l->next_level->num_parent_eig_vect*2; l->next_level->D_size = k*k*4*nd; l->next_level->clover_size = ((k*(k+1))/2)*nd; -#ifdef HAVE_TM l->next_level->block_size = ((k/2*(k/2+1)))*nd; -#endif operator_PRECISION_alloc( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); -} +} void coarse_operator_PRECISION_free( level_struct *l ) { operator_PRECISION_free( &(l->next_level->op_PRECISION), _ORDINARY, l->next_level ); -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - operator_PRECISION_struct *op = &(l->next_level->s_PRECISION.op); + coarse_operator_PRECISION_free_vectorized( &(l->next_level->s_PRECISION.op), l->next_level ); +} + +void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ) { + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION if( op->D_vectorized != NULL ) { - int n2 = 2*l->next_level->num_lattice_sites-l->next_level->num_inner_lattice_sites, n = l->next_level->num_inner_lattice_sites; - int column_offset = SIMD_LENGTH_PRECISION*((l->next_level->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int n2 = (l->depth>0 && l->level>0) ? 
(2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); // 2 is for complex, 4 is for 4 directions - FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->next_level->num_lattice_site_var*column_offset*n2 ); - FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->next_level->num_lattice_site_var*column_offset*n2 ); - FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->next_level->num_lattice_site_var*column_offset*n ); + FREE_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); + FREE_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*2*l->num_parent_eig_vect*column_offset*n2 ); } #endif -} +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + if( op->clover_vectorized != NULL ) { + int n = l->num_inner_lattice_sites; + int column_offset = SIMD_LENGTH_PRECISION*((2*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*2*l->num_parent_eig_vect*column_offset*n ); +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + FREE_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_parent_eig_vect*column_doublet_offset*n ); +#endif + } +#endif +} void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { @@ -61,10 +73,12 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { vector_PRECISION buffer1 = l->vbuf_PRECISION[4], buffer2 = l->vbuf_PRECISION[5]; int mu, n = l->num_eig_vect, i, j, - D_size = l->next_level->D_size, - clover_size = l->next_level->clover_size; + D_size = l->next_level->D_size, + clover_size = l->next_level->clover_size, + block_size = l->next_level->block_size; void (*aggregate_self_coupling)() = (l->depth==0)?d_plus_clover_aggregate_PRECISION:coarse_aggregate_self_couplings_PRECISION, (*aggregate_neighbor_coupling)() = (l->depth==0)?d_neighbor_aggregate_PRECISION:coarse_aggregate_neighbor_couplings_PRECISION; + void (*aggregate_block)() = (l->depth==0)?diagonal_aggregate_PRECISION:coarse_aggregate_block_diagonal_PRECISION; operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); @@ -72,17 +86,8 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { l->next_level->op_PRECISION.D[j] = _COMPLEX_PRECISION_ZERO; for ( j=0; jnext_level->op_PRECISION.clover[j] = _COMPLEX_PRECISION_ZERO; -#ifdef HAVE_TM - int block_size = l->next_level->block_size; - - void (*aggregate_tm_term)() = (l->depth==0)?diagonal_aggregate_PRECISION:coarse_aggregate_anti_block_diagonal_PRECISION, - (*aggregate_odd_proj)() = (l->depth==0)?diagonal_aggregate_PRECISION:coarse_aggregate_block_diagonal_PRECISION; - for ( j=0; jnext_level->op_PRECISION.tm_term[j] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.odd_proj[j] = _COMPLEX_PRECISION_ZERO; -#endif + l->next_level->op_PRECISION.odd_proj[j] = _COMPLEX_PRECISION_ZERO; // for all test vectors V[i]: for ( i=0; is_PRECISION), l ); // calculate selfcoupling entries of the coarse grid operator set_coarse_self_coupling_PRECISION( buffer1, buffer2, V, i, l ); -#ifdef HAVE_TM - //tm_term - aggregate_tm_term( buffer1, buffer2, V[i], l->s_PRECISION.op.tm_term, l ); - set_block_diagonal_PRECISION( buffer1, 
buffer2, V, i, l->next_level->op_PRECISION.tm_term, l ); //odd_proj - aggregate_odd_proj( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); + aggregate_block( buffer1, buffer2, V[i], l->s_PRECISION.op.odd_proj, l ); set_block_diagonal_PRECISION( buffer1, buffer2, V, i, l->next_level->op_PRECISION.odd_proj, l ); -#endif - + for ( mu=0; mu<4; mu++ ) { // finish updating ghostcells of V[i] negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); @@ -112,13 +112,45 @@ void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ) { } } + coarse_operator_PRECISION_setup_finalize( l, no_threading ); + t1 = MPI_Wtime(); if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); } +void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ) { + + int block_size = l->next_level->block_size; + + l->next_level->op_PRECISION.m0 = l->s_PRECISION.op.m0; +#ifdef HAVE_TM + //tm_term + PRECISION mf = (g.mu_factor[l->depth]) ? g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth]:0; + if ( mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_even_shift == 0 && + mf*l->s_PRECISION.op.mu + mf*l->s_PRECISION.op.mu_odd_shift == 0 ) + vector_PRECISION_define_zero( l->next_level->op_PRECISION.tm_term, 0, block_size, l->next_level, threading ); + else + tm_term_PRECISION_setup( mf*l->s_PRECISION.op.mu, mf*l->s_PRECISION.op.mu_even_shift, + mf*l->s_PRECISION.op.mu_odd_shift, &(l->next_level->op_PRECISION), + l->next_level, threading ); +#endif +#ifdef HAVE_TM1p1 + //eps_term + PRECISION ef = (g.epsbar_factor[l->depth]) ? g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth]:0; + if ( ef*l->s_PRECISION.op.epsbar == 0 && ef*l->s_PRECISION.op.epsbar_ig5_even_shift == 0 && + ef*l->s_PRECISION.op.epsbar_ig5_odd_shift == 0 ) + vector_PRECISION_define_zero( l->next_level->op_PRECISION.epsbar_term, 0, block_size, l->next_level, threading ); + else + epsbar_term_PRECISION_setup( ef*l->s_PRECISION.op.epsbar, ef*l->s_PRECISION.op.epsbar_ig5_even_shift, + ef*l->s_PRECISION.op.epsbar_ig5_odd_shift, &(l->next_level->op_PRECISION), + l->next_level, threading ); +#endif + +} + void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, - vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { + vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ) { // U(x) = [ A 0 , A=A*, D=D* // 0 D ] @@ -126,9 +158,10 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION s // suitable for tm_term and odd_proj int i, j, k, m, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2, - block_site_size = (num_eig_vect*(num_eig_vect+1)); + num_eig_vect = l->next_level->num_parent_eig_vect, + aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, + offset = l->num_parent_eig_vect, + block_site_size = (num_eig_vect*(num_eig_vect+1)); vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION block_pt; @@ -157,9 +190,10 @@ void set_coarse_self_coupling_PRECISION( vector_PRECISION spin_0_1, vector_PRECI vector_PRECISION *V, const int n, level_struct *l ) { int i, j, k, m, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, offset 
= l->num_lattice_site_var/2, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; + num_eig_vect = l->next_level->num_parent_eig_vect, + aggregate_size = l->num_inner_lattice_sites*l->num_parent_eig_vect*2/num_aggregates, + offset = l->num_parent_eig_vect, + clover_site_size = (num_eig_vect*(2*num_eig_vect+1)); vector_PRECISION spin_0_1_pt, spin_2_3_pt, interpolation_data; config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; @@ -211,8 +245,8 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P vector_PRECISION *V, const int mu, const int n, level_struct *l ) { int i, i1, j, k, k1, k2, m, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2, nlsv = l->num_lattice_site_var, + num_eig_vect = l->next_level->num_parent_eig_vect, + offset = l->num_parent_eig_vect, nlsv = l->num_parent_eig_vect*2, D_link_size = num_eig_vect*num_eig_vect*4, *index_dir = l->is_PRECISION.agg_boundary_index[mu], aggregate_boundary_sites = l->is_PRECISION.agg_boundary_length[mu]/num_aggregates; @@ -261,42 +295,52 @@ void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION spin_0_1, vector_P } } -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int n = s->num_block_sites, *length = s->dir_length, **index = s->index, - *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var; + *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var, num_eig_vect = l->num_parent_eig_vect; vector_PRECISION lphi = phi+start, leta = eta+start; - int hopp_size = 4 * SQUARE( l->num_lattice_site_var ); - config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; // site-wise self coupling - int clov_size = ( (l->num_lattice_site_var*(l->num_lattice_site_var+1))/2 ); - config_PRECISION clover = s->op.clover + (start/m)*clov_size; - coarse_self_couplings_PRECISION( leta, lphi, clover, n*m, l ); + coarse_self_couplings_PRECISION( eta, phi, &(s->op), (start/m), (start/m)+n, l); -#ifdef HAVE_TM - int tm_term_size = ( (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)) ); - config_PRECISION tm_term = s->op.tm_term + (start/m)*tm_term_size; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( leta, lphi, tm_term, n*m, l ); -#endif // inner block couplings +#ifndef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int hopp_size = 4 * SQUARE( num_eig_vect*2 ); + config_PRECISION D_pt, D = s->op.D + (start/m)*hopp_size; + for ( int mu=0; mu<4; mu++ ) { ind = index[mu]; // mu direction for ( int i=0; iop.D_vectorized + + (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; + OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + + (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; + ind = index[mu]; // mu direction + for ( int i=0; iop.D; - vector_PRECISION_define( eta1, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2, 0, 0, l->vector_size, l ); + vector_PRECISION_define_zero( eta1, 0, l->vector_size, l, no_threading ); + vector_PRECISION_define_zero( eta2, 0, l->vector_size, l, no_threading ); coarse_spinwise_self_couplings_PRECISION( eta1, eta2, phi, s->op.clover, l->inner_vector_size, l ); for ( mu=0; mu<4; mu++ ) { // direction 
mu @@ -334,8 +378,8 @@ void coarse_aggregate_neighbor_couplings_PRECISION( vector_PRECISION eta1, vecto vector_PRECISION eta1_pt, eta2_pt, phi_pt; config_PRECISION D_pt, D = s->op.D; - vector_PRECISION_define( eta1, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2, 0, 0, l->vector_size, l ); + vector_PRECISION_define_zero( eta1, 0, l->vector_size, l, no_threading ); + vector_PRECISION_define_zero( eta2, 0, l->vector_size, l, no_threading ); // requires the positive boundaries of phi to be communicated befor for ( i=0; inum_lattice_site_var, - num_eig_vect = l->num_lattice_site_var/2, - clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, - clover_step_size2 = SQUARE(l->num_lattice_site_var/2); - config_PRECISION clover_pt = clover; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - while ( phi_pt < phi_end_pt ) { - // A - mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - clover_pt += clover_step_size1; eta_pt += num_eig_vect; phi_pt += num_eig_vect; - // D - mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - clover_pt += clover_step_size1; phi_pt -= num_eig_vect; - // C = -B* - nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - phi_pt += num_eig_vect; eta_pt -= num_eig_vect; - // B - mv_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); - clover_pt += clover_step_size2; phi_pt += num_eig_vect; eta_pt += site_var; - } -} - - void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ) { int length = l->inner_vector_size, - num_eig_vect = l->num_lattice_site_var/2, + num_eig_vect = l->num_parent_eig_vect, block_step_size = (num_eig_vect * (num_eig_vect+1))/2; config_PRECISION block_pt = block; vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; @@ -389,71 +403,26 @@ void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PR // 0 D ] // storage order: upper triangle of A, upper triangle of D, columnwise // diagonal coupling - vector_PRECISION_define( eta1_pt, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2_pt, 0, 0, l->vector_size, l ); while ( phi_pt < phi_end_pt ) { // A mvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); + vector_PRECISION_define_zero( eta2_pt, 0, num_eig_vect, l, no_threading ); block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; // D + vector_PRECISION_define_zero( eta1_pt, 0, num_eig_vect, l, no_threading ); mvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; } } -void coarse_aggregate_anti_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, - config_PRECISION block, level_struct *l ) { - int length = l->inner_vector_size, - num_eig_vect = l->num_lattice_site_var/2, - block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2, phi_end_pt=phi+length; - // U(x) = [ A 0 , A=-A*, D=-D* diag. 
excluded - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, columnwise - // diagonal coupling - vector_PRECISION_define( eta1_pt, 0, 0, l->vector_size, l ); - vector_PRECISION_define( eta2_pt, 0, 0, l->vector_size, l ); - while ( phi_pt < phi_end_pt ) { - // A - amvp_PRECISION( eta1_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; - // D - amvp_PRECISION( eta2_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta1_pt += num_eig_vect; eta2_pt += num_eig_vect; phi_pt += num_eig_vect; - } -} - - -void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - config_PRECISION block, int length, level_struct *l ) { - - int num_eig_vect = l->num_lattice_site_var/2, - block_step_size = (num_eig_vect * (num_eig_vect+1))/2; - config_PRECISION block_pt = block; - vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; - // U(x) = [ A 0 , A=-A*, D=-D* diag. excluded - // 0 D ] - // storage order: upper triangle of A, upper triangle of D, columnwise - // diagonal coupling - while ( phi_pt < phi_end_pt ) { - // A - amvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; - // D - amvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); - block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; - } -} - void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION clover, int length, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, + int num_eig_vect = l->num_parent_eig_vect, clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, - clover_step_size2 = SQUARE(l->num_lattice_site_var/2); + clover_step_size2 = SQUARE(num_eig_vect); config_PRECISION clover_pt = clover; vector_PRECISION phi_pt=phi, eta1_pt=eta1, eta2_pt=eta2+num_eig_vect, phi_end_pt=phi+length; // U(x) = [ A B , A=A*, D=D*, C = -B* @@ -476,12 +445,127 @@ void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRE } } +void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ) { + + coarse_operator_PRECISION_set_neighbor_couplings( op, l, threading ); + coarse_operator_PRECISION_set_self_couplings( op, l, threading ); + +} + +void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ) { + +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int nc_size = SQUARE(l->num_parent_eig_vect*2); + int n1, n2; + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int offset_v = 4*l->num_parent_eig_vect*column_offset; + + if ( l->depth > 0 && l->level>0 ) { + n1 = l->num_lattice_sites; + n2 = 2*l->num_lattice_sites-l->num_inner_lattice_sites; + } else { + n1 = l->num_inner_lattice_sites; + n2 = l->num_inner_lattice_sites; + } + int start, end; + compute_core_start_end_custom(0, n1, &start, &end, l, threading, 1); + int n_per_core = end-start; + START_LOCKED_MASTER(threading) + if( op->D_vectorized == NULL ) { + // 2 is for complex, 4 is for 4 directions + MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); + MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 4*offset_v*n2, 64 ); + } + END_LOCKED_MASTER(threading) + + 
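
The column_offset values above all use the same round-up idiom, SIMD_LENGTH_PRECISION*((x+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION), which pads the number of matrix columns to the next SIMD multiple so that every column of the vectorized coarse links starts on an aligned boundary. Below is a minimal stand-alone sketch of just that arithmetic; the helper name round_up_to_simd and the sample numbers (nv = 20 coarse eigenvectors, SIMD length 8) are illustrative only and not taken from the library.

  #include <stdio.h>

  /* round x up to the next multiple of simd, i.e. the
     SIMD_LENGTH_PRECISION*((x+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION)
     pattern used for column_offset above */
  static int round_up_to_simd( int x, int simd ) {
    return simd * ( ( x + simd - 1 ) / simd );
  }

  int main( void ) {
    int nv = 20, simd = 8;                                 /* illustrative values only */
    int column_offset = 2 * round_up_to_simd( nv, simd );  /* factor 2 for complex, as the comment above notes */
    int offset_v = 4 * nv * column_offset;                 /* mirrors the offset_v expression above */
    printf( "padded columns: %d, column_offset: %d, offset_v: %d\n",
            round_up_to_simd( nv, simd ), column_offset, offset_v );
    return 0;
  }
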
copy_coarse_operator_to_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_transformed_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + // vectorize negative boundary + if ( n2>n1 ) { + compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1); + n_per_core = end-start; + copy_coarse_operator_to_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( + op->D + 4*start*nc_size, + op->D_transformed_vectorized + 4*start*offset_v, + n_per_core, l->num_parent_eig_vect); + } + SYNC_CORES(threading) +#endif + +} + +void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, + struct Thread *threading ) { + +#ifdef OPTIMIZED_COARSE_SELF_COUPLING_PRECISION + int n = l->num_inner_lattice_sites, nv = l->num_parent_eig_vect; + int sc_size = (nv)*(nv*2+1); + int start, end; + compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); + int n_per_core = end-start; -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION -void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { } -void coarse_operator_PRECISION_set_couplings_clover( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { } + int column_offset = SIMD_LENGTH_PRECISION*((2*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int offset_v = 2*2*nv*column_offset; + if( op->clover_vectorized == NULL ) { + START_LOCKED_MASTER(threading) + MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, offset_v*n, 64 ); + END_LOCKED_MASTER(threading) + } + copy_coarse_operator_clover_to_vectorized_layout_PRECISION( + op->clover + start*sc_size, + op->clover_vectorized + start*offset_v, + n_per_core, nv); +#ifdef HAVE_TM + int tm_size = (nv)*(nv+1); + if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) + add_tm_term_to_vectorized_layout_PRECISION( + op->tm_term + start*tm_size, + op->clover_vectorized + start*offset_v, + n_per_core, nv); #endif +#ifdef HAVE_TM1p1 + int column_doublet_offset = SIMD_LENGTH_PRECISION*((4*nv+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int offset_doublet_v = 2*4*nv*column_doublet_offset; + int eps_size = (nv)*(nv+1); + if( op->clover_doublet_vectorized == NULL ) { + START_LOCKED_MASTER(threading) + MALLOC_HUGEPAGES( op->clover_doublet_vectorized, OPERATOR_TYPE_PRECISION, offset_doublet_v*n, 64 ); + END_LOCKED_MASTER(threading) + } + copy_coarse_operator_clover_to_doublet_vectorized_layout_PRECISION( + op->clover + start*sc_size, + op->clover_doublet_vectorized + start*offset_doublet_v, + n_per_core, nv); + if ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) + add_epsbar_term_to_doublet_vectorized_layout_PRECISION( + op->epsbar_term + start*eps_size, + op->clover_doublet_vectorized + start*offset_doublet_v, + n_per_core, nv); +#ifdef HAVE_TM + if ( op->mu + op->mu_odd_shift != 0.0 || op->mu + op->mu_even_shift != 0.0 ) + add_tm_term_to_doublet_vectorized_layout_PRECISION( + op->tm_term + start*tm_size, + op->clover_doublet_vectorized + start*offset_doublet_v, + n_per_core, nv); +#endif +#endif + SYNC_CORES(threading) +#endif +} + void coarse_gamma5_PRECISION( 
vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { int j, k=l->num_lattice_site_var/2; @@ -513,24 +597,66 @@ void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int st } } -#ifndef VECTORIZE_COARSE_OPERATOR_PRECISION +void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ) { + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int j, k=l->num_lattice_site_var/4; + vector_PRECISION eta_end; + + eta_end = eta+end; + phi += start; + eta += start; + + ASSERT( eta != phi ); + while ( eta < eta_end ) { + phi += k; + for ( j=0; jclover, l->inner_vector_size, l ); -#ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - coarse_add_anti_block_diagonal_PRECISION( eta, phi, op->tm_term, l->inner_vector_size, l ); -#endif - END_LOCKED_MASTER(threading) + int start; + int end; + compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &end, l, threading, 1); + + coarse_self_couplings_PRECISION( eta, phi, op, start, end, l); + PROF_PRECISION_STOP( _SC, 1, threading ); PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); + + coarse_pn_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, +1, l, threading ); + PROF_PRECISION_STOP( _NC, 1, threading ); } -#endif void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { @@ -557,7 +683,7 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr if ( !l->idle ) { int vs = l->vector_size, ivs = l->inner_vector_size, cvs = l->next_level->vector_size, civs = l->next_level->inner_vector_size; - double diff; + PRECISION diff = 0; vector_PRECISION vp1=NULL, vp2, vp3, vp4, vc1=NULL, vc2, vc3; PUBLIC_MALLOC( vp1, complex_PRECISION, 4*vs ); @@ -568,133 +694,175 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr vp2 = vp1 + vs; vp3 = vp2 + vs; vp4 = vp3 + vs; vc2 = vc1 + cvs; vc3 = vc2 + cvs; START_LOCKED_MASTER(threading) -#ifdef INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION - double norm = 0.0; - double dot = 0.0; - float *op = (float *)l->is_PRECISION.operator; - float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; - for ( int i=0; iinner_vector_size; i++ ) - norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - for ( int i=0; iinner_vector_size; i++ ) - dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); - diff = dot/norm; +#ifdef HAVE_TM1p1 + if(g.n_flavours == 1) +#endif + { +#ifdef OPTIMIZED_INTERPOLATION_OPERATOR_PRECISION + double norm = 0.0; + double dot = 0.0; + float *op = (float *)l->is_PRECISION.operator; + float *op2 = (float *)(l->is_PRECISION.operator+0*SIMD_LENGTH_PRECISION*l->vector_size)+1; + for ( int i=0; iinner_vector_size; i++ ) + norm += (op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op[2*i*SIMD_LENGTH_PRECISION+0] + I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); + for ( int i=0; iinner_vector_size; i++ ) + dot += (op[2*i*SIMD_LENGTH_PRECISION+0] + 
I*op[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION])*conj(op2[2*i*SIMD_LENGTH_PRECISION+0] + I*op2[2*i*SIMD_LENGTH_PRECISION+SIMD_LENGTH_PRECISION]); + diff = dot/norm; #else - diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) - / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); + diff = global_inner_product_PRECISION( l->is_PRECISION.interpolation[0], l->is_PRECISION.interpolation[1], 0, ivs, l, no_threading ) + / global_norm_PRECISION( l->is_PRECISION.interpolation[0], 0, ivs, l, no_threading ); #endif - printf0("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of block_gram_schmidt: %le\n", l->depth, cabs(diff) ); + } if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); + vector_PRECISION_define_random( vc1, 0, civs, l->next_level, no_threading ); vector_PRECISION_distribute( vc2, vc1, l->next_level ); vector_PRECISION_gather( vc3, vc2, l->next_level ); if ( !l->next_level->idle ) { vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); } - printf0("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - + test0_PRECISION("depth: %d, correctness of gather( distribute( phi_c ) ) : %le\n", l->depth, diff ); + if ( !l->next_level->idle ) - vector_PRECISION_define_random( vc1, 0, civs, l->next_level ); - interpolate3_PRECISION( vp1, vc1, l, no_threading ); + vector_PRECISION_define_random( vc1, 0, civs, l->next_level, no_threading ); + vector_PRECISION_define_zero( vp1, 0, ivs, l, no_threading ); + interpolate_PRECISION( vp1, vc1, l, no_threading ); restrict_PRECISION( vc2, vp1, l, no_threading ); if ( !l->next_level->idle ) { vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c: %le\n", l->depth, abs_PRECISION(diff) ); } - - END_LOCKED_MASTER(threading) + + END_LOCKED_MASTER(threading); if(threading->n_core>1) { - interpolate3_PRECISION( vp1, vc1, l, threading ); + vector_PRECISION_define_zero( vp1, 0, ivs, l, threading ); + interpolate_PRECISION( vp1, vc1, l, threading ); restrict_PRECISION( vc2, vp1, l, threading ); START_LOCKED_MASTER(threading) if ( !l->next_level->idle ) { vector_PRECISION_minus( vc3, vc1, vc2, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of ( P* P - 1 ) phi_c with threading: %le\n", l->depth, diff ); } END_LOCKED_MASTER(threading) } -#ifdef HAVE_TM - int tm_site_size = (l->next_level->num_lattice_site_var/2*(l->next_level->num_lattice_site_var/2+1)); - config_PRECISION tm_term=NULL; - PUBLIC_MALLOC( tm_term, complex_PRECISION, 
tm_site_size*l->next_level->num_inner_lattice_sites ); + START_LOCKED_MASTER(threading) + if (l->depth==0) + gamma5_PRECISION( vp2, vp1, l, no_threading ); + else + coarse_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + if ( !l->next_level->idle ) { + vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( g5_c P* g5 P - 1 ) phi_c: %le\n", l->depth, diff ); + } +#ifdef HAVE_TM1p1 + if(g.n_flavours == 2) { + if (l->depth==0) + tau1_gamma5_PRECISION( vp2, vp1, l, no_threading ); + else + coarse_tau1_gamma5_PRECISION( vp2, vp1, 0, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + coarse_tau1_gamma5_PRECISION( vc3, vc2, 0, civs, l->next_level ); + if ( !l->next_level->idle ) { + vector_PRECISION_minus( vc2, vc1, vc3, 0, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( tau1 g5_c P* tau1 g5 P - 1 ) phi_c: %le\n", l->depth, diff ); + } + } +#endif + END_LOCKED_MASTER(threading) START_LOCKED_MASTER(threading) - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) { - tm_term_PRECISION_setup( tm_term, l->next_level->s_PRECISION.op.odd_proj, l->next_level, no_threading ); + vector_PRECISION_define_zero( vp2, 0, ivs, l, no_threading ); + if (l->depth==0) + add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs ); + else + coarse_add_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.odd_proj, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + + vector_PRECISION_scale( vc2, vc2, -1.0, 0, civs, l->next_level, no_threading ); + coarse_add_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.odd_proj, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( P* 1odd P - 1odd_c ) phi_c: %le\n", l->depth, diff ); + END_LOCKED_MASTER(threading) + +#ifdef HAVE_TM + START_LOCKED_MASTER(threading) + if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + vector_PRECISION_define_zero( vp2, 0, ivs, l, no_threading ); + if (l->depth==0) + add_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs ); + else + coarse_add_anti_block_diagonal_PRECISION( vp2, vp1, l->s_PRECISION.op.tm_term, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); - vector_PRECISION_define( vc2, 0, 0, civs, l ); - vector_PRECISION_define( vc3, 0, 0, civs, l ); + vector_PRECISION_scale( vc2, vc2, -g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth], 0, civs, l->next_level, no_threading ); coarse_add_anti_block_diagonal_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.tm_term, civs, l->next_level ); - coarse_add_anti_block_diagonal_PRECISION( vc3, vc1, tm_term, civs, l->next_level ); - - vector_PRECISION_minus( vc3, vc3, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of building tm_term: %le\n", l->next_level->depth, diff ); - if(diff > g.test) 
g.test = diff; + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( P* tm P - tm_c ) phi_c: %le\n", l->depth, diff ); } - END_LOCKED_MASTER(threading) - - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - if(threading->n_core>1) { - - tm_term_PRECISION_setup( tm_term, l->next_level->s_PRECISION.op.odd_proj, l->next_level, no_threading ); - - START_LOCKED_MASTER(threading) - vector_PRECISION_define( vc3, 0, 0, civs, l ); - coarse_add_anti_block_diagonal_PRECISION( vc3, vc1, tm_term, civs, l->next_level ); - - vector_PRECISION_minus( vc3, vc3, vc2, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); - printf0("depth: %d, correctness of building tm_term with threading: %le\n", l->next_level->depth, diff ); - if(diff > g.test) g.test = diff; - END_LOCKED_MASTER(threading) - } - - PUBLIC_FREE( tm_term, complex_PRECISION, tm_site_size*l->next_level->num_inner_lattice_sites ); + END_LOCKED_MASTER(threading) +#endif +#ifdef HAVE_TM1p1 + START_LOCKED_MASTER(threading) + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) { + vector_PRECISION_define_zero( vp2, 0, ivs, l, no_threading ); + if (l->depth==0) + apply_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs ); + else + coarse_add_doublet_coupling_PRECISION( vp2, vp1, l->s_PRECISION.op.epsbar_term, ivs, l ); + restrict_PRECISION( vc2, vp2, l, no_threading ); + + vector_PRECISION_scale( vc2, vc2, -g.epsbar_factor[l->next_level->depth]/g.epsbar_factor[l->depth], 0, civs, l->next_level, no_threading ); + coarse_add_doublet_coupling_PRECISION( vc2, vc1, l->next_level->s_PRECISION.op.epsbar_term, civs, l->next_level ); + diff = global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc1, 0, civs, l->next_level, no_threading ); + test0_PRECISION("depth: %d, correctness of ( P* eps P - eps_c ) phi_c: %le\n", l->depth, diff ); + } + END_LOCKED_MASTER(threading) #endif - + if ( l->level > 0 ) { START_LOCKED_MASTER(threading) - interpolate3_PRECISION( vp1, vc1, l, no_threading ); - apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + vector_PRECISION_define_zero( vp1, 0, ivs, l, no_threading ); + interpolate_PRECISION( vp1, vc1, l, no_threading ); + apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); + #ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - if (g.tm_mu_factor[l->depth] != g.tm_mu_factor[l->next_level->depth]) { - vector_PRECISION_scale( vp3, vp1, (g.tm_mu_factor[l->next_level->depth]/g.tm_mu_factor[l->depth])-1., - 0, ivs, l->next_level ); - if(l->depth == 0) - add_diagonal_PRECISION( vp2, vp3, l->s_PRECISION.op.tm_term, ivs ); - else - coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, l->s_PRECISION.op.tm_term, ivs, l ); - } -#endif - + if (g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + if (g.mu_factor[l->depth] != g.mu_factor[l->next_level->depth]) { + vector_PRECISION_scale( vp3, vp1, (g.mu_factor[l->next_level->depth]/g.mu_factor[l->depth])-1., 0, ivs, l, no_threading ); + if(l->depth == 0) + add_diagonal_PRECISION( vp2, vp3, l->p_PRECISION.op->tm_term, ivs ); + else + coarse_add_anti_block_diagonal_PRECISION( vp2, vp3, 
l->p_PRECISION.op->tm_term, ivs, l ); + } +#endif restrict_PRECISION( vc2, vp2, l, no_threading ); + if ( !l->next_level->idle ) { if ( l->level==1 && g.odd_even ) coarse_odd_even_PRECISION_test( vc3, vc1, l->next_level, no_threading ); else apply_operator_PRECISION( vc3, vc1, &(l->next_level->p_PRECISION), l->next_level, no_threading ); - + vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); - diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) /global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); + if ( l->level==1 && g.odd_even ) { - printf0("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); } else { - printf0("depth: %d, correctness of ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } + test0_PRECISION("depth: %d, correctness of ( P* D P - D_c ) phi_c: %le\n", l->depth, diff ); + } } END_LOCKED_MASTER(threading) @@ -710,21 +878,19 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr vector_PRECISION_minus( vc3, vc2, vc3, 0, civs, l->next_level ); diff = global_norm_PRECISION( vc3, 0, civs, l->next_level, no_threading ) / global_norm_PRECISION( vc2, 0, civs, l->next_level, no_threading ); if ( l->level==1 && g.odd_even ) { //TODO: this test doesn't work without SSE!! - printf0("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } else { - printf0("depth: %d, correctness of ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } - } + test0_PRECISION("depth: %d, correctness of odd even preconditioned ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); + } else { + test0_PRECISION("depth: %d, correctness of ( P* D P - D_c ) phi_c with D_c threaded: %le\n", l->depth, diff ); + } + } END_LOCKED_MASTER(threading) - } + } } START_LOCKED_MASTER(threading) - + /* if ( l->level > 0 && l->depth > 0 && g.method == 3 && g.odd_even ) { - vector_PRECISION_define_random( vp1, 0, ivs, l ); + vector_PRECISION_define_random( vp1, 0, ivs, l, no_threading ); block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); coarse_diag_ee_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); coarse_diag_oo_PRECISION( vp3, vp4, &(l->oe_op_PRECISION), l, no_threading ); @@ -733,8 +899,7 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr apply_operator_PRECISION( vp2, vp1, &(l->p_PRECISION), l, no_threading ); vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); - printf0("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; + test0_PRECISION("depth: %d, correctness of odd even layout (smoother): %le\n", l->depth, diff ); block_to_oddeven_PRECISION( vp4, vp1, l, no_threading ); coarse_odd_even_PRECISION_test( vp3, vp4, l, no_threading ); @@ -742,10 +907,9 @@ void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *thr apply_operator_PRECISION( vp2, vp1, 
&(l->p_PRECISION), l, no_threading ); vector_PRECISION_minus( vp4, vp4, vp2, 0, ivs, l ); diff = global_norm_PRECISION( vp4, 0, ivs, l, no_threading ) / global_norm_PRECISION( vp2, 0, ivs, l, no_threading ); - printf0("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); - if(diff > g.test) g.test = diff; - } - + test0_PRECISION("depth: %d, correctness of odd even preconditioned operator (smoother): %le\n", l->depth, diff ); + } + */ FREE( vp1, complex_PRECISION, 4*vs ); FREE( vc1, complex_PRECISION, 3*cvs ); END_LOCKED_MASTER(threading) diff --git a/src/coarse_operator_generic.h b/src/coarse_operator_generic.h index c730412..8b42e66 100644 --- a/src/coarse_operator_generic.h +++ b/src/coarse_operator_generic.h @@ -22,27 +22,29 @@ #ifndef COARSE_OPERATOR_PRECISION_HEADER #define COARSE_OPERATOR_PRECISION_HEADER - #include "blas_vectorized.h" + #include "simd_blas_PRECISION.h" struct Thread; void coarse_operator_PRECISION_alloc( level_struct *l ); void coarse_operator_PRECISION_free( level_struct *l ); + void coarse_operator_PRECISION_free_vectorized( operator_PRECISION_struct *op, level_struct *l ); void coarse_operator_PRECISION_setup( vector_PRECISION *V, level_struct *l ); + void coarse_operator_PRECISION_setup_finalize( level_struct *l, struct Thread *threading ); void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void coarse_operator_PRECISION_set_couplings_clover( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void coarse_operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void set_coarse_self_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, vector_PRECISION *V, const int n, level_struct *l ); void set_coarse_neighbor_coupling_PRECISION( vector_PRECISION buffer1, vector_PRECISION buffer2, vector_PRECISION *V, const int mu, const int n, level_struct *l ); - void coarse_self_couplings_PRECISION( vector_PRECISION eta, config_PRECISION clover, - vector_PRECISION phi, int length, level_struct *l ); void coarse_spinwise_self_couplings_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION clover, int length, level_struct *l ); void coarse_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); + void coarse_tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int end, level_struct *l ); void apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_apply_coarse_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, @@ -59,12 +61,7 @@ void set_block_diagonal_PRECISION( vector_PRECISION spin_0_1, vector_PRECISION spin_2_3, vector_PRECISION *V, const int n, config_PRECISION block, level_struct *l ); void coarse_aggregate_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); - - void coarse_aggregate_anti_block_diagonal_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION block, level_struct *l ); - - void coarse_add_anti_block_diagonal_PRECISION( 
vector_PRECISION eta, vector_PRECISION phi, config_PRECISION block, int length, level_struct *l ); - - + void coarse_operator_PRECISION_test_routine( level_struct *l, struct Thread *threading ); // eta += D*phi, D stored columnwise @@ -124,8 +121,42 @@ } } + // eta += D*phi, D hermitian and stored columnwise packed + static inline void pmvp_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, + const vector_PRECISION phi, const register int n ) { + register int i, j, k; + + eta[0] += D[0]*phi[0]; + for ( i=1, k=1; inum_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here + // eta -= D*phi, D anti-hermitian and stored columnwise packed + static inline void mamvp_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, + const vector_PRECISION phi, const register int n ) { + register int i, j, k; - // A - nmv_PRECISION( eta, D, phi, num_eig_vect ); - // C - eta += num_eig_vect; - D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - // B - phi += num_eig_vect; - eta -= num_eig_vect; - D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); - // D - eta += num_eig_vect; - D += num_eig_vect2; - nmv_PRECISION( eta, D, phi, num_eig_vect ); + eta[0] -= D[0]*phi[0]; + for ( i=1, k=1; inum_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - // A* - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - // -C* - phi += num_eig_vect; - D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - // -B* - eta += num_eig_vect; - phi -= num_eig_vect; - D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); - // D* - phi += num_eig_vect; - D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); + int site_var = l->num_lattice_site_var, + num_eig_vect = l->num_parent_eig_vect, + clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2, + clover_step_size2 = SQUARE(num_eig_vect); + config_PRECISION clover_pt = clover; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ A B , A=A*, D=D*, C = -B* + // C D ] + // storage order: upper triangle of A, upper triangle of D, B, columnwise + // diagonal coupling +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + while ( phi_pt < phi_end_pt ) { + // A + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//1 + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + // D + eta_pt += num_eig_vect;//2 + phi_pt += num_eig_vect;//2 + clover_pt += clover_step_size1; + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//3 + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + // C = -B* + eta_pt -= num_eig_vect;//2 + phi_pt -= 3*num_eig_vect;//0 + clover_pt += clover_step_size1; + nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//3 + phi_pt += num_eig_vect;//1 + nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + // B + eta_pt -= 3*num_eig_vect;//0 + phi_pt += num_eig_vect;//2 + mv_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect;//1 + phi_pt += num_eig_vect;//3 + mv_PRECISION( eta_pt, clover_pt, 
phi_pt, num_eig_vect ); + eta_pt += 3*num_eig_vect;//4 + phi_pt += num_eig_vect;//4 + clover_pt += clover_step_size2; + } + } else +#endif + while ( phi_pt < phi_end_pt ) { + // A + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + clover_pt += clover_step_size1; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + mvp_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + clover_pt += clover_step_size1; phi_pt -= num_eig_vect; + // C = -B* + nmvh_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + phi_pt += num_eig_vect; eta_pt -= num_eig_vect; + // B + mv_PRECISION( eta_pt, clover_pt, phi_pt, num_eig_vect ); + clover_pt += clover_step_size2; phi_pt += num_eig_vect; eta_pt += site_var; + } } - - static inline void coarse_n_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - config_PRECISION D, level_struct *l ) { - - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - // A - mv_PRECISION( eta, D, phi, num_eig_vect ); - // C - eta += num_eig_vect; - D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - // B - phi += num_eig_vect; - eta -= num_eig_vect; - D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); - // D - eta += num_eig_vect; - D += num_eig_vect2; - mv_PRECISION( eta, D, phi, num_eig_vect ); + static inline void coarse_add_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION block, int length, level_struct *l ) { + + int num_eig_vect = l->num_parent_eig_vect, + block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + config_PRECISION block_pt = block; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ A 0 , A=A*, D=D* diag. 
excluded + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, columnwise + // diagonal coupling +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + while ( phi_pt < phi_end_pt ) { + // A + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } + } else +#endif + while ( phi_pt < phi_end_pt ) { + // A + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pmvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } } - static inline void coarse_n_daggered_hopp_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - config_PRECISION D, level_struct *l ) { + static inline void coarse_add_anti_block_diagonal_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION block, int length, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - // A* - mvh_PRECISION( eta, D, phi, num_eig_vect ); - // -C* - phi += num_eig_vect; - D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - // -B* - eta += num_eig_vect; - phi -= num_eig_vect; - D += num_eig_vect2; - nmvh_PRECISION( eta, D, phi, num_eig_vect ); - // D* - phi += num_eig_vect; - D += num_eig_vect2; - mvh_PRECISION( eta, D, phi, num_eig_vect ); + int num_eig_vect = l->num_parent_eig_vect, + block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + config_PRECISION block_pt = block; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ A 0 , A=-A*, D=-D* diag. 
excluded + // 0 D ] + // storage order: upper triangle of A, upper triangle of D, columnwise + // diagonal coupling +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + while ( phi_pt < phi_end_pt ) { + // A + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + eta_pt += num_eig_vect; phi_pt += num_eig_vect; + mamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } + } else +#endif + while ( phi_pt < phi_end_pt ) { + // A + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + // D + pamvp_PRECISION( eta_pt, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += num_eig_vect; phi_pt += num_eig_vect; + } } + static inline void coarse_add_doublet_coupling_PRECISION( vector_PRECISION eta, vector_PRECISION phi, + config_PRECISION block, int length, level_struct *l ) { + +#ifdef HAVE_TM1p1 + int num_eig_vect = l->num_parent_eig_vect, + block_step_size = (num_eig_vect * (num_eig_vect+1))/2; + config_PRECISION block_pt = block; + vector_PRECISION phi_pt=phi, eta_pt=eta, phi_end_pt=phi+length; + // U(x) = [ 0 A , A=-A*, D=-D* diag. excluded + // D 0 ] + // storage order: upper triangle of A, upper triangle of D, columnwise + // diagonal coupling + + while ( phi_pt < phi_end_pt ) { + // A + pamvp_PRECISION( eta_pt, block_pt, phi_pt+num_eig_vect, num_eig_vect ); + pamvp_PRECISION( eta_pt+num_eig_vect, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += 2*num_eig_vect; phi_pt += 2*num_eig_vect; + // D + pamvp_PRECISION( eta_pt, block_pt, phi_pt+num_eig_vect, num_eig_vect ); + pamvp_PRECISION( eta_pt+num_eig_vect, block_pt, phi_pt, num_eig_vect ); + block_pt += block_step_size; eta_pt += 2*num_eig_vect; phi_pt += 2*num_eig_vect; + } +#else + warning0("coarse_add_doublet_coupling_PRECISION called without HAVE_TM1p1 defined.\n"); + return; +#endif +} + static inline void coarse_spinwise_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -281,8 +388,8 @@ static inline void coarse_spinwise_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -309,8 +416,8 @@ static inline void coarse_spinwise_n_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // 
U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D @@ -336,8 +443,8 @@ static inline void coarse_spinwise_n_daggered_hopp_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION D, level_struct *l ) { - int num_eig_vect = l->num_lattice_site_var/2, - num_eig_vect2 = SQUARE(l->num_lattice_site_var/2); + int num_eig_vect = l->num_parent_eig_vect, + num_eig_vect2 = SQUARE(l->num_parent_eig_vect); // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* // C D ] -B* D* ] // storage order: A, C, B, D diff --git a/src/data_generic.c b/src/data_generic.c deleted file mode 100644 index 950c814..0000000 --- a/src/data_generic.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -// vector storage for PRECISION precision -void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_PRECISION_START( _SET ); - if ( phi != NULL ) { - int i; - for ( i=start; ischwarz_vector_size = 2*l->vector_size - l->inner_vector_size; } +void data_layout_n_flavours( int nf, level_struct *l, struct Thread *threading ) { + + ASSERT(nf>0); + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + ASSERT(nf<=2); + + if( g.n_flavours == nf ) + return; + else + g.n_flavours = nf; + + START_LOCKED_MASTER(threading) + struct level_struct *l_tmp = l; + + while(1) { + if(l_tmp->depth == 0) + l_tmp->num_lattice_site_var = nf * 12; + else + l_tmp->num_lattice_site_var = nf * 2 * l_tmp->num_parent_eig_vect; + + l_tmp->inner_vector_size = l_tmp->num_inner_lattice_sites * l_tmp->num_lattice_site_var; + + l_tmp->vector_size = l_tmp->num_lattice_sites * l_tmp->num_lattice_site_var; + l_tmp->schwarz_vector_size = 2*l_tmp->vector_size - l_tmp->inner_vector_size; + + if(l_tmp->depth == 0) { + g.p.v_end = l_tmp->inner_vector_size; + g.p_MP.sp.v_end = l_tmp->inner_vector_size; + g.p_MP.dp.v_end = l_tmp->inner_vector_size; + } + + if ( g.mixed_precision ) { + l_tmp->s_float.block_vector_size = l_tmp->s_float.num_block_sites*l_tmp->num_lattice_site_var; + l_tmp->p_float.v_end = l_tmp->inner_vector_size; + l_tmp->sp_float.v_end = l_tmp->inner_vector_size; + l_tmp->dummy_p_float.v_end = l_tmp->inner_vector_size; + if ( (g.method >= 4 && g.odd_even) || (!l_tmp->idle && l_tmp->level == 0 && g.odd_even) ) { + if ( l_tmp->level == 0 ) + l_tmp->p_float.v_end = l_tmp->oe_op_float.num_even_sites*l_tmp->num_lattice_site_var; + else + l_tmp->sp_float.v_end = l_tmp->oe_op_float.num_even_sites*l_tmp->num_lattice_site_var; + } + + } else { + l_tmp->s_double.block_vector_size = 
l_tmp->s_double.num_block_sites*l_tmp->num_lattice_site_var; + l_tmp->p_double.v_end = l_tmp->inner_vector_size; + l_tmp->sp_double.v_end = l_tmp->inner_vector_size; + l_tmp->dummy_p_double.v_end = l_tmp->inner_vector_size; + if ( (g.method >= 4 && g.odd_even) || (!l_tmp->idle && l_tmp->level == 0 && g.odd_even) ) { + if ( l_tmp->level == 0 ) + l_tmp->p_double.v_end = l_tmp->oe_op_double.num_even_sites*l_tmp->num_lattice_site_var; + else + l_tmp->sp_double.v_end = l_tmp->oe_op_double.num_even_sites*l_tmp->num_lattice_site_var; + } + } + + if ( l->level == 0 || l_tmp->next_level == NULL ) + break; + + l_tmp = l_tmp->next_level; + } + + update_threading( no_threading, l); + END_LOCKED_MASTER(threading) + + update_threading( threading, l); +#else + ASSERT(nf==1); +#endif + +} void define_eot( int *eot, int *N, level_struct *l ) { diff --git a/src/data_layout.h b/src/data_layout.h index 566fe51..d9c3e7a 100644 --- a/src/data_layout.h +++ b/src/data_layout.h @@ -23,6 +23,7 @@ #define DATA_LAYOUT_HEADER void data_layout_init( level_struct *l ); + void data_layout_n_flavours( int n, level_struct *l, struct Thread *threading ); void define_eot( int *eot, int *N, level_struct *l ); void define_eo_bt( int **bt, int *eot, int *n_ebs, int *n_obs, int *n_bs, int *N, level_struct *l ); void define_nt_bt_tt( int *nt, int *backward_nt, int **bt, int *tt, int *it, int *dt, level_struct *l ); diff --git a/src/dirac.c b/src/dirac.c index 2afb74a..fec73d5 100644 --- a/src/dirac.c +++ b/src/dirac.c @@ -23,35 +23,37 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { int i, j, t, z, y, x, mu, nu; + operator_double_struct *op = &(g.op_double); -#ifdef HAVE_TM - - l->tm_shift = g.tm_mu; - l->tm_even_shift = g.tm_mu_even_shift; - l->tm_odd_shift = g.tm_mu_odd_shift; - - vector_double_define( g.op_double.odd_proj, _COMPLEX_double_ZERO, 0, l->inner_vector_size, l ); - vector_double_define( g.op_double.tm_term, I*(l->tm_shift + l->tm_even_shift), - 0, l->inner_vector_size, l ); + op->m0 = g.m0; for ( mu=0; mu<4; mu++ ) - g.op_double.oe_offset += (l->local_lattice[mu]*(g.my_coords[mu]/l->comm_offset[mu]))%2; - g.op_double.oe_offset = g.op_double.oe_offset%2; - + op->oe_offset += (l->local_lattice[mu]*(g.my_coords[mu]/l->comm_offset[mu]))%2; + op->oe_offset = op->oe_offset%2; + for ( i=0,t=0; tlocal_lattice[T]; t++ ) for ( z=0; zlocal_lattice[Z]; z++ ) for ( y=0; ylocal_lattice[Y]; y++ ) - for ( x=0; xlocal_lattice[X]; x++ ){ - if((t+z+y+x+g.op_double.oe_offset)%2) //odd - for ( j=0; j<12; j++, i++){ - g.op_double.odd_proj[i]=1; - g.op_double.tm_term[i]+=I*(l->tm_odd_shift - l->tm_even_shift); - } - else - i+=12; - } - - gamma5_double( g.op_double.tm_term, g.op_double.tm_term, l, no_threading); + for ( x=0; xlocal_lattice[X]; x++ ){ + if((t+z+y+x+op->oe_offset)%2) { //odd + FOR12(op->odd_proj[i] = 1; i++;); + } else { + FOR12(op->odd_proj[i] = _COMPLEX_double_ZERO; i++;); + } + } + +#ifdef HAVE_TM + if ( g.mu + g.mu_even_shift == 0 && g.mu + g.mu_odd_shift == 0 ) + vector_double_define_zero( op->tm_term, 0, l->inner_vector_size, l, no_threading ); + else + tm_term_double_setup( g.mu, g.mu_even_shift, g.mu_odd_shift, op, l, no_threading ); +#endif + +#ifdef HAVE_TM1p1 + if ( g.epsbar == 0 && g.epsbar_ig5_even_shift == 0 && g.epsbar_ig5_odd_shift == 0 ) + vector_double_define_zero( op->epsbar_term, 0, l->inner_vector_size, l, no_threading ); + else + epsbar_term_double_setup( g.epsbar, g.epsbar_ig5_even_shift, g.epsbar_ig5_odd_shift, op, l, no_threading ); #endif // generate clover term @@ -63,7 +65,7 @@ 
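
The odd projector filled in above is purely diagonal: each lattice site is classified by the parity of its coordinates, shifted by op->oe_offset (which accounts for where the local sub-lattice starts in the global lattice), and all twelve spin-colour components of an odd site are set to one while even sites stay at zero. A minimal stand-alone sketch of that classification follows; the lattice extents, the oe_offset value and the helper name site_is_odd are illustrative only and not part of the library API.

  #include <stdio.h>

  #define NCOMP 12   /* spin-colour components per site, cf. FOR12 above */

  /* parity test playing the role of (t+z+y+x+op->oe_offset)%2 above */
  static int site_is_odd( int t, int z, int y, int x, int oe_offset ) {
    return ( t + z + y + x + oe_offset ) % 2;
  }

  int main( void ) {
    int L[4] = { 2, 2, 2, 2 };   /* illustrative local lattice */
    int oe_offset = 1;           /* e.g. a process whose local origin sits on an odd global site */
    double odd_proj[2*2*2*2*NCOMP];
    int i = 0, n_odd = 0;
    for ( int t=0; t<L[0]; t++ )
      for ( int z=0; z<L[1]; z++ )
        for ( int y=0; y<L[2]; y++ )
          for ( int x=0; x<L[3]; x++ ) {
            int odd = site_is_odd( t, z, y, x, oe_offset );
            for ( int c=0; c<NCOMP; c++, i++ )
              odd_proj[i] = odd ? 1.0 : 0.0;
            n_odd += odd;
          }
    printf( "odd sites: %d of %d\n", n_odd, L[0]*L[1]*L[2]*L[3] );
    return 0;
  }
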
void compute_clover_term ( SU3_storage U, level_struct *l ) { j = 42*l->num_inner_lattice_sites; for ( i=0; iclover[i] = 0; i = 0; for ( t=1; tlocal_lattice[T]+1; t++ ) for ( z=1; zlocal_lattice[Z]+1; z++ ) @@ -71,12 +73,12 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { for ( x=1; xlocal_lattice[X]+1; x++ ) { // diagonal including the shift for ( j=0; j<12; j++) - g.op_double.clover[42*i+j] = 4+l->dirac_shift; + op->clover[42*i+j] = 4+op->m0; for ( mu=0; mu<4; mu++ ) for ( nu=mu+1; nu<4; nu++ ) { Qdiff( Qstore, mu, nu, t, z, y, x, U ); - set_clover( Qstore, mu, nu, i, g.op_double.clover ); + set_clover( Qstore, mu, nu, i, op->clover ); } i++; } @@ -84,7 +86,7 @@ void compute_clover_term ( SU3_storage U, level_struct *l ) { mat_free( &Qstore, 3 ); spin_free( 4, 4 ); } else { - vector_double_define( g.op_double.clover, 4+l->dirac_shift, 0, l->inner_vector_size, l ); + vector_double_define_real( op->clover, 4+op->m0, 0, l->inner_vector_size, l, no_threading ); } } @@ -108,7 +110,7 @@ void dirac_setup( config_double hopp, level_struct *l ) { if ( g.print > 0 ) printf0("%s\n", CLIFFORD_BASIS ); if ( g.bc == _ANTIPERIODIC ) printf0("antiperiodic in time"); else if ( g.bc == _TWISTED ) printf0("twisted (%.2f, %.2f, %.2f, %.2f)", g.twisted_bc[0], - g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); + g.twisted_bc[1], g.twisted_bc[2], g.twisted_bc[3]); else printf0("periodic in time"); printf0(" boundary conditions\n"); @@ -124,36 +126,37 @@ void dirac_setup( config_double hopp, level_struct *l ) { if (t 0 ) printf0("Configuration stored...\n"); - + compute_clover_term( U, l ); // calculate the plaquette @@ -650,143 +653,151 @@ void define_odd_even_table( level_struct *l ) { } -void scale_clover( operator_double_struct *op, double scale_even, double scale_odd, level_struct *l ) { - - int i, j, n = l->num_inner_lattice_sites, *odd_even_table = g.odd_even_table; - double factors[2]; - config_double clover=op->clover, clover_pt; - - factors[_EVEN] = scale_even; factors[_ODD] = scale_odd; - - if ( g.csw != 0.0 ) { - for ( i=0; idepth == 0) { + m0_update_double( m0, &(g.op_double), l, threading ); + m0_update_float( m0, &(g.op_float), l, threading ); } else { - for ( i=0; iop_float), l, threading ); + else + m0_update_double( m0, &(l->op_double), l, threading ); } + + if ( g.mixed_precision ) { + m0_update_float( m0, &(l->oe_op_float), l, threading ); + m0_update_float( m0, &(l->s_float.op), l, threading ); + } else { + m0_update_double( m0, &(l->oe_op_double), l, threading ); + m0_update_double( m0, &(l->s_double.op), l, threading ); + } + + START_LOCKED_MASTER(threading) + if(g.print>0) printf0("depth: %d, kappa updated to %f \n", (l->depth), 0.5/(m0 + 4.)); + END_LOCKED_MASTER(threading) + + if ( g.interpolation && l->level > 0 && l->next_level != NULL ) + m0_update(m0, l->next_level, threading); } -void shift_update( complex_double shift, level_struct *l, struct Thread *threading ) { - ASSERT(l->depth == 0); - shift_update_double( &(g.op_double), shift, l, threading ); - shift_update_float( &(g.op_float), shift, l, threading ); - shift_update_double( &(l->s_double.op), shift, l, threading ); - shift_update_float( &(l->s_float.op), shift, l, threading ); +void tm_term_update( double mu, level_struct *l, struct Thread *threading ) { - if ( g.mixed_precision ) - operator_updates_float( l, threading ); - else - operator_updates_double( l, threading ); +#ifdef HAVE_TM + double factor = g.mu_factor[l->depth]; + double even_shift = g.mu_even_shift, odd_shift = g.mu_odd_shift; + + if 
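// --- editor's note (illustrative sketch, not part of the patch) -------------
// m0_update() adds the difference new_m0 - old_m0 to every stored self-coupling
// diagonal (4 + m0 on the fine grid) and recurses into l->next_level; the
// message it prints uses the usual hopping-parameter relation
// kappa = 1 / ( 2*(m0 + 4) ). A toy conversion:
#include <stdio.h>

static double kappa_from_m0( double m0 )    { return 0.5 / ( m0 + 4.0 ); }
static double m0_from_kappa( double kappa ) { return 0.5 / kappa - 4.0; }

int main( void ) {
  double m0 = -0.5;                                    // toy value
  printf( "m0=%g -> kappa=%g\n", m0, kappa_from_m0( m0 ) );
  printf( "kappa=%g -> m0=%g\n", kappa_from_m0( m0 ), m0_from_kappa( kappa_from_m0( m0 ) ) );
  return 0;
}
// ----------------------------------------------------------------------------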
(l->depth == 0) { // we don't use the multiplicative factor here + tm_term_double_setup( mu, even_shift, odd_shift, &(g.op_double), l, threading ); + tm_term_float_setup( mu, even_shift, odd_shift, &(g.op_float), l, threading ); + } else { + if ( g.mixed_precision ) + tm_term_float_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->op_float), l, threading ); + else + tm_term_double_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->op_double), l, threading ); + } + + if ( g.mixed_precision ) { + tm_term_float_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->oe_op_float), l, threading ); + tm_term_float_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->s_float.op), l, threading ); + } else { + tm_term_double_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->oe_op_double), l, threading ); + tm_term_double_setup( factor*mu, factor*even_shift, factor*odd_shift, &(l->s_double.op), l, threading ); + } - START_LOCKED_MASTER(threading) - l->dirac_shift = shift; - l->real_shift = creal(shift); - END_LOCKED_MASTER(threading) + START_MASTER(threading) + if(g.print>0) { + if( g.mu_even_shift == g.mu_odd_shift ) + printf0("depth: %d, mu updated to %f \n", (l->depth), factor*(mu+even_shift)); + else + printf0("depth: %d, mu updated to %f on even sites and %f on odd sites \n", l->depth, factor*(mu+even_shift), + factor*(mu+odd_shift)); + } + END_MASTER(threading) -#ifdef DEBUG - test_routine( l, threading ); + if ( g.interpolation && l->level > 0 && l->next_level != NULL ) + tm_term_update( mu, l->next_level, threading ); #endif } -void optimized_shift_update( complex_double mass_shift, level_struct *l, struct Thread *threading ) { - - ASSERT(l->depth==0); - - if ( mass_shift != l->dirac_shift ) { - shift_update_double( &(g.op_double), mass_shift, l, threading ); - shift_update_float( &(g.op_float), mass_shift, l, threading ); - if(l->s_double.op.clover != NULL) - shift_update_double( &(l->s_double.op), mass_shift, l, threading ); - if ( l->s_float.op.clover != NULL ) - shift_update_float( &(l->s_float.op), mass_shift, l, threading ); +void epsbar_term_update( level_struct *l, struct Thread *threading ) { - START_LOCKED_MASTER(threading) - l->dirac_shift = mass_shift; - l->real_shift = creal(mass_shift); - END_LOCKED_MASTER(threading) - } +#ifdef HAVE_TM1p1 + double factor = g.epsbar_factor[l->depth]; + double epsbar = g.epsbar; + double even_shift = g.epsbar_ig5_even_shift, odd_shift = g.epsbar_ig5_odd_shift; + + if (l->depth == 0) { + epsbar_term_double_setup( epsbar, even_shift, odd_shift, &(g.op_double), l, threading ); + epsbar_term_float_setup( epsbar, even_shift, odd_shift, &(g.op_float), l, threading ); + } else { + if ( g.mixed_precision ) + epsbar_term_float_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->op_float), l, threading ); + else + epsbar_term_double_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->op_double), l, threading ); + } -#ifdef HAVE_TM - if ( l->tm_shift != g.tm_mu || l->tm_even_shift != g.tm_mu_even_shift || - l->tm_odd_shift != g.tm_mu_odd_shift ) { + if ( g.mixed_precision ) { + epsbar_term_float_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->oe_op_float),l, threading ); + epsbar_term_float_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->s_float.op), l, threading ); + } else { + epsbar_term_double_setup( factor*epsbar, factor*even_shift, factor*odd_shift, &(l->oe_op_double),l, threading ); + epsbar_term_double_setup( factor*epsbar, factor*even_shift, 
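// --- editor's note (illustrative sketch, not part of the patch) -------------
// tm_term_update() rescales the twisted-mass parameter per level: coarse grids
// multiply mu and the even/odd shifts by g.mu_factor[l->depth], while the
// fine-grid global operator uses them as given (g.mu_factor[0] is usually
// chosen as 1, so both views agree there; that default is an assumption of
// this note). A toy print of the effective values reported by the routine:
#include <stdio.h>

static void print_effective_mu( double mu, double even_shift, double odd_shift,
                                const double *mu_factor, int n_levels ) {
  for ( int depth = 0; depth < n_levels; depth++ ) {
    double f = mu_factor[depth];
    printf( "depth %d: mu_even=%f mu_odd=%f\n",
            depth, f * ( mu + even_shift ), f * ( mu + odd_shift ) );
  }
}

int main( void ) {
  double factors[3] = { 1.0, 2.0, 4.0 };   // toy per-depth factors
  print_effective_mu( 0.01, 0.0, 0.0, factors, 3 );
  return 0;
}
// ----------------------------------------------------------------------------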
factor*odd_shift, &(l->s_double.op), l, threading ); + } - START_MASTER(threading) - if( g.tm_mu_even_shift == g.tm_mu_odd_shift ) - printf0("depth: %d, updating mu to %f \n", (l->depth), cimag(g.tm_mu+g.tm_mu_even_shift)); + START_MASTER(threading) + if(g.print>0) { + if( even_shift == odd_shift ) + printf0("depth: %d, epsbar term updated to %f + ig5 %f \n", l->depth, factor*epsbar, factor*even_shift); else - printf0("depth: %d, updating mu to %f on even sites and %f on odd sites \n", l->depth, cimag(g.tm_mu+g.tm_mu_even_shift), cimag(g.tm_mu+g.tm_mu_even_shift)); - - l->tm_shift = g.tm_mu; - l->tm_even_shift = g.tm_mu_even_shift; - l->tm_odd_shift = g.tm_mu_odd_shift; - END_LOCKED_MASTER(threading) - - tm_term_double_setup( g.op_double.tm_term, g.op_double.odd_proj, l, threading ); - tm_term_float_setup( g.op_float.tm_term, g.op_float.odd_proj, l, threading ); - - if(l->s_double.op.tm_term != NULL) - tm_term_double_setup( l->s_double.op.tm_term, l->s_double.op.odd_proj, l, threading ); - - if ( l->s_float.op.tm_term != NULL ) - tm_term_float_setup( l->s_float.op.tm_term, l->s_float.op.odd_proj, l, threading ); + printf0("depth: %d, epsbar term updated to %f + ig5 %f on even sites and + ig5 %f on odd sites \n", l->depth, + factor*epsbar, factor*even_shift, factor*odd_shift); } -#endif - - START_LOCKED_MASTER(threading) - if(l->s_double.op.clover != NULL) { -#ifdef OPTIMIZED_SELF_COUPLING_double - if ( g.csw != 0 ) { - double *clover_vectorized_pt = l->s_double.op.clover_vectorized; - config_double clover_pt = l->s_double.op.clover; - config_double tm_term_pt = l->s_double.op.tm_term; - for ( int i=0; inum_inner_lattice_sites; i++ ) { - sse_set_clover_double( clover_vectorized_pt, clover_pt ); - sse_add_diagonal_clover_double( clover_vectorized_pt, tm_term_pt ); - clover_pt += 42; - tm_term_pt += 12; - clover_vectorized_pt += 144; - } - } -#endif - if ( g.odd_even ) - schwarz_double_oddeven_setup( &(l->s_double.op), l ); - } + END_MASTER(threading) - if ( l->s_float.op.clover != NULL ) { -#ifdef OPTIMIZED_SELF_COUPLING_float - if ( g.csw != 0 ) { - config_double clover_pt = g.op_double.clover; - config_double tm_term_pt = g.op_double.tm_term; - for ( int i=0; inum_inner_lattice_sites; i++ ) { - //we have to reorder the term, while in OPTIMIZED_SELF_COUPLING_double we use already reordered terms - float *clover_vectorized_pt = l->s_float.op.clover_vectorized + 144*l->s_float.op.translation_table[i]; - sse_set_clover_float( clover_vectorized_pt, clover_pt ); - sse_add_diagonal_clover_float( clover_vectorized_pt, tm_term_pt ); - clover_pt += 42; - tm_term_pt += 12; - } - } + if ( g.interpolation && l->level > 0 && l->next_level != NULL ) + epsbar_term_update( l->next_level, threading ); #endif - if ( g.odd_even ) - schwarz_float_oddeven_setup( &(l->s_float.op), l ); - } - END_LOCKED_MASTER(threading) +} - if ( g.mixed_precision ) - optimized_shift_update_float( mass_shift, l->next_level, threading ); - else - optimized_shift_update_double( mass_shift, l->next_level, threading ); +void finalize_operator_update( level_struct *l, struct Thread *threading ) { + + if (l->depth == 0) { + START_LOCKED_MASTER(threading) + if(l->s_double.op.clover != NULL) { + operator_double_set_self_couplings( &(l->s_double.op), l ); + if ( g.odd_even ) + schwarz_double_oddeven_setup( &(l->s_double), l ); + } + + if ( l->s_float.op.clover != NULL ) { + operator_float_set_self_couplings( &(l->s_float.op), l ); + if ( g.odd_even ) + schwarz_float_oddeven_setup( &(l->s_float), l ); + } + END_LOCKED_MASTER(threading) + 
} else { + SYNC_CORES(threading) + if ( g.mixed_precision ) { + if ( !l->idle && g.odd_even && ((g.method >= 4 && l->level > 0) || l->level == 0) ) + coarse_oddeven_float_set_self_couplings( l, threading ); + else + coarse_operator_float_set_self_couplings( &(l->s_float.op), l, threading ); + } else { + if ( !l->idle && g.odd_even && ((g.method >= 4 && l->level > 0) || l->level == 0) ) + coarse_oddeven_double_set_self_couplings( l, threading ); + else + coarse_operator_double_set_self_couplings( &(l->s_double.op), l, threading ); + } + } + if ( g.interpolation && l->level > 0 ) + finalize_operator_update( l->next_level, threading ); + #ifdef DEBUG - if ( l->depth == 0 ) + if (l->depth == 0) test_routine( l, threading ); #endif + } diff --git a/src/dirac.h b/src/dirac.h index f239c51..3b65c89 100644 --- a/src/dirac.h +++ b/src/dirac.h @@ -27,7 +27,7 @@ struct Thread; typedef complex_double ******SU3_storage; void compute_clover_term ( SU3_storage U, level_struct *l ); -void dirac_setup( config_double hopp, level_struct *l ); + void dirac_setup( config_double hopp, level_struct *l ); void SU3_storage_alloc( SU3_storage *U, level_struct *l ); void SU3_storage_free( SU3_storage *U, level_struct *l ); @@ -47,8 +47,9 @@ void dirac_setup( config_double hopp, level_struct *l ); void set_clover( complex_double *q_store, int mu, int nu, int index, config_double clover ); void define_odd_even_table( level_struct *l ); - void scale_clover( operator_double_struct *op, double scale_even, double scale_odd, level_struct *l ); - void shift_update( complex_double shift, level_struct *l, struct Thread *threading ); - void optimized_shift_update( complex_double mass_shift, level_struct *l, struct Thread *threading ); + void m0_update( double m0, level_struct *l, struct Thread *threading ); + void tm_term_update( double mu, level_struct *l, struct Thread *threading ); + void epsbar_term_update( level_struct *l, struct Thread *threading ); + void finalize_operator_update( level_struct *l, struct Thread *threading ); #endif diff --git a/src/dirac_generic.c b/src/dirac_generic.c index 02c5e93..41de506 100644 --- a/src/dirac_generic.c +++ b/src/dirac_generic.c @@ -21,26 +21,149 @@ #include "main.h" -void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, int length, +void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ) { - - vector_PRECISION eta_end = eta + length; + + int nv = l->num_lattice_site_var; + vector_PRECISION lphi = phi+start, leta = eta+start; + vector_PRECISION leta_end = eta+end; + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_START( _SC ); + END_MASTER(threading) +#endif + +#ifdef HAVE_TM + config_PRECISION tm_term = op->tm_term+(start/nv)*12; +#endif + if ( g.csw == 0.0 ) { - while ( eta < eta_end ) { - FOR12( *eta = (*phi)*(*clover); eta++; phi++; clover++; ) + + config_PRECISION clover = op->clover+(start/nv)*12; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = (*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + FOR6( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + clover -= 6; + tm_term -= 6; + FOR6( *leta = 
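// --- editor's note (illustrative sketch, not part of the patch) -------------
// For csw == 0 the self coupling is a pure 12-component diagonal per site, and
// the twisted-mass term is simply added to it; in the flavour doublet the sign
// of the TM term flips between the two flavours (the tau_3 structure of the
// Nf=2 twisted-mass operator). A plain-C version of that diagonal, assuming
// the doublet site layout used by the patch (flavour 1 / flavour 2 interleaved
// per spin half):
#include <complex.h>

static void doublet_diagonal( double complex *eta, const double complex *phi,
                              const double complex *clover, const double complex *tm ) {
  for ( int h = 0; h < 2; h++ )        // the two spin halves
    for ( int f = 0; f < 2; f++ )      // the two flavours
      for ( int c = 0; c < 6; c++ ) {
        int k = 12*h + 6*f + c;        // position in the 24-component doublet site
        int d =  6*h + c;              // position in the 12-entry diagonals
        eta[k] = phi[k] * ( clover[d] + ( f ? -tm[d] : tm[d] ) );
      }
}
// With csw != 0 the 6x6 clover blocks are applied by (doublet_)site_clover_*
// instead, and the +/- tm term is added to the result afterwards.
// ----------------------------------------------------------------------------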
(*lphi)*((*clover)-(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + clover -= 6; + FOR6( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); + } + } else { +#endif +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) { + while ( leta < leta_end ) + FOR12( *leta = (*lphi)*((*clover)+(*tm_term)); leta++; lphi++; clover++; tm_term++; ); + } else +#endif + while ( leta < leta_end ) + FOR12( *leta = (*lphi)*(*clover); leta++; lphi++; clover++; ); +#ifdef HAVE_TM1p1 } +#endif + } else { - START_MASTER(threading) - PROF_PRECISION_START( _SC ); - END_MASTER(threading) - while ( eta < eta_end ) { - site_clover_PRECISION( eta, phi, clover ); - eta+=12; phi+=12; clover+=42; + +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + config_PRECISION clover = op->clover+(start/nv)*42; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + clover+=42; + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -=(*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + FOR6( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + tm_term -= 6; + FOR6( *leta -= (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + } + else +#endif + while ( leta < leta_end ) { + doublet_site_clover_PRECISION( leta, lphi, clover ); + leta+=24; lphi+=24; + clover+=42; + } + } else { +#endif +#ifdef HAVE_TM + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) + while ( leta < leta_end ) { + site_clover_PRECISION( leta, lphi, clover ); + FOR12( *leta += (*lphi)*(*tm_term); leta++; lphi++; tm_term++; ); + clover+=42; + } + else +#endif + while ( leta < leta_end ) { + site_clover_PRECISION( leta, lphi, clover ); + leta+=12; lphi+=12; + clover+=42; + } +#ifdef HAVE_TM1p1 } - START_MASTER(threading) - PROF_PRECISION_STOP( _SC, 1 ); - END_MASTER(threading) +#endif + +#else + +#ifdef HAVE_TM1p1 + PRECISION *clover = ( g.n_flavours == 2 ) ? 
op->clover_doublet_vectorized : op->clover_vectorized; +#else + PRECISION *clover = op->clover_vectorized; +#endif + clover += start*12; + while ( leta < leta_end ) { // tm_term included in the clover vectorized + site_clover_vectorized_PRECISION( (PRECISION*) leta, (PRECISION*) lphi, clover ); + leta += 3*SIMD_LENGTH_PRECISION; lphi += 3*SIMD_LENGTH_PRECISION; + clover += 12*3*SIMD_LENGTH_PRECISION; + } + +#endif + } + +#ifdef HAVE_TM1p1 + config_PRECISION eps_term = op->epsbar_term+(start/nv)*12; + lphi = phi+start, leta = eta+start; + if ( g.n_flavours == 2 && + ( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0 ) ) + while ( leta < leta_end ) { + lphi += 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi -= 12; + eps_term -= 6; + FOR6( *leta += (*lphi)*(*eps_term); leta++; lphi++; eps_term++; ) + lphi += 6; + } +#endif + + +#ifdef PROFILING + START_MASTER(threading) + PROF_PRECISION_STOP( _SC, 1 ); + END_MASTER(threading) +#endif + } static void spin0and1_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, level_struct *l ) { @@ -78,112 +201,315 @@ static void spin2and3_clover_PRECISION( vector_PRECISION eta, vector_PRECISION p } } -#if !defined(OPTIMIZED_NEIGHBOR_COUPLING_PRECISION) && !defined(OPTIMIZED_SELF_COUPLING_PRECISION) void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) - int i, n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table; + int n = s->num_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION clover = (g.csw==0.0)?s->op.clover+start:s->op.clover+(start/12)*42; - int j, k, *ind; - complex_PRECISION buf1[25]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2=buf1+6, *buf3=buf2+6, *buf4=buf3+6; - config_PRECISION D_pt; - config_PRECISION D = s->op.D + (start/12)*36; - + // clover term - clover_PRECISION( leta, lphi, clover, 12*n, l, no_threading ); -#ifdef HAVE_TM - config_PRECISION tm_term = s->op.tm_term+start; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_diagonal_PRECISION( leta, lphi, tm_term, 12*n ); -#endif - - // inner block couplings - ind = index[T]; // T direction - for ( i=0; iop), start, start+nv*n, l, no_threading ); + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // block operator vectorized just in the float environment + PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; + PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; + for ( int mu=0; mu<4; mu++ ) { + block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); + block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); } - ind = index[X]; // X direction - for ( i=0; iop.D + (start/nv)*36; +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION buf1[50]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2=buf1+12, *buf3=buf2+12, *buf4=buf3+12; + // inner block couplings + ind = index[T]; // T direction + for ( i=0; inum_inner_lattice_sites, *neighbor = op->neighbor_table, start, end; + int n = 
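// --- editor's note (illustrative sketch, not part of the patch) -------------
// The epsbar term of the TM1p1 doublet is flavour off-diagonal: within each
// spin half, flavour 1 of eta picks up eps * (flavour 2 of phi) and vice
// versa, which is what the pointer arithmetic at the end of clover_PRECISION()
// does. Plain-C version for one 24-component doublet site:
#include <complex.h>

static void add_epsbar_coupling( double complex *eta, const double complex *phi,
                                 const double complex *eps /* 12 entries per site */ ) {
  for ( int h = 0; h < 2; h++ )        // spin halves
    for ( int c = 0; c < 6; c++ ) {
      int f1 = 12*h + c, f2 = 12*h + 6 + c, d = 6*h + c;
      eta[f1] += eps[d] * phi[f2];     // flavour 1 <- flavour 2
      eta[f2] += eps[d] * phi[f1];     // flavour 2 <- flavour 1
    }
}
// ----------------------------------------------------------------------------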
l->num_inner_lattice_sites, *neighbor = op->neighbor_table, start, end, nv = l->num_lattice_site_var; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; + complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; +#else int i, j, *nb_pt; - complex_PRECISION pbuf[6]; vector_PRECISION phi_pt, eta_pt, end_pt; config_PRECISION D_pt; - compute_core_start_end(0, 12*n, &start, &end, l, threading ); - vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION clover = (g.csw==0.0)?op->clover+start:op->clover+(start/12)*42; - +#endif + + compute_core_start_end(0, nv*n, &start, &end, l, threading ); + SYNC_MASTER_TO_ALL(threading) - // clover term - clover_PRECISION( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_diagonal_PRECISION( leta, lphi, op->tm_term+start, end-start ); -#endif - + clover_PRECISION(eta, phi, op, start, end, l, threading ); + START_MASTER(threading) PROF_PRECISION_START( _NC ); END_MASTER(threading) + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprp_PRECISION( prn, phi, start, end ); +#else + complex_PRECISION pbuf[12]; + for ( i=start/2, phi_pt=phi+start; iprnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprn_su3_PRECISION( prp, phi, op, neighbor, start, end ); +#else + // project plus dir and multiply with U dagger + for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+((start/nv)*36), nb_pt=neighbor+((start/nv)*4); phi_ptprpT+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpT+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + j = 12*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+j+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + j = 12*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+j+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + j = 12*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+j+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+j+9, D_pt, pbuf+9 ); D_pt += 9; + } +#endif + + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, 
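// --- editor's note (illustrative sketch, not part of the patch) -------------
// d_plus_clover_PRECISION() hides communication behind computation: the
// projected boundary data is posted with ghost_sendrecv_*() early and only
// collected with ghost_wait_*() right before it is consumed. A minimal MPI
// sketch of that post/overlap/wait pattern (this is NOT the library's
// communication code; function and variable names are illustrative):
#include <mpi.h>

static void exchange_halo( double *send, double *recv, int count,
                           int rank_up, int rank_down, MPI_Comm comm ) {
  MPI_Request req[2];
  MPI_Irecv( recv, count, MPI_DOUBLE, rank_down, 0, comm, &req[0] );  // post receive
  MPI_Isend( send, count, MPI_DOUBLE, rank_up,   0, comm, &req[1] );  // post send
  /* ... interior work overlaps with the transfer here ... */
  MPI_Waitall( 2, req, MPI_STATUSES_IGNORE );                         // wait before use
}
// Because only the (1 +/- gamma_mu)-projected half spinors are exchanged, each
// direction sends 6 complex numbers per boundary site (12 for the doublet).
// ----------------------------------------------------------------------------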
l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_dpbp_PRECISION( eta, prn, op, neighbor, start, end ); +#else + // multiply with U and lift up minus dir + for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+(start/nv)*36, nb_pt=neighbor+(start/nv)*4; eta_ptprnT+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+j+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + j = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+j+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + j = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+j+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + j = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+j ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+j+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+j+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); + END_LOCKED_MASTER(threading) + + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dpbn_PRECISION( eta, prp, start, end ); +#else + for ( i=start/2, eta_pt=eta+start; iprpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif + } else { +#endif + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prp_PRECISION( prn, phi, start, end ); +#else + complex_PRECISION pbuf[6]; for ( i=start/2, phi_pt=phi+start; iprnT+i, phi_pt ); prp_Z_PRECISION( op->prnZ+i, phi_pt ); prp_Y_PRECISION( op->prnY+i, phi_pt ); prp_X_PRECISION( op->prnX+i, phi_pt ); } +#endif // start communication in negative direction START_LOCKED_MASTER(threading) ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); @@ -193,6 +519,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); +#else for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpX+j, D_pt, pbuf ); mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; } +#endif // start communication in positive direction START_LOCKED_MASTER(threading) @@ -230,6 +560,9 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // 
multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_pbp_PRECISION( eta, prn, op, neighbor, start, end ); +#else for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnX+j+3 ); pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; } +#endif // wait for communication in positive direction START_LOCKED_MASTER(threading) @@ -262,40 +596,394 @@ void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operat END_LOCKED_MASTER(threading) // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + pbn_PRECISION( eta, prp, start, end ); +#else for ( i=start/2, eta_pt=eta+start; iprpT+i, eta_pt ); pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); } - +#endif +#ifdef HAVE_TM1p1 + } +#endif + START_MASTER(threading) PROF_PRECISION_STOP( _NC, 1 ); END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) } -#endif - - -void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - gamma5_PRECISION( l->vbuf_PRECISION[6], phi, l, threading ); - d_plus_clover_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], op, l, threading ); - gamma5_PRECISION( eta, l->vbuf_PRECISION[7], l, threading ); -} void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + ASSERT(l->depth == 0); + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; eta += threading->start_index[l->depth]; phi += threading->start_index[l->depth]; +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + while ( eta < eta_end ) { + FOR12( *eta = -(*phi); phi++; eta++; ) + FOR12( *eta = (*phi); phi++; eta++; ) + } + } else +#endif while ( eta < eta_end ) { FOR6( *eta = -(*phi); phi++; eta++; ) FOR6( *eta = (*phi); phi++; eta++; ) } } +void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + complex_PRECISION b[6]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + while ( eta < eta_end ) { + int i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = -(*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = - b[i] ; eta++; i++; ); + i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = b[i] ; eta++; i++; ); + } + } else +#endif + { + START_MASTER(threading) + warning0("tau1_gamma5_PRECISION called with g.n_flavours != 2\n"); + END_MASTER(threading) + gamma5_PRECISION( eta, phi, l, threading ); + } +} + +void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD) { + FOR24( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_EVEN) { + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD) { + FOR12( *eta = (*phi); phi++; eta++; ); + } 
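// --- editor's note (illustrative sketch, not part of the patch) -------------
// gamma5 acts per site by flipping the sign of the first spin pair; for the
// doublet, tau1 x gamma5 additionally swaps the two flavours. Plain-C versions
// for one site, assuming the doublet layout [f1 spins 0-1 | f2 spins 0-1 |
// f1 spins 2-3 | f2 spins 2-3]:
#include <complex.h>

static void apply_gamma5_site( double complex *eta, const double complex *phi ) {
  for ( int c = 0; c < 6; c++ ) eta[c]     = -phi[c];      // spins 0-1
  for ( int c = 0; c < 6; c++ ) eta[6 + c] =  phi[6 + c];  // spins 2-3
}

static void apply_tau1_gamma5_site( double complex *eta, const double complex *phi ) {
  for ( int c = 0; c < 6; c++ ) { eta[c]      = -phi[6 + c];  eta[6 + c]  = -phi[c];      }
  for ( int c = 0; c < 6; c++ ) { eta[12 + c] =  phi[18 + c]; eta[18 + c] =  phi[12 + c]; }
}
// The *_set_even_to_zero / *_set_odd_to_zero variants do the same while
// writing zeros on the sites of the other parity, using g.odd_even_table.
// ----------------------------------------------------------------------------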
+ else if(g.odd_even_table[i]==_EVEN) { + FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } +} + +void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD){ + FOR12( *eta = -(*phi); phi++; eta++; ); + FOR12( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD){ + FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_EVEN){ + FOR12( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } +} + +void tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + + complex_PRECISION b[6]; + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_ODD){ + int i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = -(*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = - b[i] ; eta++; i++; ); + i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = b[i] ; eta++; i++; ); + } else if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + } else +#endif + { + START_MASTER(threading) + warning0("tau1_gamma5_set_even_to_zero_PRECISION called with g.n_flavours != 2\n"); + END_MASTER(threading) + gamma5_set_even_to_zero_PRECISION( eta, phi, l, threading ); + } +} + +void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = 0; phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN) { + FOR12( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD) { + FOR12( *eta = 0; phi++; eta++; ); + } + i++; + } +} + +void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR12( *eta = -(*phi); phi++; eta++; ); + FOR12( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = 0; phi++; eta++; 
); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR6( *eta = -(*phi); phi++; eta++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR12( *eta = 0; phi++; eta++; ); + } + i++; + } +} + +void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { + + ASSERT(l->depth == 0); + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + + complex_PRECISION b[6]; + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + int i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = -(*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = - b[i] ; eta++; i++; ); + i = 0; + FOR6( b[i] = (*phi); phi++; i++; ); + FOR6( *eta = (*phi); phi++; eta++; ); + i = 0; + FOR6( *eta = b[i] ; eta++; i++; ); + } else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = _COMPLEX_PRECISION_ZERO; phi++; eta++; ); + } + i++; + } + } else +#endif + { + START_MASTER(threading) + warning0("tau1_gamma5_set_odd_to_zero_PRECISION called with g.n_flavours != 2\n"); + END_MASTER(threading) + gamma5_set_odd_to_zero_PRECISION( eta, phi, l, threading ); + } +} + +void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + level_struct *l, struct Thread *threading ) { + + int i = threading->start_site[l->depth]; + vector_PRECISION eta_end = eta + threading->end_index[l->depth]; + eta += threading->start_index[l->depth]; + phi += threading->start_index[l->depth]; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN){ + FOR24( *eta = even*(*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD){ + FOR24( *eta = odd*(*phi); phi++; eta++; ); + } + i++; + } + else +#endif + while ( eta < eta_end ) { + if(g.odd_even_table[i]==_EVEN) { + FOR12( *eta = even*(*phi); phi++; eta++; ); + } + else if(g.odd_even_table[i]==_ODD) { + FOR12( *eta = odd*(*phi); phi++; eta++; ); + } + i++; + } +} + + +void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { + +#ifdef HAVE_TM1p1 + + /* + * Order: spin0and1 of flav1 + * spin0and1 of flav2 + * spin2and3 of flav1 + * spin2and3 of flav2 + */ + vector_PRECISION serial_end; + + if( g.n_flavours == 2 ) { + serial_end = serial + threading->end_index[l->depth]; + serial += threading->start_index[l->depth]; + flav1 += threading->start_index[l->depth]/2; + flav2 += threading->start_index[l->depth]/2; + } + else { + serial_end = serial + threading->end_index[l->depth]*2; + serial += threading->start_index[l->depth]*2; + flav1 += threading->start_index[l->depth]; + flav2 += threading->start_index[l->depth]; + } + + while ( serial < serial_end ) { + FOR6( *serial = (*flav1); serial++; flav1++; ) + FOR6( *serial = (*flav2); serial++; flav2++; ) + FOR6( *serial = (*flav1); serial++; flav1++; ) + FOR6( *serial = (*flav2); serial++; flav2++; ) + } +#else + START_MASTER(threading) + warning0("two_flavours_to_serial_PRECISION called without HAVE_TM1p1 defined\n"); + END_MASTER(threading) +#endif + +} + +void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ) { + 
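// --- editor's note (illustrative sketch, not part of the patch) -------------
// two_flavours_to_serial_PRECISION() interleaves the two single-flavour
// vectors site by site in the order stated in its comment: spins 0-1 of
// flavour 1, spins 0-1 of flavour 2, spins 2-3 of flavour 1, spins 2-3 of
// flavour 2. Per-site version in plain C (serial_to_two_flavours_* is the
// exact inverse):
#include <complex.h>

static void interleave_flavours_site( double complex *serial,
                                      const double complex *flav1,
                                      const double complex *flav2 ) {
  for ( int h = 0; h < 2; h++ )          // spin halves
    for ( int c = 0; c < 6; c++ ) {
      serial[12*h + c]     = flav1[6*h + c];
      serial[12*h + 6 + c] = flav2[6*h + c];
    }
}
// ----------------------------------------------------------------------------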
+#ifdef HAVE_TM1p1 + vector_PRECISION serial_end; + + if( g.n_flavours == 2 ) { + serial_end = serial + threading->end_index[l->depth]; + serial += threading->start_index[l->depth]; + flav1 += threading->start_index[l->depth]/2; + flav2 += threading->start_index[l->depth]/2; + } + else { + serial_end = serial + threading->end_index[l->depth]*2; + serial += threading->start_index[l->depth]*2; + flav1 += threading->start_index[l->depth]; + flav2 += threading->start_index[l->depth]; + } + + while ( serial < serial_end ) { + FOR6( *flav1 = (*serial); serial++; flav1++; ) + FOR6( *flav2 = (*serial); serial++; flav2++; ) + FOR6( *flav1 = (*serial); serial++; flav1++; ) + FOR6( *flav2 = (*serial); serial++; flav2++; ) + } +#else + START_MASTER(threading) + warning0("two_flavours_to_serial_PRECISION called without HAVE_TM1p1 defined\n"); + END_MASTER(threading) +#endif + +} void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { d_plus_clover_PRECISION( eta, phi, op, l, threading ); @@ -304,14 +992,105 @@ void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, oper SYNC_CORES(threading) } +void set_clover_vectorized_PRECISION( operator_PRECISION_struct *op, level_struct *l, Thread *threading ) { + +#define real_index( i, j ) ((i)/SIMD_LENGTH_PRECISION)*12*SIMD_LENGTH_PRECISION + SIMD_LENGTH_PRECISION*(j)*2 + (i)%SIMD_LENGTH_PRECISION +#define imag_index( i, j ) ((i)/SIMD_LENGTH_PRECISION)*12*SIMD_LENGTH_PRECISION + SIMD_LENGTH_PRECISION*((j)*2+1) + (i)%SIMD_LENGTH_PRECISION + + int clover_size = 42; + config_PRECISION clover_pt = op->clover; + PRECISION *clover_v_pt = op->clover_vectorized; +#ifdef HAVE_TM + config_PRECISION tm_term_pt = op->tm_term; +#endif +#ifdef HAVE_TM1p1 + PRECISION *clover_doublet_v_pt = op->clover_doublet_vectorized; +#endif + int start, end; + // ASSUMPTION: SIMD_LENGTH_PRECISION power of 2. + compute_core_start_end_custom( 0, l->num_inner_lattice_sites, &start, &end, l, threading, (SIMD_LENGTH_PRECISION<4) ? 1:(SIMD_LENGTH_PRECISION/4)); + + int index; + PRECISION sign = 0.0; + for ( int i=start*12; i 12 ) index = index % 12; + if ( index == j || index-6 == j ) { + // diagonal entry i+k,i+k + index = n*clover_size + index; + sign = 1.0; + } else if ( index < 6 ) { + // first 6-by-6 matrix + if ( j > index ) { + // upper triangle + index = n*clover_size + 12 + ( 30 - (5-index)*(6-index) )/2 + (j-(index+1)); + sign = 1.0; + } else { + // lower triangle, j < i+k + index = n*clover_size + 12 + ( 30 - (5-(j))*(6-(j)) )/2 + (index-(j+1)); + sign = -1.0; + } + } else { + // i+k >= 6 + // second 6-by-6 matrix + index = index - 6; + if ( j > index ) { + // upper triangle + index = n*clover_size + 12 + 15 + ( 30 - (5-index)*(6-index) )/2 + (j-(index+1)); + sign = 1.0; + } else { + // j < i+k-6 + // lower triangle + index = n*clover_size + 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + (index-(j+1)); + sign = -1.0; + } + } + PRECISION c_re = creal_PRECISION( clover_pt[index] ); + PRECISION c_im = sign*cimag_PRECISION( clover_pt[index] ); +#ifdef HAVE_TM + if ((i+k)%6 == j) { + // add tm_term to diagonal + c_re += creal_PRECISION( tm_term_pt[i+k] ); + c_im += cimag_PRECISION( tm_term_pt[i+k] ); + } +#endif + clover_v_pt[ real_index(i+k,j) ] = c_re; + clover_v_pt[ imag_index(i+k,j) ] = c_im; +#ifdef HAVE_TM1p1 + int d = ( (i+k)%12 < 6 ) ? 
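// --- editor's note (illustrative sketch, not part of the patch) -------------
// With csw != 0 the self coupling stores 42 complex numbers per site: the 12
// diagonal entries followed by the 15 upper-triangle entries of each of the
// two Hermitian 6x6 spin blocks; the lower triangle is recovered as the
// complex conjugate, which is the sign handling visible in
// set_clover_vectorized_PRECISION(). Packed index of entry (r,c) with c > r
// inside one block:
static int packed_upper_index( int r, int c ) {   // 0 <= r < c < 6
  return ( 30 - ( 5 - r ) * ( 6 - r ) ) / 2 + ( c - r - 1 );
}
// per-site offsets: 12 + packed_upper_index(r,c)        first 6x6 block
//                   12 + 15 + packed_upper_index(r,c)   second 6x6 block
// ----------------------------------------------------------------------------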
0:6; + clover_doublet_v_pt[ real_index(12*n+i+k+d,j) ] = c_re; + clover_doublet_v_pt[ imag_index(12*n+i+k+d,j) ] = c_im; +#ifdef HAVE_TM + if ((i+k)%6 == j) { + // change sign to tm_term on diagonal + c_re -= 2*creal_PRECISION( tm_term_pt[i+k] ); + c_im -= 2*cimag_PRECISION( tm_term_pt[i+k] ); + } +#endif + clover_doublet_v_pt[ real_index(12*n+i+k+d+6,j) ] = c_re; + clover_doublet_v_pt[ imag_index(12*n+i+k+d+6,j) ] = c_im; +#endif + } + } + } + +#undef real_index +#undef imag_index + +} + void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ) { vector_PRECISION eta_end = eta1 + l->inner_vector_size; while ( eta1 < eta_end ) { - FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ) - FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ) + FOR6( *eta1 = (*phi)*(*diag); *eta2 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); + FOR6( *eta2 = (*phi)*(*diag); *eta1 = _COMPLEX_PRECISION_ZERO; eta1++; eta2++; phi++; diag++; ); } } @@ -405,7 +1184,6 @@ void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION } } - void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ) { int i, length, index1, index2, *index_dir, *neighbor; @@ -468,7 +1246,7 @@ void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta } } } - + void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l) { int t, z, y, x, i; int *gl=l->global_lattice, sl[4]; @@ -482,12 +1260,17 @@ void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISIO for (z=0; zlocal_lattice[1]; z++) { phase[Z] = phase[T] + theta[Z]*((double)sl[Z]+z)/(double)gl[Z]; for (y=0; ylocal_lattice[2]; y++) { - phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; + phase[Y] = phase[Z] + theta[Y]*((double)sl[Y]+y)/(double)gl[Y]; for (x=0; xlocal_lattice[3]; x++) { - phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; - twisted_bc = exp(I*phase[X]); - FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ); - } + phase[X] = phase[Y] + theta[X]*((double)sl[X]+x)/(double)gl[X]; + twisted_bc = exp(I*phase[X]); +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + FOR24( *eta = (*phi)*twisted_bc; phi++; eta++; ); + } else +#endif + { FOR12( *eta = (*phi)*twisted_bc; phi++; eta++; ) } + } } } } @@ -497,20 +1280,12 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); START_LOCKED_MASTER(threading) #else START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; - - if( g.tm_mu_factor[l->next_level->depth]!=g.tm_mu_factor[l->depth] ) - tm_term_PRECISION_setup( l->next_level->op_PRECISION.tm_term, l->next_level->op_PRECISION.odd_proj, l->next_level, no_threading ); #endif 
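// --- editor's note (illustrative sketch, not part of the patch) -------------
// apply_twisted_bc_to_vector_*() multiplies every component of a site by a
// phase built from the twist angles theta[mu] and the site's global
// coordinate x[mu] on a lattice of extent gl[mu]. Per-site phase in plain C:
#include <complex.h>
#include <math.h>

static double complex twisted_phase( const double theta[4], const int x[4], const int gl[4] ) {
  double phase = 0.0;
  for ( int mu = 0; mu < 4; mu++ )
    phase += theta[mu] * (double) x[mu] / (double) gl[mu];
  return cexp( I * phase );
}
// All 12 components of the site (24 for the doublet) are scaled by this factor.
// ----------------------------------------------------------------------------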
conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) @@ -519,13 +1294,13 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_boundary_update( &(l->next_level->s_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } else { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -535,7 +1310,7 @@ void operator_updates_PRECISION( level_struct *l, struct Thread *threading ) { } -void shift_update_PRECISION( operator_PRECISION_struct *op, complex_PRECISION shift, level_struct *l, struct Thread *threading ) { +void m0_update_PRECISION( PRECISION m0, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // no hyperthreading in this function if(threading->thread != 0) @@ -543,181 +1318,308 @@ void shift_update_PRECISION( operator_PRECISION_struct *op, complex_PRECISION sh config_PRECISION clover = op->clover; - if ( clover != NULL ) { + if ( clover != NULL && op->m0 != m0 ) { int i, j; - complex_PRECISION old_shift = (complex_PRECISION) l->dirac_shift; - complex_PRECISION shift_diff = shift - old_shift; - - if ( l->depth == 0 ) { - int start = threading->start_site[l->depth]; - int n = threading->n_site[l->depth]; - clover += start*(g.csw?42:12); - for ( i=0; istart_site[l->depth]; - int n = threading->n_site[l->depth]; - int k = l->num_lattice_site_var/2; - int sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - clover += start*sc_size; - for ( i=0; i0 ) clover += j+1; - *clover += shift_diff; + complex_PRECISION m0_diff = m0 - op->m0; + + START_MASTER(threading) + op->m0 = m0; + END_MASTER(threading) + + if( m0_diff != 0 ) { + if ( l->depth == 0 ) { + int start = threading->start_site[l->depth]; + int n = threading->n_site[l->depth]; + clover += start*(g.csw?42:12); + for ( i=0; i0 ) clover += j+1; - *clover += shift_diff; + } else { + int start = threading->start_site[l->depth]; + int n = threading->n_site[l->depth]; + int k = l->num_parent_eig_vect; + int sc_size = (l->num_parent_eig_vect)*(l->num_parent_eig_vect*2+1); + clover += start*sc_size; + for ( i=0; i0 ) clover += j+1; + *clover += m0_diff; + } + clover ++; + for ( j=0; j0 ) clover += j+1; + *clover += m0_diff; + } + clover += 1 + SQUARE(k); } - clover += 1 + SQUARE(k); } } } } -void tm_term_PRECISION_setup( config_PRECISION tm_term, config_PRECISION odd_proj, level_struct *l, struct Thread *threading ) { +void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { #ifdef HAVE_TM if(threading->thread != 0) return; - complex_PRECISION shift = I*l->tm_shift; - complex_PRECISION even_shift = I*l->tm_even_shift; 
- complex_PRECISION odd_shift = I*l->tm_odd_shift; - + config_PRECISION tm_term = op->tm_term; if ( tm_term != NULL ) { + config_PRECISION odd_proj = op->odd_proj; + complex_PRECISION shift = I*mu; + complex_PRECISION even_shift = I*even; + complex_PRECISION odd_shift = I*odd; + + START_MASTER(threading) + op->mu = mu; + op->mu_even_shift = even; + op->mu_odd_shift = odd; + END_MASTER(threading) + int i, j; int start, end; compute_core_start_end(0, l->num_inner_lattice_sites, &start, &end, l, threading); int n = end-start; - complex_PRECISION tm_shift; if ( l->depth == 0 ) { + complex_PRECISION tm_shift; tm_term += start*12; odd_proj += start*12; for ( i=0; inum_lattice_site_var/2; - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - + int k, m = l->num_parent_eig_vect; + int tm_size = m*(m+1); + tm_term += start*tm_size; odd_proj += start*tm_size; - + if( cimag(even_shift) == 0. && cimag(odd_shift) == 0. ) { - - tm_shift = shift; - - for ( i=0; ithread != 0) + return; - if ( !l->idle ) { + config_PRECISION eps_term = op->epsbar_term; + if ( eps_term != NULL ) { + config_PRECISION odd_proj = op->odd_proj; + complex_PRECISION shift = -epsbar; + complex_PRECISION even_shift = I*even; + complex_PRECISION odd_shift = I*odd; - if ( mass_shift != l->dirac_shift ) { - shift_update_PRECISION( &(l->op_PRECISION), mass_shift, l, threading ); - shift_update_PRECISION( &(l->s_PRECISION.op), mass_shift, l, threading ); - START_LOCKED_MASTER(threading) - l->dirac_shift = mass_shift; - l->real_shift = creal(mass_shift); - END_LOCKED_MASTER(threading) - } - -#ifdef HAVE_TM - if ( l->tm_shift != g.tm_mu*g.tm_mu_factor[l->depth] || - l->tm_even_shift != g.tm_mu_even_shift*g.tm_mu_factor[l->depth] || - l->tm_odd_shift != g.tm_mu_odd_shift*g.tm_mu_factor[l->depth] ) { - START_LOCKED_MASTER(threading) - if( g.tm_mu_even_shift == g.tm_mu_odd_shift ) - printf0("depth: %d, updating mu to %f \n", (l->depth), cimag(g.tm_mu+g.tm_mu_even_shift)); - else - printf0("depth: %d, updating mu to %f on even sites and %f on odd sites \n", l->depth, cimag(g.tm_mu+g.tm_mu_even_shift), cimag(g.tm_mu+g.tm_mu_even_shift)); + START_MASTER(threading) + op->epsbar = epsbar; + op->epsbar_ig5_even_shift = even; + op->epsbar_ig5_odd_shift = odd; + END_MASTER(threading) + + int i, j; + int start, end; + compute_core_start_end(0, l->num_inner_lattice_sites, &start, &end, l, threading); + int n = end-start; + + if ( l->depth == 0 ) { + eps_term += start*12; + odd_proj += start*12; + + if( cimag(even_shift) == 0. && cimag(odd_shift) == 0. ) + for ( i=0; inum_parent_eig_vect; + int eps_size = m*(m+1); - l->tm_shift = g.tm_mu*g.tm_mu_factor[l->depth]; - l->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->depth]; - l->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->depth]; - END_LOCKED_MASTER(threading) - - tm_term_PRECISION_setup( l->op_PRECISION.tm_term, l->op_PRECISION.odd_proj, l, threading ); - tm_term_PRECISION_setup( l->s_PRECISION.op.tm_term, l->s_PRECISION.op.odd_proj, l, threading ); - } + eps_term += start*eps_size; + odd_proj += start*eps_size; + + if( cimag(even_shift) == 0. && cimag(odd_shift) == 0. 
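// --- editor's note (illustrative sketch, not part of the patch) -------------
// On the fine grid the twisted-mass term is a site diagonal i*mu_eff*gamma5,
// where mu_eff = mu + mu_even_shift on even sites and mu + mu_odd_shift on
// odd sites (the odd_proj vector selects the parity). The two spin halves get
// opposite signs; which half carries the plus sign follows the library's
// gamma5 convention, so the choice below is only illustrative:
#include <complex.h>

static void tm_site_diagonal( double complex term[12], double mu,
                              double even_shift, double odd_shift, int odd_site ) {
  double mu_eff = mu + ( odd_site ? odd_shift : even_shift );
  for ( int c = 0; c < 6; c++ ) {
    term[c]     = -I * mu_eff;   // first spin pair
    term[6 + c] = +I * mu_eff;   // second spin pair
  }
}
// On coarse grids the analogous term is block diagonal in the test-vector
// components and scaled by g.mu_factor[depth].
// ----------------------------------------------------------------------------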
) { + for ( i=0; i<2*n; i++ ) { + for ( j=0; jinner_vector_size; + PUBLIC_MALLOC( vd1, complex_double, 4*ivs + 2*4*ivs ); + PUBLIC_MALLOC( vpp1, complex_PRECISION, 2*2*ivs ); + + vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; + vdd1 = vd4 + ivs; vdd2 = vdd1 + 2*ivs; vdd3 = vdd2 + 2*ivs; vdd4 = vdd3 + 2*ivs; + vpp2 = vpp1 + 2*ivs; + + START_LOCKED_MASTER(threading) + + vector_double_define_random( vd1, 0, l->inner_vector_size, l, no_threading ); + vector_double_define_random( vd2, 0, l->inner_vector_size, l, no_threading ); + apply_operator_double( vd3, vd1, &(g.p), l, no_threading ); +#ifdef HAVE_TM + vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); +#endif + apply_operator_double( vd4, vd2, &(g.p), l, no_threading ); +#ifdef HAVE_TM + vector_double_real_scale( g.op_double.tm_term, g.op_double.tm_term, -1, 0, l->inner_vector_size, l ); +#endif + add_diagonal_double( vd3, vd2, g.op_double.epsbar_term, l->inner_vector_size ); + add_diagonal_double( vd4, vd1, g.op_double.epsbar_term, l->inner_vector_size ); + + two_flavours_to_serial_double( vd1, vd2, vdd1, l, no_threading ); + two_flavours_to_serial_double( vd3, vd4, vdd2, l, no_threading ); + + data_layout_n_flavours( 2, l, threading ); + + trans_PRECISION( vpp1, vdd1, op->translation_table, l, no_threading ); + apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, no_threading ); + trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, no_threading ); + + vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION: %le\n", l->depth, diff ); + END_LOCKED_MASTER(threading) + + if(threading->n_core > 1) { + trans_PRECISION( vpp1, vdd1, op->translation_table, l, threading ); + apply_operator_PRECISION( vpp2, vpp1, &(l->p_PRECISION), l, threading ); + trans_back_PRECISION( vdd3, vpp2, op->translation_table, l, threading ); + + SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) - if ( !l->idle && g.method >= 4 && l->level > 0 && g.odd_even ) - coarse_oddeven_re_setup_PRECISION( &(l->s_PRECISION.op), _REORDER, l, threading ); - else if ( !l->idle && l->level == 0 && g.odd_even) - coarse_oddeven_re_setup_PRECISION( &(l->s_PRECISION.op), _NO_REORDERING, l, threading ); - else - coarse_operator_PRECISION_set_couplings_clover( &(l->s_PRECISION.op), l, threading ); - - if(l->level > 0) - optimized_shift_update_PRECISION( mass_shift, l->next_level, threading ); - } + START_LOCKED_MASTER(threading) + vector_double_minus( vdd4, vdd3, vdd2, 0, l->inner_vector_size, l ); + diff = global_norm_double( vdd4, 0, l->inner_vector_size, l, no_threading ) / + global_norm_double( vdd3, 0, l->inner_vector_size, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of doublet Dirac operator PRECISION with threading: %le\n", l->depth, diff ); + END_LOCKED_MASTER(threading) + } + + PUBLIC_FREE( vd1, complex_double, 4*ivs + 2*4*ivs ); + PUBLIC_FREE( vpp1, complex_PRECISION, 2*2*ivs ); + + START_LOCKED_MASTER(threading) + if ( g.method >=4 && g.odd_even ) + oddeven_PRECISION_test( l ); + END_LOCKED_MASTER(threading) +#endif + } diff --git a/src/dirac_generic.h b/src/dirac_generic.h index 79bf51a..f88ae7e 100644 --- a/src/dirac_generic.h +++ b/src/dirac_generic.h @@ -24,46 +24,77 @@ struct Thread; - void gamma5_PRECISION( vector_PRECISION eta, 
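// --- editor's note (illustrative sketch, not part of the patch) -------------
// two_flavours_test_PRECISION() checks the doublet operator against two
// single-flavour applications: eta1 = D(+mu) phi1 + epsbar_term * phi2 and
// eta2 = D(-mu) phi2 + epsbar_term * phi1, serialized with
// two_flavours_to_serial and compared in norm. A one-component toy of that
// 2x2 flavour structure (all numbers are made up):
#include <complex.h>
#include <stdio.h>

int main( void ) {
  double complex d    = 3.6;           // "Wilson + clover" part
  double complex tm   = I * 0.01;      // i*mu*gamma5 part
  double complex eps  = -0.1;          // stored epsbar coupling
  double complex phi1 = 1.0 + 2.0 * I, phi2 = 0.5 - 1.0 * I;
  double complex eta1 = ( d + tm ) * phi1 + eps * phi2;
  double complex eta2 = ( d - tm ) * phi2 + eps * phi1;
  printf( "eta1 = %f%+fi\neta2 = %f%+fi\n",
          creal( eta1 ), cimag( eta1 ), creal( eta2 ), cimag( eta2 ) );
  return 0;
}
// ----------------------------------------------------------------------------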
vector_PRECISION phi, level_struct *l, struct Thread *threading ); - - void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, config_PRECISION clover, int length, - level_struct *l, struct Thread *threading ); + void two_flavours_to_serial_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + void serial_to_two_flavours_PRECISION( vector_PRECISION flav1, vector_PRECISION flav2, vector_PRECISION serial, level_struct *l, struct Thread *threading ); + + + void clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, int start, int end, level_struct *l, struct Thread *threading ); void d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void d_plus_clover_dagger_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void block_d_plus_clover_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); + void set_clover_vectorized_PRECISION( operator_PRECISION_struct *op, level_struct *l, Thread *threading ); + void diagonal_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, config_PRECISION diag, level_struct *l ); void d_plus_clover_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, schwarz_PRECISION_struct *s, level_struct *l ); void d_neighbor_aggregate_PRECISION( vector_PRECISION eta1, vector_PRECISION eta2, vector_PRECISION phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l ); void apply_twisted_bc_to_vector_PRECISION( vector_PRECISION eta, vector_PRECISION phi, double *theta, level_struct *l); void operator_updates_PRECISION( level_struct *l, struct Thread *threading ); - void shift_update_PRECISION( operator_PRECISION_struct *op, complex_PRECISION shift, level_struct *l, struct Thread *threading ); - void tm_term_PRECISION_setup( config_PRECISION tm_term, config_PRECISION odd_proj, level_struct *l, struct Thread *threading ); - void optimized_shift_update_PRECISION( complex_PRECISION mass_shift, level_struct *l, struct Thread *threading ); + void m0_update_PRECISION( PRECISION m0,operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + void tm_term_PRECISION_setup( PRECISION mu, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ); + void epsbar_term_PRECISION_setup( PRECISION epsbar, PRECISION even, PRECISION odd, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ); + void two_flavours_test_PRECISION( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); + + void gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void 
tau1_gamma5_set_even_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void tau1_gamma5_set_odd_to_zero_PRECISION( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void scale_even_odd_PRECISION( vector_PRECISION eta, vector_PRECISION phi, complex_double even, complex_double odd, + level_struct *l, struct Thread *threading ); + static inline void add_diagonal_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, - const config_PRECISION diag, const int length ) { + const config_PRECISION diag, const int length ) { + config_PRECISION diag_pt = diag; + vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; +#ifdef HAVE_TM1p1 + if(g.n_flavours == 2) + while ( eta_pt < eta_end ) { + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + diag_pt -= 6; + FOR6( *eta_pt -= (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + diag_pt -= 6; + FOR6( *eta_pt -= (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + } + else +#endif + while ( eta_pt < eta_end ) + FOR12( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + } + +#ifdef HAVE_TM1p1 + static inline void apply_doublet_coupling_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, + const config_PRECISION diag, const int length ) { config_PRECISION diag_pt = diag; vector_PRECISION phi_pt = phi, eta_pt = eta, eta_end = eta + length; while ( eta_pt < eta_end ) { - FOR12( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + phi_pt += 6; + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + phi_pt -= 12; + diag_pt -= 6; + FOR6( *eta_pt += (*phi_pt)*(*diag_pt); eta_pt++; phi_pt++; diag_pt++; ) + phi_pt += 6; } } - static inline void zero12_PRECISION( const vector_PRECISION phi ) { - phi[ 0] = _COMPLEX_PRECISION_ZERO; - phi[ 1] = _COMPLEX_PRECISION_ZERO; - phi[ 2] = _COMPLEX_PRECISION_ZERO; - phi[ 3] = _COMPLEX_PRECISION_ZERO; - phi[ 4] = _COMPLEX_PRECISION_ZERO; - phi[ 5] = _COMPLEX_PRECISION_ZERO; - phi[ 6] = _COMPLEX_PRECISION_ZERO; - phi[ 7] = _COMPLEX_PRECISION_ZERO; - phi[ 8] = _COMPLEX_PRECISION_ZERO; - phi[ 9] = _COMPLEX_PRECISION_ZERO; - phi[10] = _COMPLEX_PRECISION_ZERO; - phi[11] = _COMPLEX_PRECISION_ZERO; - } +#endif // eta = D*phi static inline void mvm_PRECISION( const vector_PRECISION eta, const complex_PRECISION *D, const vector_PRECISION phi ) { @@ -117,6 +148,96 @@ eta[2] -= conj_PRECISION(D[8])*phi[2]; } +/* + // 1 +/- gamma_mu + static inline void pr_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt, const int mu, const int sign ) { + prp_pt[0] = l_pt[0] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+0]; + prp_pt[1] = l_pt[1] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+1]; + prp_pt[2] = l_pt[2] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+2]; + prp_pt[3] = l_pt[3] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+0]; + prp_pt[4] = l_pt[4] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+1]; + prp_pt[5] = l_pt[5] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+2]; + } + + // 1 +/- gamma_mu + static inline void 
pr_doublet_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt, const int mu, const int sign ) { + prp_pt[ 0] = l_pt[ 0] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+0]; + prp_pt[ 1] = l_pt[ 1] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+1]; + prp_pt[ 2] = l_pt[ 2] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+2]; + prp_pt[ 3] = l_pt[ 3] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+0]; + prp_pt[ 4] = l_pt[ 4] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+1]; + prp_pt[ 5] = l_pt[ 5] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+2]; + prp_pt[ 6] = l_pt[ 6] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+6]; + prp_pt[ 7] = l_pt[ 7] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+7]; + prp_pt[ 8] = l_pt[ 8] + sign*gamma_val_PRECISION[mu][0]*l_pt[3*gamma_co[mu][0]+6*gamma_doublet_offset[mu][0]+8]; + prp_pt[ 9] = l_pt[ 9] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+6]; + prp_pt[10] = l_pt[10] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+7]; + prp_pt[11] = l_pt[11] + sign*gamma_val_PRECISION[mu][1]*l_pt[3*gamma_co[mu][1]+6*gamma_doublet_offset[mu][1]+8]; + } + +static inline void project_PRECISION( complex_double *pr[4], complex_double *phi, int start, int end, level_struct *l ) { + int site_var = l->num_lattice_site_var; + complex_double *phi_pt = phi+start*site_var; + complex_double *phi_end = phi+end*site_var; + complex_double *pr_pt[4] = {pr[0]+start*site_var/2, pr[1]+start*site_var/2, pr[2]+start*site_var/2, prn[3]+start*site_var/2}; + +#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + while( phi_pt < phi_end ) { + int mu = 0; + FOR4( pr_doublet_PRECISION( pr_pt[mu], phi_pt, mu, -1 ); pr_pt[mu]+=site_var/2; mu++;); + phi_pt += site_var; + } + else +#endif + while( phi_pt < phi_end ) { + int mu = 0; + FOR4( pr_PRECISION( pr_pt[mu], phi_pt, mu, -1 ); pr_pt[mu]+=site_var/2; mu++;); + phi_pt += site_var; + } + +#else + + PRECISION sign_re[4*3*SIMD_LENGTH_PRECISION]; + PRECISION sign_im[4*3*SIMD_LENGTH_PRECISION]; + int index_re[4*3*SIMD_LENGTH_PRECISION]; + int index_im[4*3*SIMD_LENGTH_PRECISION]; + + int j=0; + for ( int mu=0; mu<4; mu++ ) + for ( int i=0; i<3*SIMD_LENGTH_PRECISION; i++, j++ ) { + int spin = (i%6)/3; + sign_re[j] = creal(gamma_val_PRECISION[mu][spin])+creal(I*gamma_val_PRECISION[mu][spin]); + sign_im[j] = cimag(gamma_val_PRECISION[mu][spin])+cimag(I*gamma_val_PRECISION[mu][spin]); + index_re[j] = 6*gamma_co[mu][spin] + gamma_offset[mu][spin] + 0 + 2*(i%(site_var/2)); + index_im[j] = 6*gamma_co[mu][spin] - gamma_offset[mu][spin] + 1 + 2*(i%(site_var/2)); + } + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + } else +#endif + while( phi_pt < phi_end ) { + mm_PRECISION phi_pt1_re[3], phi_pt1_im[3]; + mm_loadi_6times_float( (PRECISION*) phi_pt+0, &(phi_pt1_re[0]), &(phi_pt1_re[1]), &(phi_pt1_re[2]), 2, 24 ); + mm_loadi_6times_float( (PRECISION*) phi_pt+1, &(phi_pt1_im[0]), &(phi_pt1_im[1]), &(phi_pt1_im[2]), 2, 24 ); + for ( int mu=0; mu<4; mu++ ) + for ( int i=0; i<3; i++ ) { + mm_PRECISION phi_pt2_re = mm_set_from_list( (PRECISION*) phi_pt+0, &(sign_re[(mu*3+i)*SIMD_LENGTH_PRECISION]), 
&(index_re[(mu*3+i)*SIMD_LENGTH_PRECISION]) ); + mm_PRECISION phi_pt2_im = mm_set_from_list( (PRECISION*) phi_pt+1, &(sign_im[(mu*3+i)*SIMD_LENGTH_PRECISION]), &(index_im[(mu*3+i)*SIMD_LENGTH_PRECISION]) ); + mm_PRECISION res_re = mm_sub_PRECISION( phi_pt1_re, phi_pt2_re ); + mm_PRECISION res_im = mm_sub_PRECISION( phi_pt1_im, phi_pt2_im ); + cstore_PRECISION( res_re, res_im, pr[mu] ); + pr[mu] += SIMD_LENGTH_PRECISION; + } + phi_pt += 4*3*SIMD_LENGTH_PRECISION; + } + + +#endif +} +*/ // 1 - gamma_T static inline void prp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { prp_pt[0] = l_pt[0] -GAMMA_T_SPIN0_VAL*l_pt[3*GAMMA_T_SPIN0_CO]; @@ -313,6 +434,370 @@ l_pt[11] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[3*GAMMA_X_SPIN3_CO+2]; } +//START +#ifdef HAVE_TM1p1 + +//#define flav_gamma(k) ((k)>1?((k)*3+6):((k)*3)) +#define flav_gamma(k) (3*(k)+6*((k)/2)) + + // 1 - gamma_T + static inline void dprp_T_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+8]; + } + + // 1 + gamma_T + static inline void dprn_T_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + prn_pt[ 0] = l_pt[ 0] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_T_SPIN0_VAL*l_pt[flav_gamma(GAMMA_T_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_T_SPIN1_VAL*l_pt[flav_gamma(GAMMA_T_SPIN1_CO)+8]; + } + + // - (1 - gamma_T) + static inline void dpbp_su3_T_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= 
prp_su3_pt[11]; + l_pt[12] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)]; + l_pt[13] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+1]; + l_pt[14] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+2]; + l_pt[15] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)]; + l_pt[16] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+1]; + l_pt[17] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+2]; + l_pt[18] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+6]; + l_pt[19] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+7]; + l_pt[20] += GAMMA_T_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+8]; + l_pt[21] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+6]; + l_pt[22] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+7]; + l_pt[23] += GAMMA_T_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+8]; + } + + // -(1 + gamma_T) + static inline void dpbn_su3_T_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)]; + l_pt[13] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)]; + l_pt[16] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_T_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN2_CO)+8]; + l_pt[21] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_T_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_T_SPIN3_CO)+8]; + } + + + // 1 - gamma_Z + static inline void dprp_Z_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+8]; + } + + // 1 + gamma_Z + static inline void dprn_Z_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + 
prn_pt[ 0] = l_pt[ 0] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_Z_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_Z_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Z_SPIN1_CO)+8]; + } + + // - (1 - gamma_Z) + static inline void dpbp_su3_Z_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= prp_su3_pt[11]; + l_pt[12] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)]; + l_pt[13] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+1]; + l_pt[14] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+2]; + l_pt[15] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)]; + l_pt[16] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+1]; + l_pt[17] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+2]; + l_pt[18] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+6]; + l_pt[19] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+7]; + l_pt[20] += GAMMA_Z_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+8]; + l_pt[21] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+6]; + l_pt[22] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+7]; + l_pt[23] += GAMMA_Z_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+8]; + } + + // -(1 + gamma_Z) + static inline void dpbn_su3_Z_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)]; + l_pt[13] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)]; + l_pt[16] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_Z_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN2_CO)+8]; + 
l_pt[21] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_Z_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Z_SPIN3_CO)+8]; + } + + + // 1 - gamma_Y + static inline void dprp_Y_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+8]; + } + + // 1 + gamma_Y + static inline void dprn_Y_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + prn_pt[ 0] = l_pt[ 0] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_Y_SPIN0_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_Y_SPIN1_VAL*l_pt[flav_gamma(GAMMA_Y_SPIN1_CO)+8]; + } + + // - (1 - gamma_Y) + static inline void dpbp_su3_Y_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= prp_su3_pt[11]; + l_pt[12] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)]; + l_pt[13] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+1]; + l_pt[14] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+2]; + l_pt[15] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)]; + l_pt[16] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+1]; + l_pt[17] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+2]; + l_pt[18] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+6]; + l_pt[19] += GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+7]; + l_pt[20] += 
GAMMA_Y_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+8]; + l_pt[21] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+6]; + l_pt[22] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+7]; + l_pt[23] += GAMMA_Y_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+8]; + } + + // -(1 + gamma_Y) + static inline void dpbn_su3_Y_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)]; + l_pt[13] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)]; + l_pt[16] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_Y_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN2_CO)+8]; + l_pt[21] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_Y_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_Y_SPIN3_CO)+8]; + } + + + // 1 - gamma_X + static inline void dprp_X_PRECISION( const vector_PRECISION prp_pt, const vector_PRECISION l_pt ) { + prp_pt[ 0] = l_pt[ 0] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; + prp_pt[ 1] = l_pt[ 1] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; + prp_pt[ 2] = l_pt[ 2] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; + prp_pt[ 3] = l_pt[ 3] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)]; + prp_pt[ 4] = l_pt[ 4] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+1]; + prp_pt[ 5] = l_pt[ 5] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+2]; + prp_pt[ 6] = l_pt[ 6] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+6]; + prp_pt[ 7] = l_pt[ 7] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+7]; + prp_pt[ 8] = l_pt[ 8] -GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+8]; + prp_pt[ 9] = l_pt[ 9] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+6]; + prp_pt[10] = l_pt[10] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+7]; + prp_pt[11] = l_pt[11] -GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+8]; + } + + // 1 + gamma_X + static inline void dprn_X_PRECISION( const vector_PRECISION prn_pt, const vector_PRECISION l_pt ) { + prn_pt[ 0] = l_pt[ 0] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)]; + prn_pt[ 1] = l_pt[ 1] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+1]; + prn_pt[ 2] = l_pt[ 2] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+2]; + prn_pt[ 3] = l_pt[ 3] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)]; + prn_pt[ 4] = l_pt[ 4] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+1]; + prn_pt[ 5] = l_pt[ 5] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+2]; + prn_pt[ 6] = l_pt[ 6] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+6]; + prn_pt[ 7] = l_pt[ 7] 
+GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+7]; + prn_pt[ 8] = l_pt[ 8] +GAMMA_X_SPIN0_VAL*l_pt[flav_gamma(GAMMA_X_SPIN0_CO)+8]; + prn_pt[ 9] = l_pt[ 9] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+6]; + prn_pt[10] = l_pt[10] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+7]; + prn_pt[11] = l_pt[11] +GAMMA_X_SPIN1_VAL*l_pt[flav_gamma(GAMMA_X_SPIN1_CO)+8]; + } + + // - (1 - gamma_X) + static inline void dpbp_su3_X_PRECISION( const vector_PRECISION prp_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prp_su3_pt[ 0]; + l_pt[ 1] -= prp_su3_pt[ 1]; + l_pt[ 2] -= prp_su3_pt[ 2]; + l_pt[ 3] -= prp_su3_pt[ 3]; + l_pt[ 4] -= prp_su3_pt[ 4]; + l_pt[ 5] -= prp_su3_pt[ 5]; + l_pt[ 6] -= prp_su3_pt[ 6]; + l_pt[ 7] -= prp_su3_pt[ 7]; + l_pt[ 8] -= prp_su3_pt[ 8]; + l_pt[ 9] -= prp_su3_pt[ 9]; + l_pt[10] -= prp_su3_pt[10]; + l_pt[11] -= prp_su3_pt[11]; + l_pt[12] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)]; + l_pt[13] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+1]; + l_pt[14] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+2]; + l_pt[15] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)]; + l_pt[16] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+1]; + l_pt[17] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+2]; + l_pt[18] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+6]; + l_pt[19] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+7]; + l_pt[20] += GAMMA_X_SPIN2_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+8]; + l_pt[21] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+6]; + l_pt[22] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+7]; + l_pt[23] += GAMMA_X_SPIN3_VAL*prp_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+8]; + } + + // -(1 + gamma_X) + static inline void dpbn_su3_X_PRECISION( const vector_PRECISION prn_su3_pt, const vector_PRECISION l_pt ) { + l_pt[ 0] -= prn_su3_pt[ 0]; + l_pt[ 1] -= prn_su3_pt[ 1]; + l_pt[ 2] -= prn_su3_pt[ 2]; + l_pt[ 3] -= prn_su3_pt[ 3]; + l_pt[ 4] -= prn_su3_pt[ 4]; + l_pt[ 5] -= prn_su3_pt[ 5]; + l_pt[ 6] -= prn_su3_pt[ 6]; + l_pt[ 7] -= prn_su3_pt[ 7]; + l_pt[ 8] -= prn_su3_pt[ 8]; + l_pt[ 9] -= prn_su3_pt[ 9]; + l_pt[10] -= prn_su3_pt[10]; + l_pt[11] -= prn_su3_pt[11]; + l_pt[12] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)]; + l_pt[13] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+1]; + l_pt[14] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+2]; + l_pt[15] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)]; + l_pt[16] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+1]; + l_pt[17] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+2]; + l_pt[18] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+6]; + l_pt[19] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+7]; + l_pt[20] -= GAMMA_X_SPIN2_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN2_CO)+8]; + l_pt[21] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+6]; + l_pt[22] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+7]; + l_pt[23] -= GAMMA_X_SPIN3_VAL*prn_su3_pt[flav_gamma(GAMMA_X_SPIN3_CO)+8]; + } + +#endif +//END + static inline void twospin_p_T_PRECISION( const vector_PRECISION out_spin0and1, const vector_PRECISION out_spin2and3, const vector_PRECISION in ) { out_spin0and1[ 0] -= in[ 0]; out_spin0and1[ 1] -= in[ 1]; @@ -637,6 +1122,158 @@ out_spin2and3[11] -= in[11]; } + static inline void doublet_site_clover_PRECISION( const vector_PRECISION eta, 
const vector_PRECISION phi, const config_PRECISION clover ) { + // diagonal + eta[ 0] = clover[ 0]*phi[ 0]; + eta[ 1] = clover[ 1]*phi[ 1]; + eta[ 2] = clover[ 2]*phi[ 2]; + eta[ 3] = clover[ 3]*phi[ 3]; + eta[ 4] = clover[ 4]*phi[ 4]; + eta[ 5] = clover[ 5]*phi[ 5]; + eta[ 6] = clover[ 0]*phi[ 6]; + eta[ 7] = clover[ 1]*phi[ 7]; + eta[ 8] = clover[ 2]*phi[ 8]; + eta[ 9] = clover[ 3]*phi[ 9]; + eta[10] = clover[ 4]*phi[10]; + eta[11] = clover[ 5]*phi[11]; + eta[12] = clover[ 6]*phi[12]; + eta[13] = clover[ 7]*phi[13]; + eta[14] = clover[ 8]*phi[14]; + eta[15] = clover[ 9]*phi[15]; + eta[16] = clover[10]*phi[16]; + eta[17] = clover[11]*phi[17]; + eta[18] = clover[ 6]*phi[18]; + eta[19] = clover[ 7]*phi[19]; + eta[20] = clover[ 8]*phi[20]; + eta[21] = clover[ 9]*phi[21]; + eta[22] = clover[10]*phi[22]; + eta[23] = clover[11]*phi[23]; + // spin 0 and 1 flav 1 + eta[0] += clover[12]*phi[1]; + eta[0] += clover[13]*phi[2]; + eta[0] += clover[14]*phi[3]; + eta[0] += clover[15]*phi[4]; + eta[0] += clover[16]*phi[5]; + eta[1] += clover[17]*phi[2]; + eta[1] += clover[18]*phi[3]; + eta[1] += clover[19]*phi[4]; + eta[1] += clover[20]*phi[5]; + eta[2] += clover[21]*phi[3]; + eta[2] += clover[22]*phi[4]; + eta[2] += clover[23]*phi[5]; + eta[3] += clover[24]*phi[4]; + eta[3] += clover[25]*phi[5]; + eta[4] += clover[26]*phi[5]; + eta[1] += conj_PRECISION(clover[12])*phi[0]; + eta[2] += conj_PRECISION(clover[13])*phi[0]; + eta[3] += conj_PRECISION(clover[14])*phi[0]; + eta[4] += conj_PRECISION(clover[15])*phi[0]; + eta[5] += conj_PRECISION(clover[16])*phi[0]; + eta[2] += conj_PRECISION(clover[17])*phi[1]; + eta[3] += conj_PRECISION(clover[18])*phi[1]; + eta[4] += conj_PRECISION(clover[19])*phi[1]; + eta[5] += conj_PRECISION(clover[20])*phi[1]; + eta[3] += conj_PRECISION(clover[21])*phi[2]; + eta[4] += conj_PRECISION(clover[22])*phi[2]; + eta[5] += conj_PRECISION(clover[23])*phi[2]; + eta[4] += conj_PRECISION(clover[24])*phi[3]; + eta[5] += conj_PRECISION(clover[25])*phi[3]; + eta[5] += conj_PRECISION(clover[26])*phi[4]; + // spin 0 and 1 flav 2 + eta[ 6] += clover[12]*phi[ 7]; + eta[ 6] += clover[13]*phi[ 8]; + eta[ 6] += clover[14]*phi[ 9]; + eta[ 6] += clover[15]*phi[10]; + eta[ 6] += clover[16]*phi[11]; + eta[ 7] += clover[17]*phi[ 8]; + eta[ 7] += clover[18]*phi[ 9]; + eta[ 7] += clover[19]*phi[10]; + eta[ 7] += clover[20]*phi[11]; + eta[ 8] += clover[21]*phi[ 9]; + eta[ 8] += clover[22]*phi[10]; + eta[ 8] += clover[23]*phi[11]; + eta[ 9] += clover[24]*phi[10]; + eta[ 9] += clover[25]*phi[11]; + eta[10] += clover[26]*phi[11]; + eta[ 7] += conj_PRECISION(clover[12])*phi[ 6]; + eta[ 8] += conj_PRECISION(clover[13])*phi[ 6]; + eta[ 9] += conj_PRECISION(clover[14])*phi[ 6]; + eta[10] += conj_PRECISION(clover[15])*phi[ 6]; + eta[11] += conj_PRECISION(clover[16])*phi[ 6]; + eta[ 8] += conj_PRECISION(clover[17])*phi[ 7]; + eta[ 9] += conj_PRECISION(clover[18])*phi[ 7]; + eta[10] += conj_PRECISION(clover[19])*phi[ 7]; + eta[11] += conj_PRECISION(clover[20])*phi[ 7]; + eta[ 9] += conj_PRECISION(clover[21])*phi[ 8]; + eta[10] += conj_PRECISION(clover[22])*phi[ 8]; + eta[11] += conj_PRECISION(clover[23])*phi[ 8]; + eta[10] += conj_PRECISION(clover[24])*phi[ 9]; + eta[11] += conj_PRECISION(clover[25])*phi[ 9]; + eta[11] += conj_PRECISION(clover[26])*phi[10]; + // spin 2 and 3 flav 1 + eta[12] += clover[28]*phi[14]; + eta[12] += clover[27]*phi[13]; + eta[12] += clover[29]*phi[15]; + eta[12] += clover[30]*phi[16]; + eta[12] += clover[31]*phi[17]; + eta[13] += clover[32]*phi[14]; + eta[13] += clover[33]*phi[15]; + 
eta[13] += clover[34]*phi[16]; + eta[13] += clover[35]*phi[17]; + eta[14] += clover[36]*phi[15]; + eta[14] += clover[37]*phi[16]; + eta[14] += clover[38]*phi[17]; + eta[15] += clover[39]*phi[16]; + eta[15] += clover[40]*phi[17]; + eta[16] += clover[41]*phi[17]; + eta[13] += conj_PRECISION(clover[27])*phi[12]; + eta[14] += conj_PRECISION(clover[28])*phi[12]; + eta[15] += conj_PRECISION(clover[29])*phi[12]; + eta[16] += conj_PRECISION(clover[30])*phi[12]; + eta[17] += conj_PRECISION(clover[31])*phi[12]; + eta[14] += conj_PRECISION(clover[32])*phi[13]; + eta[15] += conj_PRECISION(clover[33])*phi[13]; + eta[16] += conj_PRECISION(clover[34])*phi[13]; + eta[17] += conj_PRECISION(clover[35])*phi[13]; + eta[15] += conj_PRECISION(clover[36])*phi[14]; + eta[16] += conj_PRECISION(clover[37])*phi[14]; + eta[17] += conj_PRECISION(clover[38])*phi[14]; + eta[16] += conj_PRECISION(clover[39])*phi[15]; + eta[17] += conj_PRECISION(clover[40])*phi[15]; + eta[17] += conj_PRECISION(clover[41])*phi[16]; + // spin 2 and 3 flav 2 + eta[18] += clover[28]*phi[20]; + eta[18] += clover[27]*phi[19]; + eta[18] += clover[29]*phi[21]; + eta[18] += clover[30]*phi[22]; + eta[18] += clover[31]*phi[23]; + eta[19] += clover[32]*phi[20]; + eta[19] += clover[33]*phi[21]; + eta[19] += clover[34]*phi[22]; + eta[19] += clover[35]*phi[23]; + eta[20] += clover[36]*phi[21]; + eta[20] += clover[37]*phi[22]; + eta[20] += clover[38]*phi[23]; + eta[21] += clover[39]*phi[22]; + eta[21] += clover[40]*phi[23]; + eta[22] += clover[41]*phi[23]; + eta[19] += conj_PRECISION(clover[27])*phi[18]; + eta[20] += conj_PRECISION(clover[28])*phi[18]; + eta[21] += conj_PRECISION(clover[29])*phi[18]; + eta[22] += conj_PRECISION(clover[30])*phi[18]; + eta[23] += conj_PRECISION(clover[31])*phi[18]; + eta[20] += conj_PRECISION(clover[32])*phi[19]; + eta[21] += conj_PRECISION(clover[33])*phi[19]; + eta[22] += conj_PRECISION(clover[34])*phi[19]; + eta[23] += conj_PRECISION(clover[35])*phi[19]; + eta[21] += conj_PRECISION(clover[36])*phi[20]; + eta[22] += conj_PRECISION(clover[37])*phi[20]; + eta[23] += conj_PRECISION(clover[38])*phi[20]; + eta[22] += conj_PRECISION(clover[39])*phi[21]; + eta[23] += conj_PRECISION(clover[40])*phi[21]; + eta[23] += conj_PRECISION(clover[41])*phi[22]; + } + static inline void spin0and1_site_clover_PRECISION( const vector_PRECISION eta, const vector_PRECISION phi, const config_PRECISION clover ) { // diagonal eta[ 0] = clover[ 0]*phi[ 0]; @@ -808,5 +1445,37 @@ eta[11] += conj_PRECISION(clover[40])*phi[ 9]; eta[11] += conj_PRECISION(clover[41])*phi[10]; } - + + static inline void site_clover_vectorized_PRECISION( PRECISION *eta, PRECISION *phi, PRECISION *clover ) { + mm_PRECISION in_re[3][6]; + mm_PRECISION in_im[3][6]; + + mm_PRECISION clov_re; + mm_PRECISION clov_im; + + mm_PRECISION out_re; + mm_PRECISION out_im; + + for ( int i=0; i<6; i++ ) { + mm_loadi_6times_PRECISION( phi+2*i+0, &(in_re[0][i]), &(in_re[1][i]), &(in_re[2][i]), 0, 12 ); + mm_loadi_6times_PRECISION( phi+2*i+1, &(in_im[0][i]), &(in_im[1][i]), &(in_im[2][i]), 0, 12 ); + } + + for ( int n=0; n<3; n++ ) { + clov_re = mm_load_PRECISION( clover ); + clov_im = mm_load_PRECISION( clover+SIMD_LENGTH_PRECISION ); + cmul_PRECISION( clov_re, clov_im, in_re[n][0], in_im[n][0], &out_re, &out_im ); + clover+=2*SIMD_LENGTH_PRECISION; + + for ( int i=1; i<6; i++ ) { + clov_re = mm_load_PRECISION( clover ); + clov_im = mm_load_PRECISION( clover+SIMD_LENGTH_PRECISION ); + cfmadd_PRECISION( clov_re, clov_im, in_re[n][i], in_im[n][i], &out_re, &out_im ); + 
clover+=2*SIMD_LENGTH_PRECISION; + } + + cstore_PRECISION( eta, out_re, out_im ); + eta+=2*SIMD_LENGTH_PRECISION; + } + } #endif diff --git a/src/gathering_generic.c b/src/gathering_generic.c index e5e4a23..47996d5 100644 --- a/src/gathering_generic.c +++ b/src/gathering_generic.c @@ -46,10 +46,14 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l // define data merging // define data gathering permutation int i, mu, current_rank, offset, offset_sum, - process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; + process_coords[4] = {0,0,0,0}, parent_coords[4] = {0,0,0,0}, *process_list = NULL; MALLOC( process_list, int, l->num_processes ); +#ifdef HAVE_TM1p1 + MALLOC( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); +#else MALLOC( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); - +#endif + l->idle = 0; i = 0; for ( process_coords[T]=0; process_coords[T]parent_rank) ); + g.Cart_rank( g.comm_cart, parent_coords, &(l->parent_rank) ); // find out if current process is supposed to idle if ( offset_sum > 0 ) l->idle = 1; @@ -91,7 +95,11 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l MALLOC( gs->gather_list, int, gs->gather_list_length ); MALLOC( gs->permutation, int, l->num_inner_lattice_sites ); MALLOC( gs->reqs, MPI_Request, gs->gather_list_length ); +#ifdef HAVE_TM1p1 + MALLOC( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); +#else MALLOC( gs->buffer, complex_PRECISION, l->inner_vector_size ); +#endif MALLOC( field1, int, l->num_inner_lattice_sites ); MALLOC( field2, int, l->num_inner_lattice_sites ); @@ -112,7 +120,7 @@ void gathering_PRECISION_setup( gathering_PRECISION_struct *gs, level_struct *l for ( mu=0; mu<4; mu++ ) process_coords[mu] = g.my_coords[mu] + *(count[mu]) * (l->comm_offset[mu]/merge[mu]); - g.Cart_rank( g.comm_cart, process_coords, gs->gather_list + j ); + g.Cart_rank( g.comm_cart, process_coords, gs->gather_list + j ); j++; @@ -204,32 +212,56 @@ void gathering_PRECISION_free( gathering_PRECISION_struct *gs, level_struct *l ) FREE( gs->gather_list, int, gs->gather_list_length ); FREE( gs->permutation, int, l->num_inner_lattice_sites ); FREE( gs->reqs, MPI_Request, gs->gather_list_length ); +#ifdef HAVE_TM1p1 + FREE( gs->buffer, complex_PRECISION, 2*l->inner_vector_size ); +#else FREE( gs->buffer, complex_PRECISION, l->inner_vector_size ); +#endif MPI_Comm_free( &(gs->level_comm) ); MPI_Group_free( &(gs->level_comm_group) ); } +#ifdef HAVE_TM1p1 + FREE( gs->transfer_buffer, complex_PRECISION, 2 * gs->dist_inner_lattice_sites * l->num_lattice_site_var ); +#else FREE( gs->transfer_buffer, complex_PRECISION, gs->dist_inner_lattice_sites * l->num_lattice_site_var ); +#endif } void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_struct *in, level_struct *l ) { int send_size_hopp = l->gs_PRECISION.dist_inner_lattice_sites * 4 * SQUARE( l->num_lattice_site_var ), - send_size_clov = l->gs_PRECISION.dist_inner_lattice_sites * ( (l->num_lattice_site_var*(l->num_lattice_site_var+1))/2 ); + send_size_clov = l->gs_PRECISION.dist_inner_lattice_sites * ( (l->num_lattice_site_var*(l->num_lattice_site_var+1))/2 ), + send_size_block = l->gs_PRECISION.dist_inner_lattice_sites * ( (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)) ); #ifdef HAVE_TM - int send_size_block = l->gs_PRECISION.dist_inner_lattice_sites * ( 
(l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)) ); + out->mu = in->mu; + out->mu_even_shift = in->mu_even_shift; + out->mu_odd_shift = in->mu_odd_shift; #endif - + out->m0 = in->m0; +#ifdef HAVE_TM1p1 + out->epsbar = in->epsbar; + out->epsbar_ig5_even_shift = in->epsbar_ig5_even_shift; + out->epsbar_ig5_odd_shift = in->epsbar_ig5_odd_shift; +#endif + if ( g.my_rank != l->parent_rank ) { - MPI_Request req; + MPI_Request req, odd_req; +#ifdef HAVE_TM1p1 + MPI_Request eps_req; + MPI_Isend( in->epsbar_term, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 4, g.comm_cart, &eps_req ); +#endif #ifdef HAVE_TM - MPI_Request tm_req, odd_req; - MPI_Isend( in->tm_term, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 2, g.comm_cart, &tm_req ); - MPI_Isend( in->odd_proj, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 3, g.comm_cart, &odd_req ); + MPI_Request tm_req; + MPI_Isend( in->tm_term, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 3, g.comm_cart, &tm_req ); #endif + MPI_Isend( in->odd_proj, send_size_block, MPI_COMPLEX_PRECISION, l->parent_rank, 2, g.comm_cart, &odd_req ); MPI_Isend( in->D, send_size_hopp, MPI_COMPLEX_PRECISION, l->parent_rank, 0, g.comm_cart, &req ); MPI_Send( in->clover, send_size_clov, MPI_COMPLEX_PRECISION, l->parent_rank, 1, g.comm_cart ); +#ifdef HAVE_TM1p1 + MPI_Wait( &eps_req, MPI_STATUS_IGNORE ); +#endif #ifdef HAVE_TM MPI_Wait( &tm_req, MPI_STATUS_IGNORE ); MPI_Wait( &odd_req, MPI_STATUS_IGNORE ); @@ -238,43 +270,54 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s } else { int i, j, n=l->gs_PRECISION.gather_list_length, s=l->num_inner_lattice_sites, t, *pi = l->gs_PRECISION.permutation; - vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL; - MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL; + vector_PRECISION buffer_hopp = NULL, buffer_clov = NULL, buffer_odd_proj = NULL; + MPI_Request *hopp_reqs = NULL, *clov_reqs = NULL, *odd_proj_reqs = NULL; +#ifdef HAVE_TM1p1 + vector_PRECISION buffer_eps_term = NULL; + MPI_Request *eps_term_reqs = NULL; + MALLOC( buffer_eps_term, complex_PRECISION, n*send_size_block ); + MALLOC( eps_term_reqs, MPI_Request, n ); +#endif #ifdef HAVE_TM - vector_PRECISION buffer_tm_term = NULL, buffer_odd_proj = NULL; - MPI_Request *tm_term_reqs = NULL, *odd_proj_reqs = NULL; + vector_PRECISION buffer_tm_term = NULL; + MPI_Request *tm_term_reqs = NULL; MALLOC( buffer_tm_term, complex_PRECISION, n*send_size_block ); - MALLOC( buffer_odd_proj, complex_PRECISION, n*send_size_block ); MALLOC( tm_term_reqs, MPI_Request, n ); - MALLOC( odd_proj_reqs, MPI_Request, n ); #endif MALLOC( buffer_hopp, complex_PRECISION, n*send_size_hopp ); MALLOC( buffer_clov, complex_PRECISION, n*send_size_clov ); + MALLOC( buffer_odd_proj, complex_PRECISION, n*send_size_block ); MALLOC( hopp_reqs, MPI_Request, n ); MALLOC( clov_reqs, MPI_Request, n ); + MALLOC( odd_proj_reqs, MPI_Request, n ); PROF_PRECISION_START( _GD_COMM ); for ( i=1; igs_PRECISION.gather_list[i], 4, g.comm_cart, &(eps_term_reqs[i]) ); +#endif #ifdef HAVE_TM MPI_Irecv( buffer_tm_term+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, - l->gs_PRECISION.gather_list[i], 2, g.comm_cart, &(tm_term_reqs[i]) ); - MPI_Irecv( buffer_odd_proj+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, - l->gs_PRECISION.gather_list[i], 3, g.comm_cart, &(odd_proj_reqs[i]) ); + l->gs_PRECISION.gather_list[i], 3, g.comm_cart, &(tm_term_reqs[i]) ); #endif MPI_Irecv( buffer_hopp+i*send_size_hopp, send_size_hopp, MPI_COMPLEX_PRECISION, 
l->gs_PRECISION.gather_list[i], 0, g.comm_cart, &(hopp_reqs[i]) ); MPI_Irecv( buffer_clov+i*send_size_clov, send_size_clov, MPI_COMPLEX_PRECISION, l->gs_PRECISION.gather_list[i], 1, g.comm_cart, &(clov_reqs[i]) ); + MPI_Irecv( buffer_odd_proj+i*send_size_block, send_size_block, MPI_COMPLEX_PRECISION, + l->gs_PRECISION.gather_list[i], 2, g.comm_cart, &(odd_proj_reqs[i]) ); } PROF_PRECISION_STOP( _GD_COMM, 2*n-2 ); - -#ifdef HAVE_TM + +#ifdef HAVE_TM1p1 for ( i=0; itm_term[i]; - + buffer_eps_term[i] = in->epsbar_term[i]; +#endif +#ifdef HAVE_TM for ( i=0; iodd_proj[i]; + buffer_tm_term[i] = in->tm_term[i]; #endif for ( i=0; iclover[i]; -#ifdef HAVE_TM + for ( i=0; iodd_proj[i]; + +#ifdef HAVE_TM1p1 PROF_PRECISION_START( _GD_IDLE ); for ( i=1; itm_term[ t*pi[i] + j ] = buffer_tm_term[ t*i + j ]; - + out->epsbar_term[ t*pi[i] + j ] = buffer_eps_term[ t*i + j ]; +#endif +#ifdef HAVE_TM PROF_PRECISION_START( _GD_IDLE ); for ( i=1; iodd_proj[ t*pi[i] + j ] = buffer_odd_proj[ t*i + j ]; + out->tm_term[ t*pi[i] + j ] = buffer_tm_term[ t*i + j ]; #endif PROF_PRECISION_START( _GD_IDLE ); @@ -324,20 +371,34 @@ void conf_PRECISION_gather( operator_PRECISION_struct *out, operator_PRECISION_s for ( i=0; iclover[ t*pi[i] + j ] = buffer_clov[ t*i + j ]; + + PROF_PRECISION_START( _GD_IDLE ); + for ( i=1; iodd_proj[ t*pi[i] + j ] = buffer_odd_proj[ t*i + j ]; FREE( buffer_hopp, complex_PRECISION, n*send_size_hopp ); FREE( buffer_clov, complex_PRECISION, n*send_size_clov ); + FREE( buffer_odd_proj, complex_PRECISION, n*send_size_block ); FREE( hopp_reqs, MPI_Request, n ); FREE( clov_reqs, MPI_Request, n ); + FREE( odd_proj_reqs, MPI_Request, n ); #ifdef HAVE_TM FREE( buffer_tm_term, complex_PRECISION, n*send_size_block ); - FREE( buffer_odd_proj, complex_PRECISION, n*send_size_block ); FREE( tm_term_reqs, MPI_Request, n ); - FREE( odd_proj_reqs, MPI_Request, n ); #endif +#ifdef HAVE_TM1p1 + FREE( buffer_eps_term, complex_PRECISION, n*send_size_block ); + FREE( eps_term_reqs, MPI_Request, n ); +#endif + } l->dummy_p_PRECISION.op = out; - l->dummy_p_PRECISION.shift = 0; l->dummy_p_PRECISION.v_start = 0; l->dummy_p_PRECISION.v_end = l->inner_vector_size; diff --git a/src/ghost_generic.c b/src/ghost_generic.c index 8db1cfc..5a423a0 100644 --- a/src/ghost_generic.c +++ b/src/ghost_generic.c @@ -26,7 +26,7 @@ void negative_sendrecv_PRECISION( vector_PRECISION phi, const int mu, comm_PRECI if( l->global_splitting[mu] > 1 ) { int i, j, num_boundary_sites = c->num_boundary_sites[2*mu+1], boundary_start, - *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; + *boundary_table = c->boundary_table[2*mu+1], n = l->num_lattice_site_var; vector_PRECISION buffer, tmp_pt, buffer_pt; @@ -102,6 +102,10 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str if ( g.method < 5 ) factor = 2; } + +#ifdef HAVE_TM1p1 + factor *= 2; +#endif if ( buffer_size <= 0 ) { c->comm_start[0] = c->offset*l->num_inner_lattice_sites; @@ -128,13 +132,22 @@ void ghost_alloc_PRECISION( int buffer_size, comm_PRECISION_struct *c, level_str } else { for ( mu=0; mu<4; mu++ ) { c->max_length[mu] = buffer_size; +#ifdef HAVE_TM1p1 + MALLOC( c->buffer[2*mu], complex_PRECISION, 2*buffer_size ); + MALLOC( c->buffer[2*mu+1], complex_PRECISION, 2*buffer_size ); +#else MALLOC( c->buffer[2*mu], complex_PRECISION, buffer_size ); MALLOC( c->buffer[2*mu+1], complex_PRECISION, buffer_size ); +#endif } } if ( l->vbuf_PRECISION[8] == NULL ) { +#ifdef HAVE_TM1p1 + MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, 
2*l->vector_size ); +#else MALLOC( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); +#endif } } @@ -149,7 +162,11 @@ void ghost_free_PRECISION( comm_PRECISION_struct *c, level_struct *l ) { } if ( l->vbuf_PRECISION[8] != NULL ) { +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[8], complex_PRECISION, 2*l->vector_size ); +#else FREE( l->vbuf_PRECISION[8], complex_PRECISION, l->vector_size ); +#endif } } @@ -194,6 +211,15 @@ void ghost_sendrecv_PRECISION( vector_PRECISION phi, const int mu, const int dir table_start = c->num_even_boundary_sites[mu_dir]; } +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + length[0] *= 2; + length[1] *= 2; + comm_start *= 2; + offset *= 2; + } +#endif + ASSERT( c->in_use[mu_dir] == 0 ); c->in_use[mu_dir] = 1; @@ -270,6 +296,11 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, int mu_dir = 2*mu-MIN(dir,0); int i, j, *table, offset = c->offset, length[2]={0,0}, table_start = 0; vector_PRECISION buffer, phi_pt; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) + offset *= 2; +#endif if ( amount == _FULL_SYSTEM ) { length[0] = (c->num_boundary_sites[2*mu])*offset; @@ -284,7 +315,7 @@ void ghost_wait_PRECISION( vector_PRECISION phi, const int mu, const int dir, length[1] = c->num_odd_boundary_sites[2*mu+1]*offset; table_start = c->num_even_boundary_sites[mu_dir]; } - + ASSERT( c->in_use[mu_dir] == 1 ); if ( dir == 1 ) { diff --git a/src/gram_schmidt_generic.c b/src/gram_schmidt_generic.c new file mode 100644 index 0000000..71eb26c --- /dev/null +++ b/src/gram_schmidt_generic.c @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
+ * + */ + +#include "main.h" + +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION +void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vect, level_struct *l, struct Thread *threading ) +#else +void gram_schmidt_on_aggregates_PRECISION( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) +#endif +{ + + PROF_PRECISION_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); + +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION + SYNC_CORES(threading) + SYNC_HYPERTHREADS(threading) + int i, j, k, k1, k2, num_aggregates = l->s_PRECISION.num_aggregates, + aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; + + complex_PRECISION alpha1, alpha2; + vector_PRECISION v_pt1, v_pt2; + PRECISION norm1, norm2; + + for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { + for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate + for ( i=0; idepth == 0) + block_gram_schmidt_PRECISION( V, num_vec, l, threading ); + else + aggregate_gram_schmidt_PRECISION( V, num_vec, l, threading ); +#endif + + PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); +} + + +void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int begin, const int n, level_struct *l, struct Thread *threading ) { + + // NOTE: only thread safe, if "buffer" is the same buffer for all threads belonging to a common MPI process + START_MASTER(threading) + PROF_PRECISION_START( _LA ); + END_MASTER(threading) + SYNC_CORES(threading) + + PRECISION beta; + int i, j, start, end; + + compute_core_start_end_custom( 0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); + + for ( i=begin; iinner_vector_size, l, threading ); + SYNC_CORES(threading) + START_MASTER(threading) + for ( j=0; j0 ) { + START_MASTER(threading) + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( buffer, buffer+n, i, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + END_MASTER(threading) + SYNC_MASTER_TO_ALL(threading) + } + + for( j=0; jinner_vector_size, l, threading ); + SYNC_CORES(threading) + } + + SYNC_CORES(threading) + + beta = global_norm_PRECISION( V[i], 0, l->inner_vector_size, l, threading ); + SYNC_MASTER_TO_ALL(threading) + vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); + SYNC_CORES(threading) + } + + START_MASTER(threading) + PROF_PRECISION_STOP( _LA, 1 ); + END_MASTER(threading) + SYNC_CORES(threading) +} diff --git a/src/gram_schmidt_generic.h b/src/gram_schmidt_generic.h new file mode 100644 index 0000000..1500157 --- /dev/null +++ b/src/gram_schmidt_generic.h @@ -0,0 +1,521 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef GRAM_SCHMIDT_PRECISION_HEADER +#define GRAM_SCHMIDT_PRECISION_HEADER + +// Gram-Schmidt on full vectors +void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, + level_struct *l, struct Thread *threading ); +// Gram-Schmidt on aggregates +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION +void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, + level_struct *l, struct Thread *threading ); +#else // optimized version on the operator layout +void gram_schmidt_on_aggregates_PRECISION( complex_PRECISION *operator, const int num_vec, + level_struct *l, struct Thread *threading ); +#endif + +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION + +// SIMD version of gram_schmidt_on_aggregates optimized on a operator layout +// block_gram_schmidt_PRECISION follows, after definition of others inline void functions marked by "used by *IT*" +static inline void block_gram_schmidt_PRECISION( complex_PRECISION *V, int num_vec, level_struct *l, + struct Thread *threading ); + +static inline void aggregate_gram_schmidt_PRECISION( complex_PRECISION *V, const int num_vec, + level_struct *l, struct Thread *threading ) { + + SYNC_CORES(threading) + SYNC_HYPERTHREADS(threading) + int i, j, k, k1, k2, k3, num_aggregates = l->s_PRECISION.num_aggregates, + aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; + + PRECISION *v_pt1; + PRECISION *v_pt2; + PRECISION norm1, norm2; + PRECISION next_norm1; + PRECISION next_norm2; + int ldv = SIMD_LENGTH_PRECISION; + int V_block_offset = 2*l->vector_size; + + for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { + + v_pt1 = (PRECISION *)V + 0 + j*aggregate_size*2*ldv; + + next_norm1 = 0.0; + next_norm2 = 0.0; + for ( i=0; is_PRECISION.num_aggregates, + aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; + + PRECISION *v_pt1; + PRECISION *v_pt2; + PRECISION norm; + PRECISION next_norm; + int ldv = leading_dimension; + //offset = 6; + + + // current thread chooses an aggregate + for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { + j = jp/2; + int component = jp%2; + + + v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; + + next_norm = 0.0; + + // for the whole aggregate + for ( i=0; is_PRECISION.num_aggregates; + int aggregate_size = l->inner_vector_size / num_aggregates; + int offset = l->num_lattice_site_var/2; + + for ( int jp=threading->core; jpn_core ) { + int j = jp/2; + int component = jp%2; + // factors 2 are for complex and spin01/23 aggregates + Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; + Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; + mm_PRECISION U_re; + mm_PRECISION U_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION S_re[SIMD_LENGTH_PRECISION]; + mm_PRECISION S_im[SIMD_LENGTH_PRECISION]; + for( int i=0; is_PRECISION.num_aggregates; + int aggregate_size = l->inner_vector_size / num_aggregates; + int offset = l->num_lattice_site_var/2; + + for ( int jp=threading->core; jpn_core ) { + int j = jp/2; + int component = jp%2; + // factors 2 are for complex and spin01/23 aggregates + Up = U + 2*component*offset*leading_dimension + 
2*2*j*aggregate_size*leading_dimension; + Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; + mm_PRECISION U_re; + mm_PRECISION U_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION S_re[SIMD_LENGTH_PRECISION]; + mm_PRECISION S_im[SIMD_LENGTH_PRECISION]; + for( int i=0; is_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); + ((PRECISION **)threading->workspace)[0] = S; + END_LOCKED_MASTER(threading) + S = ((PRECISION **)threading->workspace)[0]; + + aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); + aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); + + START_LOCKED_MASTER(threading) + FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); + END_LOCKED_MASTER(threading) + + END_NO_HYPERTHREADS(threading) +} + +static inline void block_gram_schmidt_PRECISION( complex_PRECISION *V, int num_vec, level_struct *l, + struct Thread *threading ) { + SYNC_CORES(threading); + for ( int i=0; ivector_size), + (PRECISION *)(V + j*l->vector_size), vecs, + l, threading ); + aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); + } + SYNC_CORES(threading); +} + +#endif //OPTIMIZED_INTERPOLATION_SETUP_PRECISION +#endif diff --git a/src/init.c b/src/init.c index e40d771..6295390 100644 --- a/src/init.c +++ b/src/init.c @@ -44,19 +44,13 @@ void next_level_setup( vector_double *V, level_struct *l, struct Thread *threadi // define next level parameters l->next_level->level = l->level-1; l->next_level->depth = l->depth+1; - l->next_level->real_shift = l->real_shift; - l->next_level->dirac_shift = l->dirac_shift; -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; -#endif l->next_level->tol = l->tol; l->next_level->post_smooth_iter = g.post_smooth_iter[l->depth+1]; l->next_level->relax_fac = g.relax_fac[l->depth+1]; l->next_level->block_iter = g.block_iter[l->depth+1]; l->next_level->setup_iter = g.setup_iter[l->depth+1]; l->next_level->num_eig_vect = l->level==1?l->num_eig_vect:g.num_eig_vect[l->depth+1]; + l->next_level->num_parent_eig_vect = l->num_eig_vect; l->next_level->num_lattice_site_var = 2 * l->num_eig_vect; l->next_level->n_cy = g.ncycle[l->depth+1]; l->next_level->global_lattice = g.global_lattice[l->depth+1]; @@ -118,7 +112,9 @@ void next_level_setup( vector_double *V, level_struct *l, struct Thread *threadi } } + START_LOCKED_MASTER(threading) if ( l->depth == 0 ) printf0("\ninitial coarse grid correction is defined\n"); + END_LOCKED_MASTER(threading) } @@ -142,8 +138,8 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) double t0=0, t1=0; START_LOCKED_MASTER(threading) + g.in_setup = 1; if ( g.vt.evaluation ) { - l->dirac_shift = l->real_shift; l->level = g.num_levels-1; } @@ -160,9 +156,14 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) g.tol, _RIGHT, vcycle_float, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) +#ifdef HAVE_TM1p1 + MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); +#else 
MALLOC( g.p.b, complex_double, l->inner_vector_size ); MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif +#endif #ifdef INIT_ONE_PREC } else { #endif @@ -181,9 +182,14 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) g.tol, _NOTHING, NULL, &(g.p_MP), l ); g.p.op = &(g.op_double); #if defined(INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) +#ifdef HAVE_TM1p1 + MALLOC( g.p.b, complex_double, 2*l->inner_vector_size ); + MALLOC( g.p.x, complex_double, 2*l->inner_vector_size ); +#else MALLOC( g.p.b, complex_double, l->inner_vector_size ); MALLOC( g.p.x, complex_double, l->inner_vector_size ); #endif +#endif #ifdef INIT_ONE_PREC } else { #endif @@ -246,19 +252,26 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) } if ( g.method >=0 ) printf0("| restart length: %-3d |\n", g.restart ); - printf0("| m0: %+9.6lf |\n", creal(l->dirac_shift) ); - if(g.setup_m0!=l->dirac_shift) + printf0("| m0: %+9.6lf |\n", g.m0 ); + if(g.setup_m0!=g.m0) printf0("| setup m0: %+9.6lf |\n", g.setup_m0 ); printf0("| csw: %+9.6lf |\n", g.csw ); #ifdef HAVE_TM - printf0("| mu: %+9.6lf |\n", g.tm_mu); - if(g.setup_tm_mu!=g.tm_mu) - printf0("| setup mu: %+9.6lf |\n", g.setup_tm_mu ); - if(g.tm_mu_odd_shift!=0.) - printf0("| mu on odd sites: %+9.6lf |\n", g.tm_mu + g.tm_mu_odd_shift ); - if(g.tm_mu_even_shift!=0.) - printf0("| mu on even sites: %+9.6lf |\n", g.tm_mu + g.tm_mu_even_shift ); - + printf0("| mu: %+9.6lf |\n", g.mu); + if(g.setup_mu!=g.mu) + printf0("| setup mu: %+9.6lf |\n", g.setup_mu ); + if(g.mu_odd_shift!=0.) + printf0("| mu on odd sites: %+9.6lf |\n", g.mu + g.mu_odd_shift ); + if(g.mu_even_shift!=0.) + printf0("| mu on even sites: %+9.6lf |\n", g.mu + g.mu_even_shift ); +#endif +#ifdef HAVE_TM1p1 + if(g.epsbar) + printf0("| epsbar: %+9.6lf |\n", g.epsbar); + if(g.epsbar_ig5_odd_shift!=0.) + printf0("| ig5 epsbar odd sites: %+9.6lf |\n", g.epsbar_ig5_odd_shift ); + if(g.epsbar_ig5_even_shift!=0.) + printf0("| ig5 epsbar even sites: %+9.6lf |\n", g.epsbar_ig5_even_shift ); #endif if ( g.method > 0 ) { printf0("+----------------------------------------------------------+\n"); @@ -285,13 +298,20 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) printf0("| tolerance: %-5.0le |\n", g.coarse_tol ); } #ifdef HAVE_TM - if( g.tm_mu!=0. && g.tm_mu_factor[i]!=1 ) - printf0("| mu: %+9.6lf |\n", g.tm_mu * g.tm_mu_factor[i]); - if( g.tm_mu_odd_shift!=0. && g.tm_mu_factor[i]!=1 ) - printf0("| mu on odd sites: %+9.6lf |\n", (g.tm_mu + g.tm_mu_odd_shift) * g.tm_mu_factor[i] ); - if( g.tm_mu_even_shift!=0. && g.tm_mu_factor[i]!=1 ) - printf0("| mu on even sites: %+9.6lf |\n", (g.tm_mu + g.tm_mu_even_shift) * g.tm_mu_factor[i] ); - + if( g.mu!=0. && g.mu_factor[i]!=1 ) + printf0("| mu: %+9.6lf |\n", g.mu * g.mu_factor[i] ); + if( g.mu_odd_shift!=0. && g.mu_factor[i]!=1 ) + printf0("| mu on odd sites: %+9.6lf |\n", (g.mu + g.mu_odd_shift) * g.mu_factor[i] ); + if( g.mu_even_shift!=0. && g.mu_factor[i]!=1 ) + printf0("| mu on even sites: %+9.6lf |\n", (g.mu + g.mu_even_shift) * g.mu_factor[i] ); +#endif +#ifdef HAVE_TM1p1 + if( g.epsbar!=0. && g.epsbar_factor[i]!=1 ) + printf0("| epsbar: %+9.6lf |\n", g.epsbar * g.epsbar_factor[i] ); + if(g.epsbar_ig5_odd_shift!=0. && g.epsbar_factor[i]!=1) + printf0("| ig5 epsbar on odd sites: %+9.6lf |\n", (g.epsbar + g.epsbar_ig5_odd_shift) * g.epsbar_factor[i] ); + if(g.epsbar_ig5_even_shift!=0. 
&& g.epsbar_factor[i]!=1) + printf0("| ig5 epsbar on even sites: %+9.6lf |\n", (g.epsbar + g.epsbar_ig5_even_shift) *g.epsbar_factor[i] ); #endif } } @@ -305,15 +325,17 @@ void method_setup( vector_double *V, level_struct *l, struct Thread *threading ) printf0("\n"); } #endif + g.in_setup = 0; END_LOCKED_MASTER(threading) + + START_LOCKED_MASTER(threading) + if ( l->depth==0 && g.method >=0 ) + prof_print( l ); + END_LOCKED_MASTER(threading) #ifdef DEBUG test_routine( l, threading ); #endif - START_LOCKED_MASTER(threading) - if ( l->depth==0 ) - prof_print( l ); - END_LOCKED_MASTER(threading) } @@ -341,9 +363,14 @@ void method_free( level_struct *l ) { #endif fgmres_MP_struct_free( &(g.p_MP) ); #if defined (INIT_ONE_PREC) && (defined (DEBUG) || defined (TEST_VECTOR_ANALYSIS)) +#ifdef HAVE_TM1p1 + FREE( g.p.b, complex_double, 2*l->inner_vector_size ); + FREE( g.p.x, complex_double, 2*l->inner_vector_size ); +#else FREE( g.p.b, complex_double, l->inner_vector_size ); FREE( g.p.x, complex_double, l->inner_vector_size ); #endif +#endif #ifdef INIT_ONE_PREC } else { #endif @@ -368,28 +395,28 @@ void method_update( int setup_iter, level_struct *l, struct Thread *threading ) if ( g.method > 0 && g.interpolation && g.num_levels > 1 && setup_iter > 0 ) { - double t0=0, t1=0, shift = creal(l->dirac_shift); + double t0=0, t1=0; START_LOCKED_MASTER(threading) g.in_setup = 1; - if ( l->depth==0 ) - prof_init( l ); + if ( l->depth==0 ) + prof_init( l ); END_LOCKED_MASTER(threading) - START_MASTER(threading) - t0 = MPI_Wtime(); - END_MASTER(threading) + MASTER(threading) + t0 = MPI_Wtime(); -#ifndef HAVE_TM - if ( g.setup_m0 != shift ) - optimized_shift_update( (complex_double)g.setup_m0, l, threading ); -#else - double tm_shift = g.tm_mu; - if ( g.setup_tm_mu != tm_shift || g.setup_m0 != shift ) { - g.tm_mu = g.setup_tm_mu; - optimized_shift_update( (complex_double)g.setup_m0, l, threading ); + if ( g.setup_m0 != g.m0 ) { + m0_update( (complex_double)g.setup_m0, l, threading ); +#ifdef HAVE_TM } + if ( g.setup_mu != g.mu ) { + tm_term_update( (complex_double)g.setup_mu, l, threading ); + finalize_operator_update( l, threading ); + } else if (g.setup_m0 != g.m0) { #endif + finalize_operator_update( l, threading ); + } if ( g.mixed_precision ) iterative_float_setup( setup_iter, l, threading ); @@ -397,34 +424,35 @@ void method_update( int setup_iter, level_struct *l, struct Thread *threading ) iterative_double_setup( setup_iter, l, threading ); -#ifndef HAVE_TM - if ( g.setup_m0 != shift ) - optimized_shift_update( (complex_double)shift, l, threading ); -#else - if ( g.setup_tm_mu != tm_shift || g.setup_m0 != shift ) { - g.tm_mu = tm_shift; - optimized_shift_update( (complex_double) shift, l, threading ); + if ( g.setup_m0 != g.m0 ) { + m0_update( (complex_double)g.m0, l, threading ); +#ifdef HAVE_TM } + if ( g.setup_mu != g.mu ) { + tm_term_update( (complex_double)g.mu, l, threading ); + finalize_operator_update( l, threading ); + } else if (g.setup_m0 != g.m0) { #endif + finalize_operator_update( l, threading ); + } - START_MASTER(threading) - t1 = MPI_Wtime(); - g.total_time = t1-t0; - printf0("\nperformed %d iterative setup steps\n", setup_iter ); - printf0("elapsed time: %lf seconds (%lf seconds on coarse grid)\n\n", t1-t0, g.coarse_time ); - END_MASTER(threading) + MASTER(threading) { + t1 = MPI_Wtime(); + g.total_time = t1-t0; + printf0("\nperformed %d iterative setup steps\n", setup_iter ); + printf0("elapsed time: %lf seconds (%lf seconds on coarse grid)\n\n", t1-t0, g.coarse_time ); + } -#ifdef 
DEBUG - test_routine( l, threading ); -#endif - START_LOCKED_MASTER(threading) g.in_setup = 0; if ( l->depth==0 ) prof_print( l ); END_LOCKED_MASTER(threading) - - + +#ifdef DEBUG + test_routine( l, threading ); +#endif + } } @@ -487,7 +515,10 @@ void method_finalize( level_struct *l ) { FREE( g.ncycle, int, ls ); FREE( g.relax_fac, double, ls ); #ifdef HAVE_TM - FREE( g.tm_mu_factor, double, ls ); + FREE( g.mu_factor, double, ls ); +#endif +#ifdef HAVE_TM1p1 + FREE( g.epsbar_factor, double, ls ); #endif FREE( g.block_iter, int, ls ); FREE( g.setup_iter, int, ls ); @@ -529,7 +560,7 @@ int read_parameter( void **save_at, char *search_pattern, char *read_format, int if ( read_from == NULL ) { if ( !set_default ) - error0("unable to find string \"%s\" --- fatal error\n", search_pattern); + error0("FILE NULL, unable to find string \"%s\" --- fatal error\n", search_pattern); else return match; } @@ -539,6 +570,7 @@ int read_parameter( void **save_at, char *search_pattern, char *read_format, int while ( !match && fgets( read_pattern, 100000, read_from ) ) { k = strlen( read_pattern ); + /* j = 0; for ( i=0; in) { + match = 1; + i = 0; + while ( ilevel = g.num_levels-1-l->depth; - l->post_smooth_iter = g.post_smooth_iter[l->depth]; - l->block_iter = g.block_iter[l->depth]; - l->setup_iter = g.setup_iter[l->depth]; - l->num_eig_vect = g.num_eig_vect[l->depth]; - - if ( l->level > 0 ) - parameter_update( l->next_level ); -} - void read_global_info( FILE *in ) { void *save_pt; @@ -706,26 +740,30 @@ void read_no_default_info( FILE *in, level_struct *l ) { read_parameter( &save_pt, "d0 block lattice:", "%d", 4, in, _NO_DEFAULT_SET ); // Wilson mass - save_pt = &(l->real_shift); l->real_shift = 0; + save_pt = &(g.m0); g.m0 = 0; read_parameter( &save_pt, "m0:", "%lf", 1, in, _DEFAULT_SET ); - if ( l->real_shift == 0 ) { + if ( g.m0 == 0 ) { double kappa=0; save_pt = &(kappa); read_parameter( &save_pt, "kappa:", "%lf", 1, in, _DEFAULT_SET ); ASSERT(kappa != 0); - l->real_shift = 1./(2.*kappa)-4.; //setting m0 from kappa + g.m0 = 1./(2.*kappa)-4.; //setting m0 from kappa } save_pt = &(g.csw); read_parameter( &save_pt, "csw:", "%lf", 1, in, _NO_DEFAULT_SET ); #ifdef HAVE_TM - save_pt = &(g.tm_mu);g.tm_mu=0; + save_pt = &(g.mu);g.mu=0; read_parameter( &save_pt, "mu:", "%lf", 1, in, _DEFAULT_SET ); - if ( g.tm_mu == 0 ) { + if ( g.mu == 0 ) { read_parameter( &save_pt, "2KappaMu:", "%lf", 1, in, _DEFAULT_SET ); - g.tm_mu = g.tm_mu*(4.+l->real_shift); + g.mu = g.mu*(4.+g.m0); } #endif +#ifdef HAVE_TM1p1 + save_pt = &(g.epsbar); g.epsbar = 0; + read_parameter( &save_pt, "epsbar:", "%lf", 1, in, _DEFAULT_SET ); +#endif } void set_global_info( struct init *params, level_struct *l ) { @@ -747,10 +785,10 @@ void set_global_info( struct init *params, level_struct *l ) { } // Operator - l->real_shift = 1./(2.*params->kappa)-4.; + g.m0 = 1./(2.*params->kappa)-4.; g.csw = params->csw; #ifdef HAVE_TM - g.tm_mu = params->mu; + g.mu = params->mu; #endif g.num_openmp_processes = params->number_openmp_threads; @@ -781,23 +819,23 @@ void read_geometry_data( FILE *in, int ls ) { int i, mu, nb, nls, nlls, flag; for ( i=0; i0) { // global lattice sprintf( inputstr, "d%d global lattice:", i ); save_pt = g.global_lattice[i]; if ( ! 
read_parameter( &save_pt, inputstr, "%d", 4, in, _DEFAULT_SET ) ) { - nls = 1; - for ( mu=0; mu<4; mu++ ) { - g.global_lattice[i][mu] = g.global_lattice[i-1][mu]/g.block_lattice[i-1][mu]; - nls *= g.global_lattice[i][mu]; - } - if ( g.odd_even && nls < 2 ) { - warning0("lattice dimensions not valid for a %d-level method, choosing a %d-level method\n", g.num_levels, i ); - g.num_levels = i; ls = i; - break; - } + nls = 1; + for ( mu=0; mu<4; mu++ ) { + g.global_lattice[i][mu] = g.global_lattice[i-1][mu]/g.block_lattice[i-1][mu]; + nls *= g.global_lattice[i][mu]; + } + if ( g.odd_even && nls < 2 ) { + warning0("lattice dimensions not valid for a %d-level method, choosing a %d-level method\n", g.num_levels, i ); + g.num_levels = i; ls = i; + break; + } } // local lattice @@ -805,84 +843,84 @@ void read_geometry_data( FILE *in, int ls ) { save_pt = g.local_lattice[i]; if ( ! read_parameter( &save_pt, inputstr, "%d", 4, in, _DEFAULT_SET ) ) { - nls = 1; - nlls = 1; - for ( mu=0; mu<4; mu++ ) { - g.local_lattice[i][mu] = g.local_lattice[i-1][mu]/g.block_lattice[i-1][mu]; - nlls *= g.local_lattice[i][mu]; - nls *= g.global_lattice[i][mu]; - } - if ( g.odd_even && nlls < 2 ) { - if ( nls/nlls > 1 ) { - mu = shortest_dir( g.local_lattice[i] ); - if ( g.global_lattice[i][mu] > g.local_lattice[i][mu] ) { - g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], - g.global_lattice[i][mu]/g.local_lattice[i][mu] ); - } - } - } + nls = 1; + nlls = 1; + for ( mu=0; mu<4; mu++ ) { + g.local_lattice[i][mu] = g.local_lattice[i-1][mu]/g.block_lattice[i-1][mu]; + nlls *= g.local_lattice[i][mu]; + nls *= g.global_lattice[i][mu]; + } + if ( g.odd_even && nlls < 2 ) { + if ( nls/nlls > 1 ) { + mu = shortest_dir( g.local_lattice[i] ); + if ( g.global_lattice[i][mu] > g.local_lattice[i][mu] ) { + g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], + g.global_lattice[i][mu]/g.local_lattice[i][mu] ); + } + } + } } // block lattice for ( mu=0; mu<4; mu++ ) - g.block_lattice[i][mu] = 1; + g.block_lattice[i][mu] = 1; if ( i g.local_lattice[i][mu] ) { - g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], - g.global_lattice[i][mu]/g.local_lattice[i][mu] ); - } - } - - } + nb *= g.local_lattice[i][mu]/g.block_lattice[i][mu]; + + if ( g.local_lattice[i][mu] < g.block_lattice[i][mu] ) { + g.local_lattice[i][mu] *= g.block_lattice[i][mu]; + if ( ! 
DIVIDES( g.local_lattice[i][mu], g.global_lattice[i][mu] ) ) { + g.local_lattice[i][mu] /= g.block_lattice[i][mu]; + } + warning0("lattice dimensions not valid for a %d-level method, choosing a %d-level method\n", g.num_levels, i+1 ); + g.num_levels = i+1; ls=i+1; + g.block_lattice[i][mu] = 1; + flag = 0; + break; + } + } + + if ( flag == 1 && g.method == 2 && nb == 1 ) { + mu = shortest_dir( g.local_lattice[i] ); + if ( g.global_lattice[i][mu] > g.local_lattice[i][mu] ) { + g.local_lattice[i][mu] *= lcm( g.local_lattice[i][mu], + g.global_lattice[i][mu]/g.local_lattice[i][mu] ); + } + } + + } } } #ifdef DEBUG printf00("level: %d, gl: %3d %3d %3d %3d\n", i, g.global_lattice[i][0], - g.global_lattice[i][1],g.global_lattice[i][2],g.global_lattice[i][3] ); + g.global_lattice[i][1],g.global_lattice[i][2],g.global_lattice[i][3] ); printf00("level: %d, ll: %3d %3d %3d %3d\n", i, g.local_lattice[i][0], - g.local_lattice[i][1],g.local_lattice[i][2],g.local_lattice[i][3] ); - + g.local_lattice[i][1],g.local_lattice[i][2],g.local_lattice[i][3] ); + printf00("level: %d, bl: %3d %3d %3d %3d\n\n", i, g.block_lattice[i][0], - g.block_lattice[i][1],g.block_lattice[i][2],g.block_lattice[i][3] ); + g.block_lattice[i][1],g.block_lattice[i][2],g.block_lattice[i][3] ); #endif - - + + sprintf( inputstr, "d%d post smooth iter:", i ); save_pt = &(g.post_smooth_iter[i]); g.post_smooth_iter[i] = 4; read_parameter( &save_pt, inputstr, "%d", 1, in, _DEFAULT_SET ); @@ -910,7 +948,13 @@ void read_geometry_data( FILE *in, int ls ) { #ifdef HAVE_TM sprintf( inputstr, "d%d mu factor:", i ); - save_pt = &(g.tm_mu_factor[i]); g.tm_mu_factor[i] = 1; + save_pt = &(g.mu_factor[i]); g.mu_factor[i] = 1; + read_parameter( &save_pt, inputstr, "%lf", 1, in, _DEFAULT_SET ); +#endif + +#ifdef HAVE_TM1p1 + sprintf( inputstr, "d%d epsbar factor:", i ); + save_pt = &(g.epsbar_factor[i]); g.epsbar_factor[i] = 1; read_parameter( &save_pt, inputstr, "%lf", 1, in, _DEFAULT_SET ); #endif @@ -945,16 +989,23 @@ void read_solver_parameters( FILE *in, level_struct *l ) { save_pt = &(g.odd_even); g.odd_even = 1; read_parameter( &save_pt, "odd even preconditioning:", "%d", 1, in, _DEFAULT_SET ); - save_pt = &(g.setup_m0); g.setup_m0=l->real_shift; + save_pt = &(g.setup_m0); g.setup_m0 = g.m0; read_parameter( &save_pt, "setup m0:", "%lf", 1, in, _DEFAULT_SET ); #ifdef HAVE_TM - save_pt = &(g.tm_mu_odd_shift);g.tm_mu_odd_shift=0; + save_pt = &(g.mu_odd_shift); g.mu_odd_shift = 0; read_parameter( &save_pt, "mu odd shift:", "%lf", 1, in, _DEFAULT_SET ); - save_pt = &(g.tm_mu_even_shift);g.tm_mu_even_shift=0; + save_pt = &(g.mu_even_shift); g.mu_even_shift = 0; read_parameter( &save_pt, "mu even shift:", "%lf", 1, in, _DEFAULT_SET ); - save_pt = &(g.setup_tm_mu); g.setup_tm_mu=g.tm_mu; + save_pt = &(g.setup_mu); g.setup_mu = g.mu; read_parameter( &save_pt, "setup mu:", "%lf", 1, in, _DEFAULT_SET ); #endif + +#ifdef HAVE_TM1p1 + save_pt = &(g.epsbar_ig5_odd_shift);g.epsbar_ig5_odd_shift=0; + read_parameter( &save_pt, "epsbar odd shift:", "%lf", 1, in, _DEFAULT_SET ); + save_pt = &(g.epsbar_ig5_even_shift);g.epsbar_ig5_even_shift=0; + read_parameter( &save_pt, "epsbar even shift:", "%lf", 1, in, _DEFAULT_SET ); +#endif save_pt = &(g.method); g.method = 2; read_parameter( &save_pt, "method:", "%d", 1, in, _DEFAULT_SET ); @@ -1033,7 +1084,7 @@ void validate_parameters( int ls, level_struct *l ) { int i; int mu; - + #ifdef SSE if ( !g.odd_even ) warning0("The SSE implementation is based on the odd-even preconditioned code.\ @@ -1068,13 +1119,12 @@ void 
validate_parameters( int ls, level_struct *l ) { if ( g.block_lattice[i][mu] != g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ) warning0("when using SSE, Schwarz block size and aggregate size have to match.\n"); ASSERT( g.block_lattice[i][mu] == g.global_lattice[i][mu]/g.global_lattice[i+1][mu] ); + // it works everywhere but we have some problem with the vector size. + // TODO: check all vectora allocated with size l->inner_vector_size + ASSERT( g.num_eig_vect[i] % SIMD_LENGTH_float == 0 ); #endif } - for ( i=0; i 0, g.kcycle_max_restart > 0 ) ); ASSERT( IMPLIES( g.kcycle && g.method > 0, 0 < g.kcycle_tol && g.kcycle_tol < 1 ) ); + + //LIST OF CASES WHICH SHOULD WORK, BUT DO NOT (TODO) + #ifdef SSE ASSERT( g.mixed_precision ); -// ASSERT( DIVIDES( 4, g.num_eig_vect[0] ) ); +#endif + + //TODO: Could work without, but you need to fix the setup phase. + for ( i=0; i2 + if ( g.num_levels>2 && g.interpolation ) + ASSERT( g.mixed_precision ); + +#ifdef HAVE_TM1p1 + //TODO: method = 6 not supported with HAVE_TM1p1. To fix all the g5D functions + ASSERT( g.method !=6 ); #endif } @@ -1130,7 +1195,10 @@ void allocate_for_global_struct_after_read_global_info( int ls ) { MALLOC( g.ncycle, int, ls ); MALLOC( g.relax_fac, double, ls ); #ifdef HAVE_TM - MALLOC( g.tm_mu_factor, double, ls ); + MALLOC( g.mu_factor, double, ls ); +#endif +#ifdef HAVE_TM1p1 + MALLOC( g.epsbar_factor, double, ls ); #endif MALLOC( g.block_iter, int, ls ); MALLOC( g.setup_iter, int, ls ); @@ -1157,7 +1225,8 @@ void set_level_and_global_structs_according_to_global_struct( level_struct *l ) l->block_iter = g.block_iter[0]; l->setup_iter = g.setup_iter[0]; l->num_eig_vect = g.num_eig_vect[0]; - + l->num_parent_eig_vect = 6; //for consistency sake + // compute some additional values l->num_lattice_site_var = 12; g.num_processes = 1; @@ -1170,16 +1239,7 @@ void set_level_and_global_structs_according_to_global_struct( level_struct *l ) g.num_processes *= l->global_splitting[mu]; } - l->dirac_shift = l->real_shift; -#ifdef HAVE_TM - l->tm_shift = g.tm_mu; - l->tm_even_shift = g.tm_mu_even_shift; - l->tm_odd_shift = g.tm_mu_odd_shift; -#endif - l->even_shift = l->dirac_shift; - l->odd_shift = l->dirac_shift; - g.solve_m0 = l->dirac_shift; - g.setup_m0 = l->dirac_shift; + g.setup_m0 = g.m0; } void lg_in( char *inputfile, level_struct *l ) { @@ -1214,6 +1274,29 @@ void lg_in( char *inputfile, level_struct *l ) { fclose(in); } +void parameter_update( level_struct *l ) { + + if(l->depth==0) { + int ls = MAX(g.num_levels,2); + set_level_and_global_structs_according_to_global_struct( l ); + validate_parameters( ls, l ); + } + + l->level = g.num_levels-1-l->depth; + l->post_smooth_iter = g.post_smooth_iter[l->depth]; + l->block_iter = g.block_iter[l->depth]; + l->setup_iter = g.setup_iter[l->depth]; + l->num_eig_vect = g.num_eig_vect[l->depth]; + if(l->depth>0) + l->num_parent_eig_vect = g.num_eig_vect[l->depth-1]; + else + l->num_parent_eig_vect = 6; + + if ( l->level > 0 && l->next_level != NULL ) + parameter_update( l->next_level ); +} + + void set_DDalphaAMG_parameters( struct init *params, level_struct *l ) { FILE *in=NULL; diff --git a/src/init_generic.c b/src/init_generic.c index d14ca5a..7f6b50b 100644 --- a/src/init_generic.c +++ b/src/init_generic.c @@ -98,12 +98,19 @@ double prof_PRECISION_print( level_struct *l ) { void fine_level_PRECISION_alloc( level_struct *l ) { int n = 8; - +#ifdef HAVE_TM1p1 + MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); + for ( int i=1; ivbuf_PRECISION[i] = 
l->vbuf_PRECISION[0] + 2*i*l->vector_size; + MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); + l->p_PRECISION.x = l->p_PRECISION.b + 2*l->inner_vector_size; +#else MALLOC( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); for ( int i=1; ivbuf_PRECISION[i] = l->vbuf_PRECISION[0] + i*l->vector_size; MALLOC( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); l->p_PRECISION.x = l->p_PRECISION.b + l->inner_vector_size; +#endif } @@ -111,11 +118,19 @@ void fine_level_PRECISION_free( level_struct *l ) { int n = 8; +#ifdef HAVE_TM1p1 + FREE( l->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->vector_size ); + for ( int i=1; ivbuf_PRECISION[i] = NULL; + FREE( l->p_PRECISION.b, complex_PRECISION, 2*2*l->inner_vector_size ); + l->p_PRECISION.x = NULL; +#else FREE( l->vbuf_PRECISION[0], complex_PRECISION, n*l->vector_size ); for ( int i=1; ivbuf_PRECISION[i] = NULL; FREE( l->p_PRECISION.b, complex_PRECISION, 2*l->inner_vector_size ); l->p_PRECISION.x = NULL; +#endif } @@ -143,18 +158,28 @@ void next_level_PRECISION_setup( level_struct *l ) { g.method==6?g5D_apply_coarse_operator_PRECISION:apply_coarse_operator_PRECISION, &(l->next_level->p_PRECISION), l->next_level ); } else { +#ifdef HAVE_TM1p1 + MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); + l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + 2*l->next_level->vector_size; +#else MALLOC( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); l->next_level->p_PRECISION.x = l->next_level->p_PRECISION.b + l->next_level->vector_size; - l->next_level->p_PRECISION.shift = 0; +#endif l->next_level->p_PRECISION.v_start = 0; - l->next_level->p_PRECISION.v_end = l->inner_vector_size; + l->next_level->p_PRECISION.v_end = l->next_level->inner_vector_size; } } int i, n = (l->next_level->level>0)?6:4; +#ifdef HAVE_TM1p1 + MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); + for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + 2*i*l->next_level->vector_size; +#else MALLOC( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); for ( i=1; inext_level->vbuf_PRECISION[i] = l->next_level->vbuf_PRECISION[0] + i*l->next_level->vector_size; +#endif } } @@ -167,13 +192,21 @@ void next_level_PRECISION_free( level_struct *l ) { if ( ( l->level == 1 && !l->next_level->idle ) || g.kcycle ) { fgmres_PRECISION_struct_free( &(l->next_level->p_PRECISION), l->next_level ); } else { +#ifdef HAVE_TM1p1 + FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*2*l->next_level->vector_size ); +#else FREE( l->next_level->p_PRECISION.b, complex_PRECISION, 2*l->next_level->vector_size ); +#endif } int i, n = (l->next_level->level>0)?6:4; for ( i=1; inext_level->vbuf_PRECISION[i] = NULL; +#ifdef HAVE_TM1p1 + FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, 2*n*l->next_level->vector_size ); +#else FREE( l->next_level->vbuf_PRECISION[0], complex_PRECISION, n*l->next_level->vector_size ); +#endif coarsening_index_table_PRECISION_free( &(l->is_PRECISION), l ); } @@ -203,9 +236,7 @@ void vcycle_timing_PRECISION( int n, level_struct *l, struct Thread *threading ) PUBLIC_MALLOC( v1, complex_PRECISION, l->inner_vector_size ); PUBLIC_MALLOC( v2, complex_PRECISION, l->inner_vector_size ); - START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( v2, 0, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_random( v2, 0, 
l->inner_vector_size, l, threading ); START_MASTER(threading) t0 = MPI_Wtime(); diff --git a/src/interpolation_generic.c b/src/interpolation_generic.c index 5c78c9f..b6a4436 100644 --- a/src/interpolation_generic.c +++ b/src/interpolation_generic.c @@ -21,22 +21,26 @@ #include "main.h" -#if ( !defined( SSE ) || !defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) ) - void interpolation_PRECISION_alloc( level_struct *l ) { int k, n = l->num_eig_vect; MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); l->is_PRECISION.interpolation[0] = NULL; MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 64 ); for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; MALLOC( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); +#else + // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size + MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, + ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); +#endif l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 64 ); + MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; } @@ -64,9 +68,13 @@ void interpolation_PRECISION_free( level_struct *l ) { FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); FREE( l->is_PRECISION.operator, complex_PRECISION, n*l->inner_vector_size ); +#else + FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); +#endif } @@ -80,12 +88,39 @@ void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, int end = threading->end_index[l->depth]; SYNC_CORES(threading) +#ifndef OPTIMIZED_INTERPOLATION_OPERATOR_PRECISION operator += start*num_eig_vect; for ( int i=start; i num_eig_vect) + j_end = num_eig_vect; + + operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; + + for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; + int i, j, k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, + num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; @@ -102,64 +137,184 @@ void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_ vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); END_LOCKED_MASTER(threading) SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; 
in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; - for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + PRECISION tmp_phi1_c_re[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi1_c_im[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi2_c_re[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi2_c_im[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { + mm_store_PRECISION(tmp_phi1_c_re+j, zero); + mm_store_PRECISION(tmp_phi1_c_im+j, zero); + mm_store_PRECISION(tmp_phi2_c_re+j, zero); + mm_store_PRECISION(tmp_phi2_c_im+j, zero); + } + // copy phi_c into temporary + for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; + + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + PRECISION tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + PRECISION tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_PRECISION]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { + mm_store_PRECISION(tmp_phi_c_re+j, zero); + mm_store_PRECISION(tmp_phi_c_im+j, zero); + } + // copy phi_c into temporary + for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; + + for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - int sign = 1; - operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, sign = 1, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; + int i, j, k, k1, k2, 
num_aggregates = l->is_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, + num_parent_eig_vect = l->num_parent_eig_vect, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; +#ifndef OPTIMIZED_INTERPOLATION_OPERATOR_PRECISION + int sign = 1; +#ifdef HAVE_TM1p1 + if( g.n_flavours==2 ) + for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + + for ( j=0; j<2*2*num_eig_vect; j++ ) + phi_c_pt[j] = 0; + + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + operator = l->is_PRECISION.operator + i*2*num_eig_vect*num_parent_eig_vect*aggregate_sites; + + for ( j=0; j<2*num_eig_vect; j++ ) + phi_c_pt[j] = 0; + + for ( k=0; kn_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + + int offset = SIMD_LENGTH_PRECISION; + phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*2*num_eig_vect; + + // loop over blocks of SIMD_LENGTH_PRECISION vectors + for ( j=0; jis_PRECISION.operator + j*l->vector_size/2 + i*2*offset*num_parent_eig_vect*aggregate_sites; + + // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving + // complex components and masking + // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) + PRECISION tmp_phi1_c_re[2*offset]; + PRECISION tmp_phi1_c_im[2*offset]; + PRECISION tmp_phi2_c_re[2*offset]; + PRECISION tmp_phi2_c_im[2*offset]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( k1=0; k1<2*offset; k1+=offset ) { + mm_store_PRECISION(tmp_phi1_c_re+k1, zero); + mm_store_PRECISION(tmp_phi1_c_im+k1, zero); + mm_store_PRECISION(tmp_phi2_c_re+k1, zero); + mm_store_PRECISION(tmp_phi2_c_im+k1, zero); + } + + for ( k=0; k broadcast + mm_PRECISION phi1_re = mm_set1_PRECISION(((PRECISION *)phi_pt)[0]); + mm_PRECISION phi1_im = mm_set1_PRECISION(((PRECISION *)phi_pt)[1]); + mm_PRECISION phi2_re = mm_set1_PRECISION(((PRECISION *)phi_pt)[0+2*num_parent_eig_vect]); + mm_PRECISION phi2_im = mm_set1_PRECISION(((PRECISION *)phi_pt)[1+2*num_parent_eig_vect]); - for ( j=0; j<2*num_eig_vect; j++ ) - phi_c_pt[j] = 0; - - for ( k=0; k= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+j+m))[0] = tmp_phi1_c_re[m]; + ((PRECISION*)(phi_c_pt+j+m))[1] = tmp_phi1_c_im[m]; + } + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi2_c_re[m]; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi2_c_im[m]; + } + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+2*num_eig_vect+j+m))[0] = tmp_phi1_c_re[m+offset]; + ((PRECISION*)(phi_c_pt+2*num_eig_vect+j+m))[1] = tmp_phi1_c_im[m+offset]; + } + for ( int m=0; m= 
num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+3*num_eig_vect+j+m))[0] = tmp_phi2_c_re[m+offset]; + ((PRECISION*)(phi_c_pt+3*num_eig_vect+j+m))[1] = tmp_phi2_c_im[m+offset]; + } } } - } + else +#endif + for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { + + int offset = SIMD_LENGTH_PRECISION; + // loop over blocks of SIMD_LENGTH_PRECISION vectors + for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; + operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; + + // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving + // complex components and masking + // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) + PRECISION tmp_phi_c_re[2*offset]; + PRECISION tmp_phi_c_im[2*offset]; + mm_PRECISION zero = mm_setzero_PRECISION(); + for ( k1=0; k1<2*offset; k1+=offset ) { + mm_store_PRECISION(tmp_phi_c_re+k1, zero); + mm_store_PRECISION(tmp_phi_c_im+k1, zero); + } + + for ( k=0; k broadcast + mm_PRECISION phi_re = mm_set1_PRECISION(((PRECISION *)phi_pt)[0]); + mm_PRECISION phi_im = mm_set1_PRECISION(((PRECISION *)phi_pt)[1]); + + mm_PRECISION operator_re = mm_load_PRECISION((PRECISION *)operator); + mm_PRECISION operator_im = mm_load_PRECISION((PRECISION *)operator+offset); + mm_PRECISION phi_c_re = mm_load_PRECISION(tmp_phi_c_re+low_high_offset); + mm_PRECISION phi_c_im = mm_load_PRECISION(tmp_phi_c_im+low_high_offset); + + cfmadd_conj_PRECISION(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); + + mm_store_PRECISION(tmp_phi_c_re+low_high_offset, phi_c_re); + mm_store_PRECISION(tmp_phi_c_im+low_high_offset, phi_c_im); + // skip to next real line of matrix + operator += offset; + phi_pt++; + } + low_high_offset = offset; + } + } + + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; + ((PRECISION*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; + } + + for ( int m=0; m= num_eig_vect ) break; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; + ((PRECISION*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; + } + } + } +#endif SYNC_HYPERTHREADS(threading) START_LOCKED_MASTER(threading) @@ -205,5 +543,3 @@ void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_str END_LOCKED_MASTER(threading) PROF_PRECISION_STOP( _PR, 1, threading ); } - -#endif diff --git a/src/interpolation_generic.h b/src/interpolation_generic.h index 97be6ec..90e9051 100644 --- a/src/interpolation_generic.h +++ b/src/interpolation_generic.h @@ -22,17 +22,16 @@ #ifndef INTERPOLATION_PRECISION_HEADER #define INTERPOLATION_PRECISION_HEADER - struct Thread; void interpolation_PRECISION_alloc( level_struct *l ); void interpolation_PRECISION_free( level_struct *l ); void interpolation_PRECISION_dummy_alloc( level_struct *l ); void interpolation_PRECISION_dummy_free( level_struct *l ); - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, struct Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, struct Thread *threading ); + void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); + void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION 
phi_c, level_struct *l, Thread *threading );
+   void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading );
-   void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading );
+   void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, Thread *threading );
 #endif
diff --git a/src/io.c b/src/io.c
index 354d545..deb104c 100644
--- a/src/io.c
+++ b/src/io.c
@@ -373,7 +373,7 @@ unsigned int initFile( char *filename, const int mode, level_struct *l ) {
   lls[1] = l->local_lattice[1];
   lls[2] = l->local_lattice[2];
   lls[3] = l->local_lattice[0];
-  writeSmallDataset_double(configgroup_id, "m0", l->real_shift);
+  writeSmallDataset_double(configgroup_id, "m0", g.m0);
   writeSmallDataset_double(configgroup_id, "csw", g.csw);
   writeSmallDataset_double(configgroup_id, "plaquette_clov", g.plaq_clov);
   writeSmallDataset_double(configgroup_id, "plaquette_hopp", g.plaq_hopp);
@@ -548,7 +548,7 @@ void write_header_mg( FILE **file, double *lambda, char* vector_type, int n, lev
   fprintf( *file, "
\n" ); fprintf( *file, "%s\n", vector_type ); fprintf( *file, "clifford basis: %s\n", CLIFFORD_BASIS ); - fprintf( *file, "m0: %.14lf\n", l->real_shift ); + fprintf( *file, "m0: %.14lf\n", g.m0 ); fprintf( *file, "csw: %.14lf\n", g.csw ); fprintf( *file, "clov plaq: %.14lf\n", g.plaq_clov ); fprintf( *file, "hopp plaq: %.14lf\n", g.plaq_hopp ); diff --git a/src/lime_io.c b/src/lime_io.c index 9dfa5e3..a082271 100644 --- a/src/lime_io.c +++ b/src/lime_io.c @@ -33,7 +33,7 @@ typedef struct lime_fileinfo { * LIME functions * * In DDalphaAMG format: - ** t slowest running index + ** t slowest running index ** x fastest running index ** all positive directions ** ordering: +T,+Z,+Y,+X @@ -502,7 +502,7 @@ void lime_write_vector( double *phi, char *filename ) { } } - if ( g.my_rank == 0 ) { + if ( g.my_rank == 0 ) { for ( i=0; idata + i ) ); } diff --git a/src/linalg.c b/src/linalg.c index 3487404..cdc1171 100644 --- a/src/linalg.c +++ b/src/linalg.c @@ -21,7 +21,6 @@ #include "main.h" -#ifndef OPTIMIZED_LINALG_float void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, vector_float psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -36,6 +35,8 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ SYNC_CORES(threading) +#ifndef OPTIMIZED_LINALG_float + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cworkspace)[threading->core] = results; END_NO_HYPERTHREADS(threading) @@ -60,7 +86,7 @@ void process_multi_inner_product_MP( int count, complex_double *results, vector_ PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif + double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { diff --git a/src/linalg.h b/src/linalg.h index 4182def..62e95b5 100644 --- a/src/linalg.h +++ b/src/linalg.h @@ -24,16 +24,8 @@ struct Thread; - void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ); - - void vector_float_multi_saxpy( vector_float z, vector_float *V, complex_float *alpha, - int sign, int count, int start, int end, level_struct *l ); - - void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, - vector_float psi, int start, int end, level_struct *l, - struct Thread *threading ); - + void process_multi_inner_product_MP( int count, complex_double *results, vector_float *phi, vector_float psi, + int start, int end, level_struct *l, struct Thread *threading ); double global_norm_MP( vector_float x, int start, int end, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linalg_generic.c b/src/linalg_generic.c index 58153e1..635426c 100644 --- a/src/linalg_generic.c +++ b/src/linalg_generic.c @@ -21,57 +21,69 @@ #include "main.h" -#include "sse_float_intrinsic.h" -#include "sse_linalg.h" -#include "sse_linalg_PRECISION.h" - -#ifndef OPTIMIZED_LINALG_PRECISION -complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { +complex_PRECISION global_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, + level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); complex_PRECISION local_alpha = 0, global_alpha = 0; int thread_start; int thread_end; - compute_core_start_end(start, 
end, &thread_start, &thread_end, l, threading); - SYNC_CORES(threading) + SYNC_CORES(threading); +#ifndef OPTIMIZED_LINALG_PRECISION + + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) + ((complex_PRECISION *)threading->workspace)[threading->core] = local_alpha; + END_NO_HYPERTHREADS(threading); // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_PRECISION *)threading->workspace)[0] += ((complex_PRECISION *)threading->workspace)[i]; - local_alpha = ((complex_PRECISION *)threading->workspace)[0]; - END_MASTER(threading) + SYNC_CORES(threading); + MASTER(threading) { + for(int i=1; in_core; i++) + ((complex_PRECISION *)threading->workspace)[0] += ((complex_PRECISION *)threading->workspace)[i]; + local_alpha = ((complex_PRECISION *)threading->workspace)[0]; + } if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - ((complex_PRECISION *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) + MASTER(threading) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + ((complex_PRECISION *)threading->workspace)[0] = global_alpha; + } // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); global_alpha = ((complex_PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return global_alpha; } else { // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); local_alpha = ((complex_PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return local_alpha; } } -#endif complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -103,7 +115,6 @@ complex_PRECISION process_inner_product_PRECISION( vector_PRECISION phi, vector_ } -#if !defined( OPTIMIZED_LINALG_PRECISION ) void process_multi_inner_product_PRECISION( int count, complex_PRECISION *results, vector_PRECISION *phi, vector_PRECISION psi, int start, int end, level_struct *l, struct Thread *threading ) { @@ -116,25 +127,40 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result int thread_end; SYNC_CORES(threading) + +#ifndef OPTIMIZED_LINALG_PRECISION + if ( l->depth == 0 ) { compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); for(int c=0; cworkspace)[threading->core] = results; END_NO_HYPERTHREADS(threading) @@ -152,7 +178,6 @@ void process_multi_inner_product_PRECISION( int count, complex_PRECISION *result PROF_PRECISION_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); } -#endif complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ) { @@ -168,7 +193,7 @@ complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECI return numerator/denominator; } 
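/*
 * Illustrative sketch (not part of the patch): the reduction pattern that
 * global_inner_product_PRECISION and global_norm_PRECISION follow after this
 * restructuring -- every core accumulates a partial result over its slice of
 * the vector, the partials are combined on the master, and one MPI_Allreduce
 * makes the global value known to all processes.  The library's Thread
 * struct, workspace buffer and SYNC_* macros are replaced here by a plain
 * OpenMP reduction, so every name and signature below is an assumption made
 * for illustration only.
 */
#include <math.h>
#include <mpi.h>

static double global_norm_sketch( const double *x, int n, MPI_Comm comm ) {
  double partial = 0.0;

  /* per-core partial sums over disjoint index ranges (the library does this
     manually via compute_core_start_end and threading->workspace) */
  #pragma omp parallel for reduction(+:partial)
  for ( int i = 0; i < n; i++ )
    partial += x[i]*x[i];

  /* one collective over all MPI ranks; the result is then available on every
     process, mirroring SYNC_MASTER_TO_ALL after the library's MPI_Allreduce */
  double global = 0.0;
  MPI_Allreduce( &partial, &global, 1, MPI_DOUBLE, MPI_SUM, comm );

  return sqrt( global );
}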
-#ifndef OPTIMIZED_LINALG_PRECISION + PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { PROF_PRECISION_START( _GIP, threading ); @@ -177,45 +202,57 @@ PRECISION global_norm_PRECISION( vector_PRECISION x, int start, int end, level_s int thread_start; int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - SYNC_CORES(threading) + SYNC_CORES(threading); + +#ifndef OPTIMIZED_LINALG_PRECISION + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) + ((PRECISION *)threading->workspace)[threading->core] = local_alpha; + END_NO_HYPERTHREADS(threading); // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((PRECISION *)threading->workspace)[0] += ((PRECISION *)threading->workspace)[i]; - local_alpha = ((PRECISION *)threading->workspace)[0]; - END_MASTER(threading) + SYNC_CORES(threading); + MASTER(threading) { + for(int i=1; in_core; i++) + ((PRECISION *)threading->workspace)[0] += ((PRECISION *)threading->workspace)[i]; + local_alpha = ((PRECISION *)threading->workspace)[0]; + } if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - ((PRECISION *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) + MASTER(threading) { + PROF_PRECISION_START( _ALLR ); + MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); + PROF_PRECISION_STOP( _ALLR, 1 ); + ((PRECISION *)threading->workspace)[0] = global_alpha; + } // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); global_alpha = ((PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return (PRECISION)sqrt((double)global_alpha); } else { // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) + SYNC_MASTER_TO_ALL(threading); local_alpha = ((PRECISION *)threading->workspace)[0]; PROF_PRECISION_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); return (PRECISION)sqrt((double)local_alpha); } } -#endif + PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ) { @@ -245,6 +282,53 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_ return (PRECISION)sqrt((double)local_alpha); } +// vector storage for PRECISION precision +void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + THREADED_VECTOR_FOR( i, start, end, phi[i] = value, i++, l, threading ); + + PROF_PRECISION_STOP( _SET, 1, threading ); +} + +void vector_PRECISION_define_real( vector_PRECISION phi, PRECISION value, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + PRECISION *phi_pt = (PRECISION*) phi; + THREADED_VECTOR_FOR( i, 2*start, 2*end, phi_pt[i] = value; phi_pt[i+1] = 0, i+=2, l, threading ); + + PROF_PRECISION_STOP( _SET, 1, threading ); +} + +void 
vector_PRECISION_define_zero( vector_PRECISION phi, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + PRECISION *phi_pt = (PRECISION*) phi; + THREADED_VECTOR_FOR( i, 2*start, 2*end, phi_pt[i] = phi_pt[i+1] = 0, i+=2, l, threading ); + + PROF_PRECISION_STOP( _SET, 1, threading ); +} + + +void vector_PRECISION_define_random( vector_PRECISION phi, int start, int end, level_struct *l, Thread *threading ) { + + int i; + PROF_PRECISION_START( _SET, threading ); + + // this would yield different results if we threaded it, so we don't + START_LOCKED_MASTER(threading) + VECTOR_FOR( i=start, iinner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ) { +void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l, struct Thread *threading ) { - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_PRECISION_START( _LA6 ); + int thread_start, thread_end; + PROF_PRECISION_START( _LA6, threading ); + +#ifndef OPTIMIZED_LINALG_PRECISION + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=start, iinner_vector_size ); -} +#else + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, SIMD_LENGTH_PRECISION); + mm_PRECISION alpha_re = mm_set1_PRECISION( creal_PRECISION(alpha) ); + mm_PRECISION alpha_im = mm_set1_PRECISION( cimag_PRECISION(alpha) ); + + for( int i=start; iinner_vector_size, threading ); +} void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, @@ -307,126 +403,92 @@ void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, i int thread = omp_get_thread_num(); if(thread == 0 && start != end) - PROF_PRECISION_START( _CPY ); - + PROF_PRECISION_START( _CPY ); + VECTOR_FOR( int i=start, iinner_vector_size ); + PROF_PRECISION_STOP( _CPY, (double)(end-start)/(double)l->inner_vector_size ); } -#ifndef OPTIMIZED_LINALG_PRECISION -void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ) { +void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, + int start, int end, level_struct *l, struct Thread *threading ) { - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_PRECISION_START( _LA8 ); + int thread_start, thread_end; + PROF_PRECISION_START( _LA8, threading ); +#ifndef OPTIMIZED_LINALG_PRECISION + + compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); VECTOR_FOR( int i=start, iinner_vector_size ); -} +#else + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, SIMD_LENGTH_PRECISION); + mm_PRECISION alpha_re = mm_set1_PRECISION( creal_PRECISION(alpha) ); + mm_PRECISION alpha_im = mm_set1_PRECISION( cimag_PRECISION(alpha) ); + + for ( int i=start; iinner_vector_size, threading ); +} + +void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, + int count, int start, int end, level_struct *l, struct Thread *threading ) { - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_PRECISION_START( _LA8 ); + int thread_start, thread_end; + PROF_PRECISION_START( _LA8, threading 
); +#ifndef OPTIMIZED_LINALG_PRECISION + complex_PRECISION alpha_signed[count]; for ( int c=0; cstart_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; - } -} - -void vector_PRECISION_gamma5_set_even_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { - - int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_ODD){ - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; - } -} + } +#else -void vector_PRECISION_set_odd_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ) { - - int i = threading->start_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_EVEN){ - FOR12( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; + compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, SIMD_LENGTH_PRECISION); + for ( int c=0; cstart_site[l->depth]; - vector_PRECISION eta_end = eta + threading->end_index[l->depth]; - eta += threading->start_index[l->depth]; - phi += threading->start_index[l->depth]; - - while ( eta < eta_end ) { - if(g.odd_even_table[i]==_EVEN){ - FOR6( *eta = -(*phi); phi++; eta++; ) - FOR6( *eta = (*phi); phi++; eta++; ) - } - else if(g.odd_even_table[i]==_ODD){ - FOR12( *eta = 0; phi++; eta++; ) - } - i++; - } -} void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, Thread *threading ) { @@ -439,7 +501,7 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, complex_PRECISION ip[k], ip_buffer[2*k]; MALLOC( v_tmp, complex_PRECISION, l->inner_vector_size ); - vector_PRECISION_define(v_tmp, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_define_zero( v_tmp, 0, l->inner_vector_size, l, threading ); MALLOC( W_tmp, complex_PRECISION*, k ); W_tmp[0] = NULL; @@ -448,7 +510,7 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, W_tmp[j] = W_tmp[0]+j*l->inner_vector_size; for ( j=0; jinner_vector_size, l ); + vector_PRECISION_scale( W_tmp[j], W[j], diag[j], 0, l->inner_vector_size, l, threading ); } process_multi_inner_product_PRECISION( k, ip, W_tmp, v, 0, l->inner_vector_size, l, threading ); @@ -457,10 +519,10 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, ip_buffer[j] = ip[j]; } MPI_Allreduce( ip_buffer, ip_buffer+k, k, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - END_MASTER(threading) - SYNC_MASTER_TO_ALL(threading) + END_MASTER(threading); + SYNC_MASTER_TO_ALL(threading); - vector_PRECISION_multi_saxpy( v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l ); + 
vector_PRECISION_multi_saxpy( v_tmp, W_tmp, ip_buffer+k, 1, k, 0, l->inner_vector_size, l, threading ); if (orthogonal) vector_PRECISION_minus( z, v, v_tmp, 0, l->inner_vector_size, l ); @@ -472,75 +534,6 @@ void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, FREE( W_tmp, complex_PRECISION*, k ); } -void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vect, level_struct *l, struct Thread *threading ) { - - PROF_PRECISION_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - int i, j, k, k1, k2, num_aggregates = l->s_PRECISION.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - complex_PRECISION alpha1, alpha2; - vector_PRECISION v_pt1, v_pt2; - PRECISION norm1, norm2; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - for ( k1=0; k1 V[k2] | 2*j-th and 2*j+1-st aggregate - for ( i=0; ivector_size-l->inner_vector_size)/(double)l->inner_vector_size, threading ); } - - -void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int begin, const int n, level_struct *l, struct Thread *threading ) { - - // NOTE: only thread safe, if "buffer" is the same buffer for all threads belonging to a common MPI process - START_MASTER(threading) - PROF_PRECISION_START( _LA ); - END_MASTER(threading) - SYNC_CORES(threading) - - PRECISION beta; - int i, j, start, end; - - compute_core_start_end_custom( 0, l->inner_vector_size, &start, &end, l, threading, l->num_lattice_site_var ); - - for ( i=begin; iinner_vector_size, l, threading ); - SYNC_CORES(threading) - START_MASTER(threading) - for ( j=0; j0 ) { - START_MASTER(threading) - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( buffer, buffer+n, i, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - END_MASTER(threading) - SYNC_MASTER_TO_ALL(threading) - } - - for( j=0; jinner_vector_size, l, threading ); - SYNC_MASTER_TO_ALL(threading) - vector_PRECISION_real_scale( V[i], V[i], creal(1.0/beta), start, end, l ); - SYNC_CORES(threading) - } - - START_MASTER(threading) - PROF_PRECISION_STOP( _LA, 1 ); - END_MASTER(threading) - SYNC_CORES(threading) -} - - -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) -void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*64; - complex_PRECISION tmp[cache_block_size]; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - for ( int i=thread_start; iworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - - -#if !defined( SSE ) || !defined( GRAM_SCHMIDT_VECTORIZED_PRECISION ) -void 
setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*64; - complex_PRECISION tmp[cache_block_size]; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - for ( int i=thread_start; iinner_vector_size; - int thread_start = threading->start_index[l->depth]; - int thread_end = threading->end_index[l->depth]; - - complex_PRECISION thread_buffer[4*n]; - - for ( i=0; i<4*n; i++ ) - thread_buffer[i] = 0; - - for ( i=0; idepth > 0 ) { - coarse_gamma5_PRECISION( g5v, V[i], thread_start, thread_end, l ); - for ( j=0; j0 ) { - PROF_PRECISION_START( _ALLR ); - MPI_Allreduce( thread_buffer, thread_buffer+2*n, 2*n, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm ); - PROF_PRECISION_STOP( _ALLR, 1 ); - } - for ( j=2*n; j<4*n; j++ ) - ((complex_PRECISION *)(threading->workspace))[j] = thread_buffer[j]; - END_LOCKED_MASTER(threading) - for ( j=2*n; j<4*n; j++ ) - thread_buffer[j] = ((complex_PRECISION *)(threading->workspace))[j]; - - - if ( l->depth > 0 ) { - for( j=0; jinner_vector_size, threading ); -} - diff --git a/src/linalg_generic.h b/src/linalg_generic.h index 29af91e..7df053d 100644 --- a/src/linalg_generic.h +++ b/src/linalg_generic.h @@ -22,17 +22,6 @@ #ifndef LINALG_PRECISION_HEADER #define LINALG_PRECISION_HEADER -#ifdef _M10TV - #define VECTOR_FOR( start, end, expression, update, l ) do{ \ - if ( l->depth == 0 ) { \ - for ( start; end; ) \ - FOR12( expression; update; ) \ - } else { \ - for ( start; end; ) \ - FOR20( expression; update; ) \ - } \ - } while(0) -#else #define VECTOR_FOR( start, end, expression, update, l ) do{ \ if ( l->depth == 0 ) { \ for ( start; end; ) \ @@ -42,20 +31,7 @@ FOR2( expression; update; ) \ } \ } while(0) -#endif - -#ifdef _M10TV - #define REAL_VECTOR_FOR( start, end, expression, update, l ) do{ \ - if ( l->depth == 0 ) { \ - for ( start; end; ) \ - FOR24( expression; update; ) \ - } else { \ - for ( start; end; ) \ - FOR40( expression; update; ) \ - } \ - } while(0) -#else #define REAL_VECTOR_FOR( start, end, expression, update, l ) do{ \ if ( l->depth == 0 ) { \ for ( start; end; ) \ @@ -65,23 +41,7 @@ FOR4( expression; update; ) \ } \ } while(0) -#endif - -#ifdef _M10TV - #define THREADED_VECTOR_FOR( i, start_index, end_index, expression, update, l, threading ) do{ \ - int thread_start, thread_end; \ - if ( l->depth == 0 ) { \ - compute_core_start_end_custom(start_index, end_index, &thread_start, &thread_end, l, threading, 12); \ - for ( i=thread_start; idepth == 0 ) { \ @@ -94,8 +54,6 @@ FOR2( expression; update; ) \ } \ } while(0) -#endif - struct Thread; @@ -109,41 +67,28 @@ PRECISION process_norm_PRECISION( vector_PRECISION x, int start, int end, level_struct *l, struct Thread *threading ); complex_PRECISION local_xy_over_xx_PRECISION( vector_PRECISION phi, vector_PRECISION psi, int start, int end, level_struct *l ); + + void vector_PRECISION_define( vector_PRECISION phi, complex_PRECISION value, int start, int end, level_struct *l, Thread *threading ); + void vector_PRECISION_define_real( vector_PRECISION phi, PRECISION value, int start, int end, level_struct *l, Thread *threading ); + void vector_PRECISION_define_zero( vector_PRECISION phi, int start, int end, level_struct *l, Thread *threading ); + void vector_PRECISION_define_random( vector_PRECISION phi, int 
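The removed setup_gram_schmidt_PRECISION_compute_dots accumulates many inner products against one candidate vector in cache-sized blocks (12*64 entries there), so that each block of the right-hand vector is loaded once and reused by every test vector, with each core depositing its partial sums in threading->workspace for the master to combine. A rough single-process sketch of the blocking idea only (illustrative names; the per-core reduction and the gamma5 split of the removed routine are omitted):

#include <complex.h>

/* dots[c] = <V[c], w> for c = 0..count-1, sweeping w in blocks so
 * that one block of w stays in cache while all V[c] visit it.    */
static void blocked_dots( double complex *dots, double complex **V,
                          const double complex *w, int count,
                          int len, int block ) {
  for ( int c=0; c<count; c++ )
    dots[c] = 0;
  for ( int i=0; i<len; i+=block ) {
    int b = ( i+block <= len ) ? block : len-i;
    for ( int c=0; c<count; c++ )
      for ( int k=0; k<b; k++ )
        dots[c] += conj(V[c][i+k]) * w[i+k];
  }
}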
start, int end, level_struct *l, Thread *threading ); void vector_PRECISION_plus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x + y void vector_PRECISION_minus( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, int start, int end, level_struct *l ); // z := x - y - void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := alpha*x + // z := alpha*x + void vector_PRECISION_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, + level_struct *l, struct Thread *threading ); void vector_PRECISION_real_scale( vector_PRECISION z, vector_PRECISION x, complex_PRECISION alpha, int start, int end, level_struct *l ); - void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, int start, int end, level_struct *l ); // z := x + alpha*y - void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); // z := x - void vector_PRECISION_set_even_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_gamma5_set_even_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_set_odd_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_gamma5_set_odd_to_zero( vector_PRECISION eta, vector_PRECISION phi, level_struct *l, struct Thread *threading ); - void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, - int orthogonal, level_struct *l, Thread *threading ); - - void gram_schmidt_on_aggregates_PRECISION( vector_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); + // z := x + alpha*y + void vector_PRECISION_saxpy( vector_PRECISION z, vector_PRECISION x, vector_PRECISION y, complex_PRECISION alpha, + int start, int end, level_struct *l, struct Thread *threading ); + void vector_PRECISION_multi_saxpy( vector_PRECISION z, vector_PRECISION *V, complex_PRECISION *alpha, int sign, + int count, int start, int end, level_struct *l, struct Thread *threading ); + // z := x + void vector_PRECISION_copy( vector_PRECISION z, vector_PRECISION x, int start, int end, level_struct *l ); - void gram_schmidt_PRECISION( vector_PRECISION *V, complex_PRECISION *buffer, const int start, const int n, level_struct *l, struct Thread *threading ); - void setup_gram_schmidt_PRECISION( vector_PRECISION *V, vector_PRECISION g5v, - complex_PRECISION *buffer, 
const int n, level_struct *l, - struct Thread *threading ); - void spinwise_PRECISION_skalarmultiply( vector_PRECISION eta1, vector_PRECISION eta2, - vector_PRECISION phi, complex_PRECISION alpha, int start, int end, level_struct *l ); + void vector_PRECISION_projection( vector_PRECISION z, vector_PRECISION v, int k, vector_PRECISION *W, complex_PRECISION *diag, int orthogonal, level_struct *l, struct Thread *threading ); + void set_boundary_PRECISION( vector_PRECISION phi, complex_PRECISION alpha, level_struct *l, struct Thread *threading ); #endif diff --git a/src/linsolve.c b/src/linsolve.c index 720dc4c..411bc0c 100644 --- a/src/linsolve.c +++ b/src/linsolve.c @@ -47,7 +47,6 @@ void fgmres_MP_struct_alloc( int m, int n, int vl, double tol, const int prec_ki p->dp.print = g.vt.evaluation?0:1; p->sp.print = g.vt.evaluation?0:1; p->dp.initial_guess_zero = 1; p->sp.initial_guess_zero = 1; - p->dp.shift = 0; p->sp.shift = 0; p->dp.v_start = 0; p->sp.v_start = 0; p->dp.v_end = l->inner_vector_size; p->sp.v_end = l->inner_vector_size; @@ -60,6 +59,10 @@ void fgmres_MP_struct_alloc( int m, int n, int vl, double tol, const int prec_ki g.p.eval_operator = d_plus_clover_double; } +#ifdef HAVE_TM1p1 + vl*=2; +#endif + // double precision part total = 0; total += (m+1)*m; // Hessenberg matrix @@ -200,7 +203,12 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) if( ol == 0) { - norm_r0 = creal(gamma0); + if (l->depth == 0 && !p->dp.initial_guess_zero) { + norm_r0 = global_norm_double( p->dp.b, start, end, l, threading ); + printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + } else { + norm_r0 = creal(gamma0); + } } #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) else { @@ -220,12 +228,11 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { // inner loop in single precision for( il=0; ildp.restart_length && finish==0; il++) { j = il; iter++; - arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, - p->sp.shift, &(p->sp), l, threading ); + arnoldi_step_MP( p->sp.V, p->sp.Z, p->sp.w, p->dp.H, p->dp.y, j, p->sp.preconditioner, &(p->sp), l, threading ); if ( cabs( p->dp.H[j][j+1] ) > 1E-15 ) { qr_update_double( p->dp.H, p->dp.s, p->dp.c, p->dp.gamma, j, l, threading ); - gamma_jp1 = cabs( p->dp.gamma[j+1] ); + gamma_jp1 = cabs( p->dp.gamma[j+1] ); if ( iter%10 == 0 || p->sp.preconditioner != NULL || l->depth > 0 ) { #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -316,8 +323,7 @@ int fgmres_MP( gmres_MP_struct *p, level_struct *l, struct Thread *threading ) { void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, complex_double **H, complex_double* buffer, int j, void (*prec)(), - complex_float shift, gmres_float_struct *p, level_struct *l, - struct Thread *threading ) { + gmres_float_struct *p, level_struct *l, struct Thread *threading ) { SYNC_MASTER_TO_ALL(threading) SYNC_CORES(threading) @@ -332,7 +338,6 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, if ( prec != NULL ) { if ( p->kind == _LEFT ) { apply_operator_float( Z[0], V[j], p, l, threading ); - if ( shift ) vector_float_saxpy( Z[0], Z[0], V[j], shift, start, end, l ); prec( w, NULL, Z[0], _NO_RES, l, threading ); } else { if ( g.mixed_precision == 2 && (g.method >= 1 && g.method <= 2 ) ) { @@ -342,11 +347,9 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, prec( Z[j], NULL, V[j], _NO_RES, l, threading ); 
apply_operator_float( w, Z[j], p, l, threading ); // w = D*Z[j] } - if ( shift ) vector_float_saxpy( w, w, Z[j], shift, start, end, l ); } } else { apply_operator_float( w, V[j], p, l, threading ); // w = D*V[j] - if ( shift ) vector_float_saxpy( w, w, V[j], shift, start, end, l ); } complex_double tmp[j+1]; @@ -370,7 +373,7 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, complex_float alpha[j+1]; for( i=0; i<=j; i++ ) alpha[i] = (complex_float) -H[j][i]; - vector_float_multi_saxpy( w, V, alpha, 1, j+1, start, end, l ); + vector_float_multi_saxpy( w, V, alpha, 1, j+1, p->v_start, p->v_end, l, threading ); complex_double tmp2 = global_norm_MP( w, p->v_start, p->v_end, l, threading ); START_MASTER(threading) @@ -390,11 +393,6 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, int i, k; // start and end indices for vector functions depending on thread - int start; - int end; - // compute start and end indices for core - // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads - compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); START_MASTER(threading) @@ -415,12 +413,12 @@ void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, SYNC_MASTER_TO_ALL(threading) // x = V*y - vector_float_scale( x, V[0], (complex_float) y[0], start, end, l ); + vector_float_scale( x, V[0], (complex_float) y[0], p->v_start, p->v_end, l, threading ); complex_float alpha[j]; for ( i=1; i<=j; i++ ) alpha[i-1] = (complex_float) y[i]; - vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, start, end, l ); + vector_float_multi_saxpy( x, &(V[1]), alpha, 1, j, p->v_start, p->v_end, l, threading ); } diff --git a/src/linsolve.h b/src/linsolve.h index 55ed9fd..86dd54f 100644 --- a/src/linsolve.h +++ b/src/linsolve.h @@ -29,8 +29,7 @@ void arnoldi_step_MP( vector_float *V, vector_float *Z, vector_float w, complex_double **H, complex_double* buffer, int j, void (*prec)(), - complex_float shift, gmres_float_struct *p, level_struct *l, - struct Thread *threading ); + gmres_float_struct *p, level_struct *l, struct Thread *threading ); void compute_solution_MP( vector_float x, vector_float *V, complex_double *y, complex_double *gamma, complex_double **H, int j, diff --git a/src/linsolve_generic.c b/src/linsolve_generic.c index 365bdf2..ae8f167 100644 --- a/src/linsolve_generic.c +++ b/src/linsolve_generic.c @@ -39,7 +39,6 @@ void fgmres_PRECISION_struct_init( gmres_PRECISION_struct *p ) { p->gamma = NULL; p->c = NULL; p->s = NULL; - p->shift = 0; p->preconditioner = NULL; p->eval_operator = NULL; } @@ -71,6 +70,10 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->eval_operator = eval_op; p->tol = tol; p->kind = prec_kind; + +#ifdef HAVE_TM1p1 + vl*=2; +#endif if(m > 0) { total += (m+1)*m; // Hessenberg matrix @@ -147,7 +150,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->timing = 1; p->print = g.vt.evaluation?0:1; p->initial_guess_zero = 1; - p->shift = 0; p->v_start = 0; p->v_end = l->inner_vector_size; p->op = &(g.op_PRECISION); @@ -156,7 +158,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->timing = 0; p->print = 0; p->initial_guess_zero = 1; - p->shift = 0; p->v_start = 0; p->v_end = l->inner_vector_size; p->op = &(l->s_PRECISION.op); @@ -165,7 +166,6 @@ void fgmres_PRECISION_struct_alloc( int m, int n, int vl, PRECISION tol, const i p->print = 0; p->initial_guess_zero = 
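compute_solution_MP (and compute_solution_PRECISION further below) assemble the Krylov update x = V*y, respectively x += V*y after a restart, from the stored basis vectors and the small least-squares solution y of the Hessenberg system; the hunks above merely route the scale/saxpy calls through the threaded [v_start,v_end) interface. A plain, unthreaded sketch with illustrative names:

#include <complex.h>

/* x = V*y (accumulate==0) or x += V*y (accumulate!=0), where V
 * holds j+1 basis vectors of length n and y the least-squares
 * coefficients from the Hessenberg matrix.                      */
static void form_solution( double complex *x, double complex **V,
                           const double complex *y, int j, int n,
                           int accumulate ) {
  int i0 = 0;
  if ( !accumulate ) {
    for ( int k=0; k<n; k++ )
      x[k] = y[0] * V[0][k];                 /* x = y[0]*V[0] */
    i0 = 1;
  }
  for ( int i=i0; i<=j; i++ )
    for ( int k=0; k<n; k++ )
      x[k] += y[i] * V[i][k];                /* x += y[i]*V[i] */
}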
1; p->layout = -1; - p->shift = 0; p->v_start = 0; p->v_end = l->inner_vector_size; if ( g.odd_even ) @@ -232,7 +232,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread complex_PRECISION beta = 0; - double norm_r0=1, gamma_jp1=1, t0=0, t1=0; + PRECISION norm_r0=1, gamma_jp1=1, t0=0, t1=0; START_LOCKED_MASTER(threading) if ( l->depth==0 && ( p->timing || p->print ) ) prof_init( l ); @@ -258,7 +258,6 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread res = _RES; if ( p->kind == _LEFT && p->preconditioner ) { apply_operator_PRECISION( p->Z[0], p->x, p, l, threading ); - if ( p->shift ) vector_PRECISION_saxpy( p->Z[0], p->Z[0], p->x, p->shift, start, end, l ); if ( g.method == 5 ) { START_LOCKED_MASTER(threading) g.bicgstab_tol = (!g.mixed_precision)?p->tol:MAX( 1E-3, (p->tol/(gamma_jp1/norm_r0))*5E-1 ); @@ -270,20 +269,25 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread } vector_PRECISION_minus( p->r, p->b, p->w, start, end, l ); // compute r = b - w } - gamma0 = (complex_PRECISION) global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) + gamma0 = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, threading ); // gamma_0 = norm(r) START_MASTER(threading) p->gamma[0] = gamma0; - END_MASTER(threading) - SYNC_MASTER_TO_ALL(threading) + END_MASTER(threading); + SYNC_MASTER_TO_ALL(threading); - if( ol == 0) { - norm_r0 = creal(p->gamma[0]); + if ( ol == 0 ) { + if (l->depth == 0 && !p->initial_guess_zero) { + norm_r0 = global_norm_PRECISION( p->b, p->v_start, p->v_end, l, threading ); + printf0("| initial guess relative residual: %le |\n", creal(gamma0)/norm_r0); + } else { + norm_r0 = creal(p->gamma[0]); + } } - + vector_PRECISION_real_scale( p->V[0], p->r, 1/p->gamma[0], start, end, l ); // v_0 = r / gamma_0 #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p->shift, p, l, threading ); + arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, 0, p->preconditioner, p, l, threading ); } #endif @@ -298,18 +302,18 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread // one step of Arnoldi #if defined(SINGLE_ALLREDUCE_ARNOLDI) && defined(PIPELINED_ARNOLDI) if ( l->level == 0 && l->depth > 0 ) { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p->shift, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j+1, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+2, j+1 ); break; } } else { - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p->shift, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } } #else - if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p->shift, p, l, threading ) ) { + if ( !arnoldi_step_PRECISION( p->V, p->Z, p->w, p->H, p->y, j, p->preconditioner, p, l, threading ) ) { printf0("| -------------- iteration %d, restart due to H(%d,%d) < 0 |\n", iter, j+1, j ); break; } @@ -330,7 +334,7 @@ int fgmres_PRECISION( gmres_PRECISION_struct *p, level_struct *l, struct Thread if( gamma_jp1/norm_r0 < p->tol || 
gamma_jp1/norm_r0 > 1E+5 ) { // if satisfied ... stop finish = 1; START_MASTER(threading) - if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); + if ( gamma_jp1/norm_r0 > 1E+5 ) printf0("Divergence of fgmres_PRECISION, iter = %d, level=%d\n", iter, l->level ); END_MASTER(threading) } } else { @@ -423,7 +427,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr vector_PRECISION x, b, r, r_tilde, p, pp, v, s, t; // Krylov subspace size: 5 complex_PRECISION alpha=1, beta=1, rho=1, rho_old=1, omega=1; int iter=0, maxiter; - double tol, b_norm, r_norm, s_norm; + PRECISION tol, b_norm, r_norm, s_norm; // start and end indices for vector functions depending on thread int start; int end; @@ -437,11 +441,12 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr vector_PRECISION_copy( r, b, start, end, l ); vector_PRECISION_copy( r_tilde, b, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - vector_PRECISION_define( v, 0, start, end, l ); - vector_PRECISION_define( s, 0, start, end, l ); - vector_PRECISION_define( t, 0, start, end, l ); + vector_PRECISION_define_zero( x, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_define_zero( v, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_define_zero( s, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_define_zero( t, ps->v_start, ps->v_end, l, threading ); b_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); + r_norm = b_norm; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) @@ -453,6 +458,7 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr rho_old = rho; rho = global_inner_product_PRECISION( r_tilde, r, ps->v_start, ps->v_end, l, threading ); + if ( rho == 0 ) { START_MASTER(threading) printf0("rho = 0: BiCGstab did not converge.\n"); @@ -464,27 +470,29 @@ void bicgstab_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thr vector_PRECISION_copy( p, r, start, end, l ); } else { beta = (rho/rho_old)*(alpha/omega); - vector_PRECISION_saxpy( pp, p, v, -omega, start, end, l ); - vector_PRECISION_saxpy( p, r, pp, beta, start, end, l ); + vector_PRECISION_saxpy( pp, p, v, -omega, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( p, r, pp, beta, ps->v_start, ps->v_end, l, threading ); } apply_operator_PRECISION( v, p, ps, l, threading ); alpha = rho / global_inner_product_PRECISION( r_tilde, v, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( s, r, v, -alpha, start, end, l ); + vector_PRECISION_saxpy( s, r, v, -alpha, ps->v_start, ps->v_end, l, threading ); s_norm = global_norm_PRECISION( s, ps->v_start, ps->v_end, l, threading ); - + if ( s_norm/b_norm < tol ) { - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); break; } apply_operator_PRECISION( t, s, ps, l, threading ); omega = global_inner_product_PRECISION( t, s, ps->v_start, ps->v_end, l, threading ) / global_inner_product_PRECISION( t, t, ps->v_start, ps->v_end, l, threading ); - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( x, x, s, omega, start, end, l ); - vector_PRECISION_saxpy( r, s, t, -omega, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( x, x, s, omega, ps->v_start, ps->v_end, l, threading ); + 
vector_PRECISION_saxpy( r, s, t, -omega, ps->v_start, ps->v_end, l, threading ); + r_norm = global_norm_PRECISION( r, ps->v_start, ps->v_end, l, threading ); + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) START_MASTER(threading) if ( iter % 100 == 0 ) printf0("| biCGstab relres: %12.6le, iterations: %-8d |\n", r_norm/b_norm, iter ); @@ -510,7 +518,7 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * vector_PRECISION r_old, r_new, r_true, p, pp, Dp, x, b; complex_PRECISION alpha, beta=0, gamma; int maxiter, iter=0; - double tol, r0_norm, r_norm, prod_rr_old, t0=0, t1=0; + PRECISION tol, r0_norm, r_norm, prod_rr_old, t0=0, t1=0; // start and end indices for vector functions depending on thread int start; int end; @@ -529,14 +537,16 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(ps->v_start, ps->v_end, &start, &end, l, threading); - vector_PRECISION_define( x, 0, start, end, l ); + vector_PRECISION_define_zero( x, ps->v_start, ps->v_end, l, threading ); apply_operator_PRECISION( Dp, x, ps, l, threading ); vector_PRECISION_minus( pp, b, Dp, start, end, l ); apply_operator_dagger_PRECISION( r_old, pp, ps, l, threading ); vector_PRECISION_copy( p, r_old, start, end, l ); - r0_norm = creal(global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading )); - prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); + r0_norm = global_norm_PRECISION( r_old, ps->v_start, ps->v_end, l, threading ); + // prod_rr_old = global_inner_product_PRECISION( r_old, r_old, ps->v_start, ps->v_end, l, threading ); + prod_rr_old = r0_norm*r0_norm; + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -552,13 +562,13 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, ps->v_start, ps->v_end, l, threading ); gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); + vector_PRECISION_saxpy( p, r_new, p, beta, ps->v_start, ps->v_end, l, threading ); vector_PRECISION_copy( r_old, r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -570,10 +580,11 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * #endif } - r0_norm = creal(global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading )); + r0_norm = global_norm_PRECISION( b, ps->v_start, ps->v_end, l, threading ); apply_operator_PRECISION( Dp, x, ps, l, threading ); vector_PRECISION_minus( r_true, b, Dp, start, end, l ); - r_norm = creal(global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading )); + r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); + #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) if ( ps->print ) { START_MASTER(threading) @@ -592,16 +603,16 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, 
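bicgstab_PRECISION above follows the standard BiCGstab recursion (with zero initial guess, so r = b at the start). A compact single-process sketch without threading, profiling or the library's vector interface; apply_A, the scratch-vector layout and all other names are illustrative:

#include <complex.h>
#include <math.h>

static double complex dot( const double complex *x, const double complex *y, int n ) {
  double complex s = 0;
  for ( int i=0; i<n; i++ ) s += conj(x[i]) * y[i];
  return s;
}
static double nrm( const double complex *x, int n ) { return sqrt( creal( dot( x, x, n ) ) ); }

/* BiCGstab for A*x = b with x0 = 0; work provides 6 scratch
 * vectors of length n.  Returns the iteration count, -1 on
 * breakdown (rho == 0).                                        */
static int bicgstab_plain( void (*apply_A)( double complex*, const double complex*, int ),
                           double complex *x, const double complex *b,
                           double complex **work, int n, double tol, int maxiter ) {
  double complex *r=work[0], *rt=work[1], *p=work[2], *v=work[3], *s=work[4], *t=work[5];
  double complex rho=1, rho_old=1, alpha=1, omega=1, beta;
  for ( int i=0; i<n; i++ ) { x[i]=0; v[i]=0; p[i]=0; r[i]=b[i]; rt[i]=b[i]; }
  double b_norm = nrm( b, n ), r_norm = b_norm;
  int iter = 0;
  while ( r_norm/b_norm > tol && iter < maxiter ) {
    iter++;
    rho_old = rho;
    rho = dot( rt, r, n );
    if ( rho == 0 ) return -1;                      /* breakdown */
    if ( iter == 1 ) {
      for ( int i=0; i<n; i++ ) p[i] = r[i];
    } else {
      beta = (rho/rho_old)*(alpha/omega);
      for ( int i=0; i<n; i++ ) p[i] = r[i] + beta*( p[i] - omega*v[i] );
    }
    apply_A( v, p, n );
    alpha = rho / dot( rt, v, n );
    for ( int i=0; i<n; i++ ) s[i] = r[i] - alpha*v[i];
    if ( nrm( s, n )/b_norm < tol ) {               /* early exit */
      for ( int i=0; i<n; i++ ) x[i] += alpha*p[i];
      break;
    }
    apply_A( t, s, n );
    omega = dot( t, s, n ) / dot( t, t, n );
    for ( int i=0; i<n; i++ ) {
      x[i] += alpha*p[i] + omega*s[i];
      r[i]  = s[i] - omega*t[i];
    }
    r_norm = nrm( r, n );
  }
  return iter;
}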
struct Thread * gamma = global_inner_product_PRECISION( p, Dp, ps->v_start, ps->v_end, l, threading ); alpha = prod_rr_old / gamma; - vector_PRECISION_saxpy( x, x, p, alpha, start, end, l ); - vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, start, end, l ); + vector_PRECISION_saxpy( x, x, p, alpha, ps->v_start, ps->v_end, l, threading ); + vector_PRECISION_saxpy( r_new, r_old, Dp, -alpha, ps->v_start, ps->v_end, l, threading ); // residual update - vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, start, end, l ); - r_norm = creal(global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading )); + vector_PRECISION_saxpy( r_true, r_true, pp, -alpha, ps->v_start, ps->v_end, l, threading ); + r_norm = global_norm_PRECISION( r_true, ps->v_start, ps->v_end, l, threading ); gamma = global_inner_product_PRECISION( r_new, r_new, ps->v_start, ps->v_end, l, threading ); beta = gamma / prod_rr_old; - vector_PRECISION_saxpy( p, r_new, p, beta, start, end, l ); + vector_PRECISION_saxpy( p, r_new, p, beta, ps->v_start, ps->v_end, l, threading ); vector_PRECISION_copy( r_old, r_new, start, end, l ); prod_rr_old = gamma; #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -621,7 +632,8 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * END_MASTER(threading) apply_operator_PRECISION( Dp, x, ps, l, threading ); vector_PRECISION_minus( pp, b, Dp, start, end, l ); - beta = creal(global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading )); + + beta = global_norm_PRECISION( pp, ps->v_start, ps->v_end, l, threading ); START_MASTER(threading) if ( ps->timing ) printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta/r0_norm) ); printf0("| elapsed wall clock time: %-12g seconds |\n", t1-t0 ); @@ -648,7 +660,7 @@ void cgn_PRECISION( gmres_PRECISION_struct *ps, level_struct *l, struct Thread * int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), - complex_PRECISION shift, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { /********************************************************************************* * Extends the Arnoldi basis by one vector. @@ -663,7 +675,6 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE * against all previous ones. * - void (*prec)(): Function pointer to preconditioner (can be NULL if no * preconditioning is used). -* - complex_PRECISION shift: Denotes the dirac shift (can be 0). 
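cgn_PRECISION solves the normal equations D^dag D x = D^dag b with conjugate gradients (CGNR), which makes it usable as a reference solver for the non-Hermitian operator. A single-process sketch of one common CGNR formulation that keeps the true residual r = b - D x; the library routine additionally tracks the normal-equation residual and runs a second, residual-replacement loop. apply_D/apply_Ddag and the scratch layout are illustrative:

#include <complex.h>
#include <math.h>

/* CG on D^dag D x = D^dag b with x0 = 0; work holds 4 scratch
 * vectors of length n.                                          */
static int cgnr_plain( void (*apply_D)( double complex*, const double complex*, int ),
                       void (*apply_Ddag)( double complex*, const double complex*, int ),
                       double complex *x, const double complex *b,
                       double complex **work, int n, double tol, int maxiter ) {
  double complex *r=work[0], *z=work[1], *p=work[2], *w=work[3];
  for ( int i=0; i<n; i++ ) { x[i] = 0; r[i] = b[i]; }     /* r = b - D*0 */
  apply_Ddag( z, r, n );                                   /* z = D^dag r */
  for ( int i=0; i<n; i++ ) p[i] = z[i];
  double zz=0, b_norm=0, r_norm;
  for ( int i=0; i<n; i++ ) { zz += creal( conj(z[i])*z[i] ); b_norm += creal( conj(b[i])*b[i] ); }
  b_norm = sqrt( b_norm ); r_norm = b_norm;
  int iter = 0;
  while ( r_norm/b_norm > tol && iter < maxiter ) {
    iter++;
    apply_D( w, p, n );                                    /* w = D p */
    double ww = 0;
    for ( int i=0; i<n; i++ ) ww += creal( conj(w[i])*w[i] );
    double alpha = zz/ww;                                  /* <z,z>/<Dp,Dp> */
    for ( int i=0; i<n; i++ ) { x[i] += alpha*p[i]; r[i] -= alpha*w[i]; }
    apply_Ddag( z, r, n );                                 /* z = D^dag r */
    double zz_new=0, rn=0;
    for ( int i=0; i<n; i++ ) { zz_new += creal( conj(z[i])*z[i] ); rn += creal( conj(r[i])*r[i] ); }
    double beta = zz_new/zz;
    for ( int i=0; i<n; i++ ) p[i] = z[i] + beta*p[i];
    zz = zz_new; r_norm = sqrt( rn );
  }
  return iter;
}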
*********************************************************************************/ #ifdef SINGLE_ALLREDUCE_ARNOLDI #ifdef PIPELINED_ARNOLDI @@ -675,19 +686,18 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE int start, end, i; const complex_PRECISION sigma = 0; compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - + if ( j == 0 ) vector_PRECISION_copy( Z[0], V[0], start, end, l ); else vector_PRECISION_copy( V[j], Z[j], start, end, l ); - + complex_PRECISION tmp[j+1]; process_multi_inner_product_PRECISION( j+1, tmp, V, V[j], p->v_start, p->v_end, l, threading ); START_MASTER(threading) PROF_PRECISION_START( _ALLR ); - for( i=0; i<=j; i++ ) { + for( i=0; i<=j; i++ ) buffer[i] = tmp[i]; - } if ( g.num_processes > 1 ) { MPI_Iallreduce( buffer, H[MAX(0,j-1)], j+1, MPI_COMPLEX_PRECISION, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_PRECISION.level_comm, &req ); @@ -715,8 +725,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; iv_start, p->v_end, l, threading ); vector_PRECISION_real_scale( V[j], V[j], 1/H[MAX(0,j-1)][j], start, end, l ); START_MASTER(threading) @@ -727,14 +736,14 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) if ( j == 0 ) { - if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( Z[j+1], Z[j+1], Z[j], -sigma, p->v_start, p->v_end, l, threading ); } else { for( i=0; iv_start, p->v_end, l, threading ); } vector_PRECISION_real_scale( Z[j+1], Z[j+1], 1/H[MAX(0,j-1)][j], start, end, l ); - + } else { #endif SYNC_MASTER_TO_ALL(threading) @@ -742,13 +751,12 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE int start, end, i; const complex_PRECISION sigma = 0; compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - + if ( prec != NULL ) { if ( p->kind == _LEFT ) { apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - if ( shift ) vector_PRECISION_saxpy( Z[0], Z[0], V[j], shift, start, end, l ); prec( V[j+1], NULL, Z[0], _NO_RES, l, threading ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, p->v_start, p->v_end, l, threading ); } else { if ( l->level == 0 ) { prec( Z[j], NULL, V[j], _NO_RES, l, threading ); @@ -762,20 +770,19 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE apply_operator_PRECISION( V[j+1], Z[j], p, l, threading ); // w = D*Z[j] } } - if ( shift ) vector_PRECISION_saxpy( V[j+1], V[j+1], Z[j], shift, start, end, l ); - if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, p->v_start, p->v_end, l, threading ); + } } else { apply_operator_PRECISION( V[j+1], V[j], p, l, threading ); // w = D*V[j] - if ( shift-sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], shift-sigma, start, end, l ); + if ( sigma ) vector_PRECISION_saxpy( V[j+1], V[j+1], V[j], -sigma, p->v_start, p->v_end, l, threading ); } complex_PRECISION tmp[j+2]; process_multi_inner_product_PRECISION( j+2, tmp, V, V[j+1], p->v_start, p->v_end, l, threading ); START_MASTER(threading) - for( i=0; i<=j+1; i++ ) { + for( i=0; i<=j+1; i++ ) buffer[i] = tmp[i]; - } if ( g.num_processes > 1 ) { PROF_PRECISION_START( _ALLR ); @@ -797,8 +804,7 @@ int 
arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], start, end, l ); - + vector_PRECISION_saxpy( V[j+1], V[j+1], V[i], -H[j][i], p->v_start, p->v_end, l, threading ); vector_PRECISION_real_scale( V[j+1], V[j+1], 1/H[j][j+1], start, end, l ); START_LOCKED_MASTER(threading) H[j][j] += sigma; @@ -815,11 +821,10 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE // compute start and end indices for core // this puts zero for all other hyperthreads, so we can call functions below with all hyperthreads compute_core_start_end(p->v_start, p->v_end, &start, &end, l, threading); - + if ( prec != NULL ) { if ( p->kind == _LEFT ) { apply_operator_PRECISION( Z[0], V[j], p, l, threading ); - if ( shift ) vector_PRECISION_saxpy( Z[0], Z[0], V[j], shift, start, end, l ); prec( w, NULL, Z[0], _NO_RES, l, threading ); } else { if ( l->level == 0 ) { @@ -832,12 +837,10 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE prec( Z[j], NULL, V[j], _NO_RES, l, threading ); apply_operator_PRECISION( w, Z[j], p, l, threading ); // w = D*Z[j] } - if ( shift ) vector_PRECISION_saxpy( w, w, Z[j], shift, start, end, l ); } } } else { apply_operator_PRECISION( w, V[j], p, l, threading ); // w = D*V[j] - if ( shift ) vector_PRECISION_saxpy( w, w, V[j], shift, start, end, l ); } // orthogonalization @@ -857,8 +860,7 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -H[j][i], start, end, l ); - + vector_PRECISION_saxpy( w, w, V[i], -H[j][i], p->v_start, p->v_end, l, threading ); #ifdef REORTH // re-orthogonalization process_multi_inner_product_PRECISION( j+1, tmp, V, w, p->v_start, p->v_end, l, threading ); @@ -873,15 +875,15 @@ int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRE for( i=0; i<=j; i++ ) H[j][i] += tmp[i]; - + END_MASTER(threading) SYNC_MASTER_TO_ALL(threading) for( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( w, w, V[i], -tmp[i], start, end, l ); + vector_PRECISION_saxpy( w, w, V[i], -tmp[i], p->v_start, p->v_end, l, threading ); #endif // normalization - complex_PRECISION tmp2 = global_norm_PRECISION( w, p->v_start, p->v_end, l, threading ); + PRECISION tmp2 = global_norm_PRECISION( w, p->v_start, p->v_end, l, threading ); START_MASTER(threading) H[j][j+1] = tmp2; END_MASTER(threading) @@ -972,12 +974,14 @@ void compute_solution_PRECISION( vector_PRECISION x, vector_PRECISION *V, comple // x = x + V*y if ( ol ) { - for ( i=0; i<=j; i++ ) - vector_PRECISION_saxpy( x, x, V[i], y[i], start, end, l ); + for ( i=0; i<=j; i++ ) { + vector_PRECISION_saxpy( x, x, V[i], y[i], p->v_start, p->v_end, l, threading ); + } } else { - vector_PRECISION_scale( x, V[0], y[0], start, end, l ); - for ( i=1; i<=j; i++ ) - vector_PRECISION_saxpy( x, x, V[i], y[i], start, end, l ); + vector_PRECISION_scale( x, V[0], y[0], p->v_start, p->v_end, l, threading ); + for ( i=1; i<=j; i++ ) { + vector_PRECISION_saxpy( x, x, V[i], y[i], p->v_start, p->v_end, l, threading ); + } } } @@ -998,17 +1002,17 @@ void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_ START_UNTHREADED_FUNCTION(threading) - int i, end = (g.odd_even&&l->depth==0)?start+12*s->num_block_even_sites:start+s->block_vector_size, - n = l->block_iter; + int i, nv = 
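The heart of arnoldi_step_PRECISION is one Arnoldi extension with classical Gram-Schmidt: apply the (possibly preconditioned) operator to the last basis vector, orthogonalize against all previous ones, and store the coefficients in column j of the Hessenberg matrix, with H[j][i] = <V[i], w> and H[j][j+1] = ||w|| as in the library's storage convention. A single-process sketch without preconditioning, re-orthogonalization (REORTH) or the single-Allreduce variants; names are illustrative:

#include <complex.h>
#include <math.h>

/* Extend V[0..j] by V[j+1]; returns 0 on breakdown (||w|| = 0). */
static int arnoldi_step_plain( void (*apply_A)( double complex*, const double complex*, int ),
                               double complex **V, double complex **H,
                               double complex *w, int j, int n ) {
  apply_A( w, V[j], n );                          /* w = A v_j */
  for ( int i=0; i<=j; i++ ) {                    /* H[j][i] = <v_i, w> */
    double complex h = 0;
    for ( int k=0; k<n; k++ ) h += conj(V[i][k]) * w[k];
    H[j][i] = h;
  }
  for ( int i=0; i<=j; i++ )                      /* w -= sum_i H[j][i]*v_i */
    for ( int k=0; k<n; k++ ) w[k] -= H[j][i] * V[i][k];
  double norm = 0;
  for ( int k=0; k<n; k++ ) norm += creal( conj(w[k]) * w[k] );
  norm = sqrt( norm );
  H[j][j+1] = norm;
  if ( norm < 1E-15 ) return 0;                   /* breakdown */
  for ( int k=0; k<n; k++ ) V[j+1][k] = w[k] / norm;
  return 1;
}

In the library the j+1 inner products are gathered by process_multi_inner_product_PRECISION and combined in a single MPI_Allreduce, which is why the orthogonalization is classical rather than modified Gram-Schmidt.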
l->num_lattice_site_var, n = l->block_iter, + end = (g.odd_even&&l->depth==0)?(start+nv*s->num_block_even_sites):(start+s->block_vector_size); vector_PRECISION Dr = s->local_minres_buffer[0]; vector_PRECISION r = s->local_minres_buffer[1]; vector_PRECISION lphi = s->local_minres_buffer[2]; complex_PRECISION alpha; void (*block_op)() = (l->depth==0)?(g.odd_even?apply_block_schur_complement_PRECISION:block_d_plus_clover_PRECISION) :coarse_block_operator_PRECISION; - + vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( lphi, 0, start, end, l ); + vector_PRECISION_define_zero( lphi, start, end, l, no_threading ); for ( i=0; i/ alpha = local_xy_over_xx_PRECISION( Dr, r, start, end, l ); // phi += alpha * r - vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l ); + vector_PRECISION_saxpy( lphi, lphi, r, alpha, start, end, l, no_threading ); // r -= alpha * Dr - vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l ); + vector_PRECISION_saxpy( r, r, Dr, -alpha, start, end, l, no_threading ); } if ( latest_iter != NULL ) vector_PRECISION_copy( latest_iter, lphi, start, end, l ); @@ -1038,7 +1042,7 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { int i, j=-1, finish=0, iter=0, il, ol; complex_PRECISION beta = 0, alpha; - double norm_r0=0, t0=0, t1=0; + PRECISION r0_norm=0, t0=0, t1=0; if ( p->timing || p->print ) t0 = MPI_Wtime(); #if defined(TRACK_RES) && !defined(WILSON_BENCHMARK) @@ -1048,13 +1052,14 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { if( ol == 0 && p->initial_guess_zero ) { vector_PRECISION_copy( p->r, p->b, p->v_start, p->v_end, l ); + } else { apply_operator_PRECISION( p->w, p->x, p, l, no_threading ); // compute w = D*x vector_PRECISION_minus( p->r, p->b, p->w, p->v_start, p->v_end, l ); // compute r = b - w } if( ol == 0) { - norm_r0 = (complex_PRECISION) global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); + r0_norm = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ); } for( il=0; ilrestart_length && finish==0; il++ ) { @@ -1066,16 +1071,16 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { for( i=0; iZ[i], p->Z[j], p->v_start, p->v_end, l, no_threading ) / p->gamma[i]; - vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( p->V[j], p->V[j], p->V[i], -beta, p->v_start, p->v_end, l, no_threading ); + vector_PRECISION_saxpy( p->Z[j], p->Z[j], p->Z[i], -beta, p->v_start, p->v_end, l, no_threading ); } p->gamma[j] = global_inner_product_PRECISION( p->Z[j], p->Z[j], p->v_start, p->v_end, l, no_threading ); alpha = global_inner_product_PRECISION( p->Z[j], p->r, p->v_start, p->v_end, l, no_threading ) / p->gamma[j]; - vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l ); - vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l ); + vector_PRECISION_saxpy( p->x, p->x, p->V[j], alpha, p->v_start, p->v_end, l, no_threading ); + vector_PRECISION_saxpy( p->r, p->r, p->Z[j], -alpha, p->v_start, p->v_end, l, no_threading ); - alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / norm_r0; + alpha = global_norm_PRECISION( p->r, p->v_start, p->v_end, l, no_threading ) / r0_norm; if ( creal(alpha) < p->tol ) { finish = 1; break; @@ -1098,7 +1103,7 @@ void fgcr_PRECISION( gmres_PRECISION_struct *p, level_struct *l ) { #endif 
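local_minres_PRECISION is a one-dimensional minimal-residual relaxation on a Schwarz block: each sweep picks alpha = <Dr,r>/<Dr,Dr>, which minimizes ||r - alpha*Dr|| over alpha, then updates phi += alpha*r and r -= alpha*Dr. A plain sketch of those n = block_iter sweeps, assuming phi and r have been initialized by the caller (illustrative names, no blocking or odd-even logic):

#include <complex.h>

static void local_minres_plain( void (*apply_D)( double complex*, const double complex*, int ),
                                double complex *phi, double complex *r,
                                double complex *Dr, int n, int iters ) {
  for ( int it=0; it<iters; it++ ) {
    apply_D( Dr, r, n );
    double complex num = 0; double den = 0;
    for ( int k=0; k<n; k++ ) {
      num += conj(Dr[k]) * r[k];                 /* <Dr, r>  */
      den += creal( conj(Dr[k]) * Dr[k] );       /* <Dr, Dr> */
    }
    double complex alpha = num/den;
    for ( int k=0; k<n; k++ ) {
      phi[k] += alpha * r[k];                    /* phi += alpha*r */
      r[k]   -= alpha * Dr[k];                   /* r   -= alpha*Dr */
    }
  }
}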
printf0("+----------------------------------------------------------+\n"); printf0("| FGCR iterations: %-6d |\n", iter ); - printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/norm_r0 ); + printf0("| exact relative residual: ||r||/||b|| = %e |\n", creal(beta)/r0_norm ); printf0("| elapsed wall clock time: %-7lf seconds |\n", t1-t0 ); if ( g.coarse_time > 0 ) printf0("| coarse grid time: %-7lf seconds (%04.1lf%%) |\n", diff --git a/src/linsolve_generic.h b/src/linsolve_generic.h index 1a7f9cd..e28bb6b 100644 --- a/src/linsolve_generic.h +++ b/src/linsolve_generic.h @@ -36,8 +36,8 @@ void local_minres_PRECISION( vector_PRECISION phi, vector_PRECISION eta, vector_PRECISION latest_iter, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); int arnoldi_step_PRECISION( vector_PRECISION *V, vector_PRECISION *Z, vector_PRECISION w, - complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), - complex_PRECISION shift, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); + complex_PRECISION **H, complex_PRECISION* buffer, int j, void (*prec)(), + gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ); void qr_update_PRECISION( complex_PRECISION **H, complex_PRECISION *s, complex_PRECISION *c, complex_PRECISION *gamma, int j, level_struct *l, struct Thread *threading ); diff --git a/src/main.c b/src/main.c index 6723fa4..af5cf8c 100644 --- a/src/main.c +++ b/src/main.c @@ -75,7 +75,7 @@ int main( int argc, char **argv ) { commonthreaddata = (struct common_thread_data *)malloc(sizeof(struct common_thread_data)); init_common_thread_data(commonthreaddata); -#pragma omp parallel num_threads(g.num_openmp_processes) + THREADED(g.num_openmp_processes) { struct Thread threading; setup_threading(&threading, commonthreaddata, &l); diff --git a/src/main.h b/src/main.h index 4bdf103..0c9dc60 100644 --- a/src/main.h +++ b/src/main.h @@ -38,19 +38,21 @@ #define EPS_float 1E-6 #define EPS_double 1E-14 - #define HAVE_TM // flag for enable twisted mass + #define HAVE_TM // flag for enable twisted mass + #define HAVE_TM1p1 // flag for enable doublet for twisted mass + #undef INIT_ONE_PREC // flag undef for enabling additional features in the lib - #define FOR2( e ) { e e } - #define FOR3( e ) { e e e } - #define FOR4( e ) { e e e e } - #define FOR10( e ) { e e e e e e e e e e } - #define FOR20( e ) { e e e e e e e e e e e e e e e e e e e e } - #define FOR40( e ) { e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e e } + #define FOR2( e ) { e e } + #define FOR3( e ) { e e e } + #define FOR4( e ) { e e e e } #define FOR6( e ) { e e e e e e } + #define FOR10( e ) { e e e e e e e e e e } #define FOR12( e ) { e e e e e e e e e e e e } - #define FOR24( e ) { e e e e e e e e e e e e e e e e e e e e e e e e } + #define FOR20( e ) { FOR10( e ) FOR10( e ) } + #define FOR24( e ) { FOR12( e ) FOR12( e ) } #define FOR36( e ) { FOR12( e ) FOR12( e ) FOR12( e ) } + #define FOR40( e ) { FOR20( e ) FOR20( e ) } #define FOR42( e ) { FOR36( e ) FOR6( e ) } #define SQUARE( e ) (e)*(e) @@ -73,12 +75,14 @@ #define cimag_float cimagf #define csqrt_double csqrt #define csqrt_float csqrtf + #define sqrt_double sqrt + #define sqrt_float sqrtf #define cpow_double cpow #define cpow_float cpowf #define pow_double pow #define pow_float powf - #define abs_float fabs - #define abs_double abs + #define abs_double fabs + #define abs_float fabsf #ifdef SSE #define MALLOC( variable, kind, length ) do{ if ( variable != NULL ) { \ 
@@ -177,6 +181,7 @@ #define DEBUGOUTPUT( A, FORMAT ) #endif + #include "simd_vectorization_control.h" #include "vectorization_control.h" #include "threading.h" @@ -185,7 +190,7 @@ enum { _NO_DEFAULT_SET, _DEFAULT_SET }; enum { _NO_REORDERING, _REORDER }; enum { _ADD, _COPY }; - enum { _ORDINARY, _SCHWARZ }; + enum { _ORDINARY, _SCHWARZ, _ODDEVEN }; enum { _RES, _NO_RES }; enum { _STANDARD, _LIME }; //formats enum { _READ, _WRITE }; @@ -200,6 +205,7 @@ _SM1, _SM2, _SM3, _SM4, _SMALL1, _SMALL2, _NUM_PROF }; // _NUM_PROF has always to be the last constant! enum { _VTS = 20 }; enum { _TRCKD_VAL, _STP_TIME, _SLV_ITER, _SLV_TIME, _CRS_ITER, _CRS_TIME, _SLV_ERR, _CGNR_ERR, _NUM_OPTB }; + enum { _SSE, _AVX }; typedef struct block_struct { int start, color, no_comm, *bt; @@ -297,6 +303,7 @@ int *local_lattice; int *block_lattice; int num_eig_vect; + int num_parent_eig_vect; int coarsening[4]; int global_splitting[4]; int periodic_bc[4]; @@ -319,13 +326,7 @@ int schwarz_vector_size; int D_size; int clover_size; - // operator - double real_shift; - complex_double dirac_shift, even_shift, odd_shift; -#ifdef HAVE_TM int block_size; - complex_double tm_shift, tm_even_shift, tm_odd_shift; -#endif // buffer vectors vector_float vbuf_float[9], sbuf_float[2]; vector_double vbuf_double[9], sbuf_double[2]; @@ -337,10 +338,8 @@ // next coarser level struct level_struct *next_level; - } level_struct; - typedef struct global_struct { FILE *logfile; @@ -369,15 +368,21 @@ // profiling, analysis, output int coarse_iter_count, iter_count, iterator, print, conf_flag, setup_flag, in_setup; double coarse_time, prec_time, *output_table[8], cur_storage, max_storage, total_time, - plaq_hopp, plaq_clov, norm_res, plaq, setup_m0, solve_m0, bicgstab_tol, twisted_bc[4], - test; + plaq_hopp, plaq_clov, norm_res, plaq, bicgstab_tol, twisted_bc[4], test; + + double m0, setup_m0; #ifdef HAVE_TM // twisted mass parameters int downprop; - double tm_mu, setup_tm_mu, tm_mu_odd_shift, tm_mu_even_shift, *tm_mu_factor; + double mu, setup_mu, mu_odd_shift, mu_even_shift, *mu_factor; #endif - + +#ifdef HAVE_TM1p1 + int n_flavours; + double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift, *epsbar_factor; +#endif + // index functions for external usage int (*conf_index_fct)(), (*vector_index_fct)(); int *odd_even_table; @@ -462,29 +467,19 @@ // functions #include "clifford.h" +#ifdef SIMD +#include "simd_complex_float.h" +#include "simd_complex_double.h" +#include "simd_blas_float.h" +#include "simd_blas_double.h" +#endif #ifdef SSE #include "vectorization_dirac_float.h" #include "vectorization_dirac_double.h" -#include "blas_vectorized.h" -#include "sse_blas_vectorized.h" #include "sse_complex_float_intrinsic.h" #include "sse_complex_double_intrinsic.h" -#include "sse_coarse_operator_float.h" -#include "sse_coarse_operator_double.h" -#include "sse_linalg_float.h" -#include "sse_linalg_double.h" -#include "sse_interpolation_float.h" -#include "sse_interpolation_double.h" -#include "sse_schwarz_float.h" -#include "sse_schwarz_double.h" -#else -//no intrinsics -#include "interpolation_float.h" -#include "interpolation_double.h" #endif -#include "data_float.h" -#include "data_double.h" #include "data_layout.h" #include "io.h" #include "init.h" @@ -500,6 +495,10 @@ #include "linalg_double.h" #include "ghost_float.h" #include "ghost_double.h" +#include "gram_schmidt_float.h" +#include "gram_schmidt_double.h" +#include "interpolation_float.h" +#include "interpolation_double.h" #include "linsolve_float.h" #include "linsolve_double.h" #include 
"linsolve.h" @@ -521,6 +520,8 @@ #include "gathering_double.h" #include "coarse_operator_float.h" #include "coarse_operator_double.h" +#include "coarse_coupling_float.h" +#include "coarse_coupling_double.h" #include "coarse_oddeven_float.h" #include "coarse_oddeven_double.h" #include "var_table.h" diff --git a/src/main_post_def_generic.h b/src/main_post_def_generic.h index c138409..690ef6b 100644 --- a/src/main_post_def_generic.h +++ b/src/main_post_def_generic.h @@ -26,19 +26,53 @@ #include "dirac_PRECISION.h" #include "coarse_operator_PRECISION.h" - static inline void apply_operator_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { + p->eval_operator( output, input, p->op, l, threading ); - if ( p->shift ) { - int start, end; - compute_core_start_end_custom(p->v_start, p->v_end, &start, &end, l, threading, l->num_lattice_site_var ); - vector_PRECISION_saxpy( output, output, input, -p->shift, start, end, l ); - } + } static inline void apply_operator_dagger_PRECISION( vector_PRECISION output, vector_PRECISION input, gmres_PRECISION_struct *p, level_struct *l, struct Thread *threading ) { - if ( l->depth > 0 ) apply_coarse_operator_dagger_PRECISION( output, input, &(l->s_PRECISION.op), l, threading ); - else d_plus_clover_dagger_PRECISION( output, input, p->op, l, threading ); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + tau1_gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); + } else +#endif + { + gamma5_PRECISION( l->vbuf_PRECISION[6], input, l, threading ); +#ifdef HAVE_TM + //TODO: change_mu_sign_PRECISION( p->op, l, threading ); +#endif + } + + apply_operator_PRECISION( l->vbuf_PRECISION[7], l->vbuf_PRECISION[6], p, l, threading ); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + tau1_gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); + } else +#endif + { + gamma5_PRECISION( output, l->vbuf_PRECISION[7], l, threading ); +#ifdef HAVE_TM + //TODO: change_mu_sign_PRECISION( p->op, l, threading ); +#endif + } + + } + + static inline void test0_PRECISION( char* format, int depth, PRECISION test ) { + if ( g.my_rank == 0 && g.print >= 0 ) { + if ( test > EPS_PRECISION ) + printf("\x1b[31m"); + printf(format, depth, test); + if ( test > EPS_PRECISION ) + printf("\x1b[0m"); + if ( test > g.test ) + g.test = test; + fflush(0); + } } #endif diff --git a/src/main_pre_def_generic.h b/src/main_pre_def_generic.h index 21e5576..d61b1b5 100644 --- a/src/main_pre_def_generic.h +++ b/src/main_pre_def_generic.h @@ -45,10 +45,9 @@ } gathering_PRECISION_struct; typedef struct { - config_PRECISION D, clover, oe_clover; -#ifdef HAVE_TM - config_PRECISION odd_proj, tm_term; -#endif + double m0; + config_PRECISION D, clover, clover_oo_inv; + config_PRECISION odd_proj; //identity on the odd sites int oe_offset, self_coupling, num_even_sites, num_odd_sites, *index_table, *neighbor_table, *translation_table, table_dim[4], *backward_neighbor_table, @@ -58,12 +57,22 @@ OPERATOR_TYPE_PRECISION *D_vectorized; OPERATOR_TYPE_PRECISION *D_transformed_vectorized; OPERATOR_TYPE_PRECISION *clover_vectorized; - OPERATOR_TYPE_PRECISION *oe_clover_vectorized; + OPERATOR_TYPE_PRECISION *clover_oo_inv_vectorized; +#ifdef HAVE_TM + double mu, mu_odd_shift, mu_even_shift; + config_PRECISION tm_term; +#endif +#ifdef HAVE_TM1p1 + double epsbar, epsbar_ig5_odd_shift, epsbar_ig5_even_shift; + config_PRECISION epsbar_term, clover_doublet_oo_inv; + OPERATOR_TYPE_PRECISION *clover_doublet_vectorized; + 
OPERATOR_TYPE_PRECISION *clover_doublet_oo_inv_vectorized; +#endif } operator_PRECISION_struct; typedef struct { vector_PRECISION x, b, r, w, *V, *Z; - complex_PRECISION **H, *y, *gamma, *c, *s, shift; + complex_PRECISION **H, *y, *gamma, *c, *s; config_PRECISION *D, *clover; operator_PRECISION_struct *op; PRECISION tol; @@ -75,7 +84,7 @@ typedef struct { operator_PRECISION_struct op; - vector_PRECISION buf1, buf2, buf3, buf4, buf5, bbuf1, bbuf2, bbuf3, oe_bbuf[6]; + vector_PRECISION buf1, buf2, buf3, buf4, buf5; vector_PRECISION oe_buf[4]; vector_PRECISION local_minres_buffer[3]; int block_oe_offset, *index[4], dir_length[4], num_blocks, num_colors, diff --git a/src/oddeven_generic.c b/src/oddeven_generic.c index d8da7af..801995f 100644 --- a/src/oddeven_generic.c +++ b/src/oddeven_generic.c @@ -76,6 +76,7 @@ void selfcoupling_cholesky_decomposition_PRECISION( const config_PRECISION outpu } } +#ifdef HAVE_TM void selfcoupling_LU_decomposition_PRECISION( const config_PRECISION output, config_double input ) { /********************************************************************************* @@ -89,51 +90,105 @@ void selfcoupling_LU_decomposition_PRECISION( const config_PRECISION output, con *********************************************************************************/ register int i, j, k; - int n, offset[4] = {0,12,6,27}; + int n; config_double in_pt; - config_PRECISION out_pt = output; - complex_PRECISION L[6][6]; - + config_PRECISION out_pt; + + int offset[4] = {0,12,6,27}; + + // construct initial L = A for n=0, L = B for n=1, L row major for ( n=0; n<2; n++ ) { - // construct initial L = A for n=0, L = B for n=1, L row major - in_pt = input+offset[2*n]; + + out_pt = output + n*36; + + in_pt = input + offset[2*n]; for ( j=0; j<6; j++ ) { - L[j][j] = (complex_PRECISION) *in_pt; in_pt++; + out_pt[6*j+j] = (complex_PRECISION) *in_pt; in_pt++; } in_pt = input+offset[2*n+1]; for ( j=0; j<5; j++ ) { for ( i=j+1; i<6; i++ ) { - L[j][i] = (complex_PRECISION) *in_pt; - L[i][j] = (complex_PRECISION) conj_double(*in_pt); in_pt++; + out_pt[6*j+i] = (complex_PRECISION) *in_pt; + out_pt[6*i+j] = (complex_PRECISION) conj_double(*in_pt); in_pt++; } } // calculate LU - for ( k=0; k<6; k++ ) { + for ( k=0; k<5; k++ ) { for ( i=k+1; i<6; i++ ) { - L[i][k] = L[i][k]/L[k][k]; // acts on L - for ( j=k+1; j<6; j++ ) - L[i][j] = L[i][j]-L[i][k]*L[k][j]; // acts on both, L and U + out_pt[6*i+k] = out_pt[6*i+k]/out_pt[6*k+k]; // L: out(i,k) = out(i,k)/out(k,k) + for ( j=k+1; j<6; j++ ) + out_pt[6*i+j] = out_pt[6*i+j]-out_pt[6*i+k]*out_pt[6*k+j]; // U: out(i,j) = out(i,j)-out(i,k)*out(k,j) } } + } +} +#endif - // output = tril(L,1) without diag row major - for ( i=0; i<6; i++ ) { - for ( j=0; j=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - *out_pt = L[i][j]; out_pt++; + + in_pt = input+offset[4*n+3]; + for ( j=0; j<6; j++ ) { + for ( i=0; i<6; i++ ) { + out_pt[12*(j+6)+i] = out_pt[12*j+(i+6)] = _COMPLEX_PRECISION_ZERO; + } + } + for ( j=0; j<6; j++ ) { + out_pt[12*(j+6)+j] = out_pt[12*j+(j+6)] = (complex_PRECISION) *in_pt; in_pt++; + } + + // calculate LU + for ( k=0; k<11; k++ ) { + for ( i=k+1; i<12; i++ ) { + out_pt[12*i+k] = out_pt[12*i+k]/out_pt[12*k+k]; // L: out(i,k) = out(i,k)/out(k,k) + for ( j=k+1; j<12; j++ ) + out_pt[12*i+j] = out_pt[12*i+j]-out_pt[12*i+k]*out_pt[12*k+j]; // U: out(i,j) = out(i,j)-out(i,k)*out(k,j) } - *out_pt = L[i][i]; out_pt++; } - } } +#endif static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { @@ -172,7 +227,7 @@ 
static inline void LLH_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vecto } } -static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION L ) { +static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector_PRECISION b, config_PRECISION LU ) { /********************************************************************************* * Solves L*U*x = b for x, i.e., the clover coupling for a single lattice @@ -183,27 +238,52 @@ static inline void LU_perform_fwd_bwd_subs_PRECISION( vector_PRECISION x, vector * Note: U is given by u_{ii}=1, u_{ij}=l_{ji}* / l_{ii} *********************************************************************************/ - register int i, j; - int n; - - for ( n=0; n<2; n++ ) { - // forward substitution with L - for ( i=0; i<6; i++ ) { - x[i] = b[i]; - for ( j=0; j=0; i-- ) { + for ( j=i+1; j<12; j++ ) { + x[i] = x[i] - LU[i*12+j]*x[j]; + } + x[i] = x[i]/LU[i*(12+1)]; + } + x+=12; + b+=12; + LU+=12*12; } - // backward substitution with U - for ( i=5; i>=0; i-- ) { - for ( j=i+1; j<6; j++ ) { - x[i] = x[i] - *L * x[j]; L++; + else +#endif + for ( n=0; n<2; n++ ) { + // solve x = U^(-1) L^(-1) b + // forward substitution with L + for ( i=0; i<6; i++ ) { + x[i] = b[i]; + for ( j=0; j=0; i-- ) { + for ( j=i+1; j<6; j++ ) { + x[i] = x[i] - LU[i*6+j]*x[j]; + } + x[i] = x[i]/LU[i*(6+1)]; + } + x+=6; + b+=6; + LU+=6*6; } - x+=6; - b+=6; - } } @@ -252,32 +332,42 @@ static inline void LU_multiply_PRECISION( vector_PRECISION y, vector_PRECISION x * - config_PRECISION LU: LU decomposition *********************************************************************************/ - register int i, j; - int n; - complex_PRECISION z[6]; - - for ( n=0; n<2; n++ ) { - LU+=15; // moving to U - // z = U x - for ( i=5; i>=0; i-- ) { //row - z[i] = 0; - for ( j=i+1; j<6; j++ ) { //column - z[i] += *LU *x[j]; LU++; + register int i, j, n; + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) + for ( n=0; n<2; n++ ) { + for ( i=0; i<12; i++ ) { + y[i] = LU[i*(12+1)]*x[i]; + for ( j=i+1; j<12; j++ ) + y[i] += LU[i*12+j]*x[j]; } - z[i] += *LU *x[i]; LU++; - } - LU-=36;// moving to L - // y = L*z; - for ( i=0; i<6; i++ ) { // rows - y[i] = z[i]; - for ( j=0; j0; i-- ) + for ( j=0; j0; i-- ) + for ( j=0; jclover; - x += start; y += start; - if ( g.csw ) { + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) { + x += start; y += start; #ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); + PRECISION *sc_pt = op->clover_doublet_vectorized + (start/24)*288; PRECISION *x_pt = (PRECISION*)x; PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iepsbar_term+(start/24)*12; + if ( g.n_flavours == 2 && + ( op->epsbar != 0 || op->epsbar_ig5_odd_shift != 0 || op->epsbar_ig5_odd_shift != 0 ) ) + apply_doublet_coupling_PRECISION( x, y, epsbar_term, end-start ); +#else + config_PRECISION sc = op->clover_doublet_oo_inv + (start/24)*288; // diagonal blocks applied to the even sites - for ( int i=start; iclover_vectorized + (start/12)*144; + PRECISION *x_pt = (PRECISION*)x; + PRECISION *y_pt = (PRECISION*)y; + for ( int i=start; iclover + (start/12)*72; + // diagonal blocks applied to the even sites + for ( int i=start; iclover + (start/12)*42; + // diagonal blocks applied to the even sites + for ( int i=start; iclover + start; + for ( int i=start; inum_even_sites; - config_PRECISION sc = op->clover; - if ( g.csw ) { +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) { + int i, n1 = op->num_even_sites; + config_PRECISION sc = 
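selfcoupling_LU_decomposition_PRECISION factorizes each self-coupling block in place, row major and without pivoting (d = 6 per chirality, d = 12 for the HAVE_TM1p1 doublet), and LU_perform_fwd_bwd_subs_PRECISION then solves L U x = b by forward and backward substitution, with the unit-diagonal L stored below the diagonal and U on and above it. A generic d x d sketch of exactly this scheme (illustrative names):

#include <complex.h>

/* In-place LU without pivoting of a row-major d x d block A:
 * multipliers of L go below the diagonal, U on and above it.    */
static void lu_decompose( double complex *A, int d ) {
  for ( int k=0; k<d-1; k++ )
    for ( int i=k+1; i<d; i++ ) {
      A[i*d+k] /= A[k*d+k];                         /* L(i,k) */
      for ( int j=k+1; j<d; j++ )
        A[i*d+j] -= A[i*d+k] * A[k*d+j];            /* U update */
    }
}

/* Solve L U x = b: forward substitution with unit-diagonal L,
 * then backward substitution with U.                            */
static void lu_solve( const double complex *LU, double complex *x,
                      const double complex *b, int d ) {
  for ( int i=0; i<d; i++ ) {                       /* L y = b  */
    x[i] = b[i];
    for ( int j=0; j<i; j++ ) x[i] -= LU[i*d+j] * x[j];
  }
  for ( int i=d-1; i>=0; i-- ) {                    /* U x = y  */
    for ( int j=i+1; j<d; j++ ) x[i] -= LU[i*d+j] * x[j];
    x[i] /= LU[i*d+i];
  }
}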
op->clover_doublet_oo_inv; // diagonal blocks applied to the even sites for ( i=0; inum_even_sites; + config_PRECISION sc = op->clover; + if ( g.csw ) { + // diagonal blocks applied to the even sites + for ( i=0; inum_even_sites, n2 = op->num_odd_sites; - config_PRECISION sc = op->clover; - x += n1*12; y += n1*12; - // diagonal blocks applied to the odd sites - if ( g.csw ) { -#ifndef HAVE_TM - sc += n1*42; - for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + config_PRECISION sc = op->clover_doublet_oo_inv + n1*288; + x += n1*24; y += n1*24; + // diagonal blocks applied to the even sites for ( i=0; inum_even_sites, n2 = op->num_odd_sites; + config_PRECISION sc = op->clover; + x += n1*12; y += n1*12; + // diagonal blocks applied to the odd sites + if ( g.csw ) { +#ifndef HAVE_TM + sc += n1*42; + for ( i=0; iclover; - x += start; y += start; - // inverse diagonal blocks applied to the odd sites - if ( g.csw ) { +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2) { + x += start; y += start; + // inverse diagonal blocks applied to the odd sites #ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); + PRECISION *sc_pt = op->clover_doublet_oo_inv_vectorized + (start/24)*2*288; PRECISION *x_pt = (PRECISION*)x; PRECISION *y_pt = (PRECISION*)y; - for ( int i=start; iclover_doublet_oo_inv + (start/24)*288; + for ( int i=start; iclover; + x += start; y += start; + // inverse diagonal blocks applied to the odd sites + if ( g.csw ) { +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + PRECISION *sc_pt = op->clover_vectorized + 2*2*(3*start); + PRECISION *x_pt = (PRECISION*)x; + PRECISION *y_pt = (PRECISION*)y; + for ( int i=start; inum_inner_lattice_sites, oe_offset=0, mu, nu, - sc_size = 42, lu_dec_size = 42, bs, **bt = NULL, + sc_size = g.csw ? 
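The even/odd clover applications and the precomputed clover_oo_inv (and clover_doublet_oo_inv) blocks above are the ingredients of the odd-even reduction. Written out for orientation (the standard Schur-complement decomposition, not copied from the source), with the operator in site-ordered block form

    D = ( A_ee  D_eo )
        ( D_oe  A_oo ),

the solver works on the even-site Schur complement system

    ( A_ee - D_eo A_oo^{-1} D_oe ) x_e = b_e - D_eo A_oo^{-1} b_o,

and afterwards reconstructs the odd part as

    x_o = A_oo^{-1} ( b_o - D_oe x_e ).

The LU factors stored per odd site are what make every A_oo^{-1} application a cheap pair of triangular solves.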
42:12, lu_dec_size = 42, bs, **bt = NULL, *eot = NULL, *nt = NULL, *tt = NULL, t, z, y, x, le[4], N[4]; config_double sc_in = in->clover, nc_in = in->D; config_PRECISION Aee = NULL, Aoo = NULL; operator_PRECISION_struct *op = &(l->oe_op_PRECISION); + op->m0 = in->m0; + #ifdef HAVE_TM + op->mu = in->mu; + op->mu_even_shift = in->mu_even_shift; + op->mu_odd_shift = in->mu_odd_shift; + lu_dec_size = 72; config_double tm_term_in = in->tm_term; #endif @@ -491,51 +666,55 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { for ( x=0; xepsbar_term; + sc_in = in->clover; +#ifdef HAVE_TM + tm_term_in = in->tm_term; +#endif + op->epsbar = in->epsbar; + op->epsbar_ig5_even_shift = in->epsbar_ig5_even_shift; + op->epsbar_ig5_odd_shift = in->epsbar_ig5_odd_shift; + + // re-order clover term (i.e., self coupling) + MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, lu_doublet_dec_size*n ); + Aee = op->clover_doublet_oo_inv; + Aoo = op->clover_doublet_oo_inv + op->num_even_sites*lu_doublet_dec_size; +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + MALLOC_HUGEPAGES( op->clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36, 4*SIMD_LENGTH_PRECISION ); + MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, op->num_odd_sites*2*2*144, 4*SIMD_LENGTH_PRECISION ); + PRECISION *Aee_vectorized = op->clover_doublet_vectorized; + PRECISION *Aoo_vectorized = op->clover_doublet_vectorized + op->num_even_sites*288; + PRECISION *Aoo_inverse_vectorized = op->clover_doublet_oo_inv_vectorized; +#endif + for ( t=0; tD, complex_PRECISION, 36*n ); @@ -576,7 +867,7 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { k++; } -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float // D_vectorized just used in the float environment MALLOC_HUGEPAGES( op->D_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); MALLOC_HUGEPAGES( op->D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); for ( int i=0; inum_inner_lattice_sites; i++ ) { @@ -621,13 +912,21 @@ void oddeven_setup_PRECISION( operator_double_struct *in, level_struct *l ) { define_eo_bt( bt, eot, op->c.num_even_boundary_sites, op->c.num_odd_boundary_sites, op->c.num_boundary_sites, N, l ); j = (l->num_lattice_site_var/2)*l->num_lattice_sites; +#ifdef HAVE_TM1p1 + j *= 2; +#endif MALLOC( op->prnT, complex_PRECISION, j*8 ); op->prnZ = op->prnT + j; op->prnY = op->prnZ + j; op->prnX = op->prnY + j; op->prpT = op->prnX + j; op->prpZ = op->prpT + j; op->prpY = op->prpZ + j; op->prpX = op->prpY + j; MALLOC( op->buffer, complex_PRECISION*, 2 ); op->buffer[0] = NULL; +#ifdef HAVE_TM1p1 + MALLOC( op->buffer[0], complex_PRECISION, 4*l->vector_size ); + op->buffer[1] = op->buffer[0] + 2*l->vector_size; +#else MALLOC( op->buffer[0], complex_PRECISION, 2*l->vector_size ); op->buffer[1] = op->buffer[0] + l->vector_size; +#endif ghost_alloc_PRECISION( 0, &(op->c), l ); ghost_sendrecv_init_PRECISION( _COARSE_GLOBAL, &(op->c), l ) ; l->sp_PRECISION.v_end = op->num_even_sites*l->num_lattice_site_var; @@ -642,12 +941,16 @@ void oddeven_free_PRECISION( level_struct *l ) { lu_dec_size = 72; #endif -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float FREE_HUGEPAGES( l->oe_op_PRECISION.D_vectorized, PRECISION, 2*4*l->inner_vector_size ); FREE_HUGEPAGES( l->oe_op_PRECISION.D_transformed_vectorized, PRECISION, 2*4*l->inner_vector_size ); #endif #ifdef OPTIMIZED_SELF_COUPLING_PRECISION 
FREE_HUGEPAGES( l->oe_op_PRECISION.clover_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*36 ); +#ifdef HAVE_TM1p1 + FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_vectorized, PRECISION, l->num_inner_lattice_sites*2*4*36 ); + FREE_HUGEPAGES( l->oe_op_PRECISION.clover_doublet_oo_inv_vectorized, PRECISION, l->num_inner_lattice_sites*2*2*144 ); +#endif #endif ghost_free_PRECISION( &(l->oe_op_PRECISION.c), l ); @@ -671,9 +974,18 @@ void oddeven_free_PRECISION( level_struct *l ) { l->oe_op_PRECISION.c.boundary_table[2*mu+1] = NULL; } +#ifdef HAVE_TM1p1 + FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 4*l->vector_size ); +#else FREE( l->oe_op_PRECISION.buffer[0], complex_PRECISION, 2*l->vector_size ); +#endif FREE( l->oe_op_PRECISION.buffer, complex_PRECISION*, 2 ); +#ifdef HAVE_TM1p1 + FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, 2*(l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); + FREE( l->oe_op_PRECISION.clover_doublet_oo_inv, complex_PRECISION, 288*n ); +#else FREE( l->oe_op_PRECISION.prnT, complex_PRECISION, (l->num_lattice_site_var/2)*l->num_lattice_sites*8 ); +#endif } @@ -772,19 +1084,20 @@ void block_to_oddeven_PRECISION( vector_PRECISION out, vector_PRECISION in, leve SYNC_CORES(threading) } -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, const int amount, level_struct *l, struct Thread *threading ) { - - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites, &start_even, &end_even, l, threading, 1 ); - compute_core_start_end_custom(op->num_even_sites, op->num_even_sites+op->num_odd_sites, &start_odd, &end_odd, l, threading, 1 ); - int i, n = l->num_inner_lattice_sites, *neighbor = op->neighbor_table, *nb_pt, - start=0, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - complex_PRECISION pbuf[6]; - vector_PRECISION phi_pt, eta_pt, end_pt; - config_PRECISION D_pt; + int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, + *neighbor = op->neighbor_table, start=0, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; + + SYNC_CORES(threading) + + if ( amount == _EVEN_SITES || amount == _ODD_SITES ) { + compute_core_start_end_custom(0, op->num_even_sites, &start_even, &end_even, l, threading, 1 ); + compute_core_start_end_custom(op->num_even_sites, op->num_even_sites+op->num_odd_sites, &start_odd, &end_odd, l, threading, 1 ); + } else { + compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &n, l, threading, 1 ); + } SYNC_CORES(threading) @@ -797,134 +1110,290 @@ void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operato minus_dir_param = _EVEN_SITES; plus_dir_param = _ODD_SITES; } - // project in negative directions - for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { - prp_T_PRECISION( op->prnT+i, phi_pt ); - prp_Z_PRECISION( op->prnZ+i, phi_pt ); - prp_Y_PRECISION( op->prnY+i, phi_pt ); - prp_X_PRECISION( op->prnX+i, phi_pt ); - } - // start communication in negative direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - // project plus dir and multiply with U dagger - for ( phi_pt=phi+12*start, 
end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpT+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpT+i+3, D_pt, pbuf+3 ); D_pt += 9; - // Z dir - i = 6*(*nb_pt); nb_pt++; - prn_Z_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpZ+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpZ+i+3, D_pt, pbuf+3 ); D_pt += 9; - // Y dir - i = 6*(*nb_pt); nb_pt++; - prn_Y_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpY+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpY+i+3, D_pt, pbuf+3 ); D_pt += 9; - // X dir - i = 6*(*nb_pt); nb_pt++; - prn_X_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpX+i, D_pt, pbuf ); - mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; - } - if ( amount == _EVEN_SITES ) { - start = start_even, n = end_even; - } else if ( amount == _ODD_SITES ) { - start = start_odd, n = end_odd; - } - // start communication in positive direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - // wait for communication in negative direction - ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - // multiply with U and lift up minus dir - for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnT+i+3 ); - pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Z dir - i = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnZ+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnZ+i+3 ); - pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Y dir - i = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnY+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnY+i+3 ); - pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; - // X dir - i = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnX+i ); - mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); - pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; - } - // wait for communication in positive direction - START_LOCKED_MASTER(threading) - ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - END_LOCKED_MASTER(threading) - // lift up plus dir - for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { - pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); - pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); - pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); - pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); - } - SYNC_CORES(threading) -} +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; + complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; +#else + int i, *nb_pt; + vector_PRECISION phi_pt, eta_pt, end_pt; + config_PRECISION D_pt; #endif - -void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, - level_struct *l, struct Thread *threading ) { - 
-/********************************************************************************* -* Applies the Schur complement to a vector. -*********************************************************************************/ - - // start and end indices for vector functions depending on thread - int start_even, end_even, start_odd, end_odd; - - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, 12 ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, 12 ); - - vector_PRECISION *tmp = op->buffer; - - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); - SYNC_CORES(threading) - PROF_PRECISION_START( _NC, threading ); - - PROF_PRECISION_START( _SC, threading ); - diag_ee_PRECISION( out, in, op, l, start_even, end_even ); - SYNC_CORES(threading) - PROF_PRECISION_STOP( _SC, 1, threading ); - hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); - PROF_PRECISION_STOP( _NC, 0, threading ); - PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); - SYNC_CORES(threading) - PROF_PRECISION_STOP( _SC, 0, threading ); +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + // project in negative directions +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprp_PRECISION( prn, phi, 24*start, 24*n ); +#else + complex_PRECISION pbuf[12]; + for ( i=12*start, phi_pt=phi+24*start; i<12*n; i+=12, phi_pt+=24 ) { + dprp_T_PRECISION( op->prnT+i, phi_pt ); + dprp_Z_PRECISION( op->prnZ+i, phi_pt ); + dprp_Y_PRECISION( op->prnY+i, phi_pt ); + dprp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dprn_su3_PRECISION( prp, phi, op, neighbor, 24*start, 24*n ); +#else + for ( phi_pt=phi+24*start, end_pt=phi+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpT+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpT+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpT+i+9, D_pt, pbuf+9 ); D_pt += 9; + // Z dir + i = 12*(*nb_pt); nb_pt++; + dprn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpZ+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpZ+i+9, D_pt, pbuf+9 ); D_pt += 9; + // Y dir + i = 12*(*nb_pt); nb_pt++; + dprn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpY+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpY+i+9, D_pt, pbuf+9 ); D_pt += 9; + // X dir + i = 12*(*nb_pt); nb_pt++; + dprn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); + mvmh_PRECISION( op->prpX+i+6, D_pt, pbuf+6 ); + mvmh_PRECISION( op->prpX+i+9, D_pt, pbuf+9 ); D_pt += 9; + } +#endif + if ( amount == _EVEN_SITES ) { + start = start_even, n = end_even; + } 
else if ( amount == _ODD_SITES ) { + start = start_odd, n = end_odd; + } + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_dpbp_PRECISION( eta, prn, op, neighbor, 24*start, 24*n ); +#else + for ( eta_pt=eta+24*start, end_pt=eta+24*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnT+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnT+i+9 ); + dpbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + i = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnZ+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnZ+i+9 ); + dpbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + i = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnY+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnY+i+9 ); + dpbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + i = 12*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); + mvm_PRECISION( pbuf+6, D_pt, op->prnX+i+6 ); + mvm_PRECISION( pbuf+9, D_pt, op->prnX+i+9 ); + dpbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + END_LOCKED_MASTER(threading) + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + dpbn_PRECISION( eta, prp, 24*start, 24*n ); +#else + for ( i=12*start, eta_pt=eta+24*start; i<12*n; i+=12, eta_pt+=24 ) { + dpbn_su3_T_PRECISION( op->prpT+i, eta_pt ); + dpbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + dpbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + dpbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif + } else { +#endif + // project in negative directions +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prp_PRECISION( prn, phi, 12*start, 12*n ); +#else + complex_PRECISION pbuf[6]; + for ( i=6*start, phi_pt=phi+12*start; i<6*n; i+=6, phi_pt+=12 ) { + prp_T_PRECISION( op->prnT+i, phi_pt ); + prp_Z_PRECISION( op->prnZ+i, phi_pt ); + prp_Y_PRECISION( op->prnY+i, phi_pt ); + prp_X_PRECISION( op->prnX+i, phi_pt ); + } +#endif + // start communication in negative direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), 
minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // project plus dir and multiply with U dagger +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); +#else + for ( phi_pt=phi+12*start, end_pt=phi+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; phi_ptprpT+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpT+i+3, D_pt, pbuf+3 ); D_pt += 9; + // Z dir + i = 6*(*nb_pt); nb_pt++; + prn_Z_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpZ+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpZ+i+3, D_pt, pbuf+3 ); D_pt += 9; + // Y dir + i = 6*(*nb_pt); nb_pt++; + prn_Y_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpY+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpY+i+3, D_pt, pbuf+3 ); D_pt += 9; + // X dir + i = 6*(*nb_pt); nb_pt++; + prn_X_PRECISION( pbuf, phi_pt ); + mvmh_PRECISION( op->prpX+i, D_pt, pbuf ); + mvmh_PRECISION( op->prpX+i+3, D_pt, pbuf+3 ); D_pt += 9; + } +#endif + if ( amount == _EVEN_SITES ) { + start = start_even, n = end_even; + } else if ( amount == _ODD_SITES ) { + start = start_odd, n = end_odd; + } + // start communication in positive direction + START_LOCKED_MASTER(threading) + ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + // wait for communication in negative direction + ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); + ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); + END_LOCKED_MASTER(threading) + // multiply with U and lift up minus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + su3_pbp_PRECISION( eta, prn, op, neighbor, 12*start, 12*n ); +#else + for ( eta_pt=eta+12*start, end_pt=eta+12*n, D_pt = op->D+36*start, nb_pt=neighbor+4*start; eta_ptprnT+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnT+i+3 ); + pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Z dir + i = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnZ+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnZ+i+3 ); + pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; + // Y dir + i = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnY+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnY+i+3 ); + pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; + // X dir + i = 6*(*nb_pt); nb_pt++; + mvm_PRECISION( pbuf, D_pt, op->prnX+i ); + mvm_PRECISION( pbuf+3, D_pt, op->prnX+i+3 ); + pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; + } +#endif + // wait for communication in positive direction + START_LOCKED_MASTER(threading) + ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); + ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); + END_LOCKED_MASTER(threading) + // lift up plus dir +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + pbn_PRECISION( eta, prp, 12*start, 12*n ); +#else + for ( i=6*start, eta_pt=eta+12*start; i<6*n; i+=6, eta_pt+=12 ) { + pbn_su3_T_PRECISION( op->prpT+i, eta_pt ); + 
pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); + pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); + pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); + } +#endif +#ifdef HAVE_TM1p1 + } +#endif + + SYNC_CORES(threading) +} + +void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, + level_struct *l, struct Thread *threading ) { + +/********************************************************************************* +* Applies the Schur complement to a vector. +*********************************************************************************/ + + // start and end indices for vector functions depending on thread + int start_even, end_even, start_odd, end_odd; + + compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); + + vector_PRECISION *tmp = op->buffer; + + SYNC_CORES(threading) + vector_PRECISION_define_zero( tmp[0], 0, l->inner_vector_size, l, threading ); + SYNC_CORES(threading) + PROF_PRECISION_START( _NC, threading ); + + PROF_PRECISION_START( _SC, threading ); + diag_ee_PRECISION( out, in, op, l, start_even, end_even ); + SYNC_CORES(threading) + PROF_PRECISION_STOP( _SC, 1, threading ); + hopping_term_PRECISION( tmp[0], in, op, _ODD_SITES, l, threading ); + PROF_PRECISION_STOP( _NC, 0, threading ); + + PROF_PRECISION_START( _SC, threading ); + diag_oo_inv_PRECISION( tmp[1], tmp[0], op, l, start_odd, end_odd ); + SYNC_CORES(threading) + PROF_PRECISION_STOP( _SC, 0, threading ); PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( tmp[0], tmp[1], op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); @@ -935,19 +1404,18 @@ void apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { // start and end indices for vector functions depending on thread - int start; - int end; - compute_core_start_end(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start, &end, l, threading); + int start=op->num_even_sites*l->num_lattice_site_var, end=l->inner_vector_size, thread_start, thread_end; + compute_core_start_end(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &thread_start, &thread_end, l, threading); vector_PRECISION tmp = op->buffer[0]; // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( tmp, p->b, op, l, thread_start, thread_end ); PROF_PRECISION_STOP( _SC, 0, threading ); - SYNC_CORES(threading) - vector_PRECISION_scale( tmp, tmp, -1, start, end, l ); - SYNC_CORES(threading) + SYNC_CORES(threading); + vector_PRECISION_scale( tmp, tmp, -1, op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l, threading ); + SYNC_CORES(threading); PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( p->b, tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); @@ -956,20 +1424,20 @@ void solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_stru fgmres_PRECISION( p, l, threading ); else if ( g.method == 5 ) bicgstab_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start, end ); + diag_oo_inv_PRECISION( p->x, p->b, op, l, thread_start, 
thread_end ); // even to odd - SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start, end, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp, start, end, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start, end ); + diag_oo_inv_PRECISION( p->b, tmp, op, l, thread_start, thread_end ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start, end, l ); + vector_PRECISION_minus( p->x, p->x, p->b, thread_start, thread_end, l ); SYNC_CORES(threading) } @@ -1024,14 +1492,13 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO // start and end indices for vector functions depending on thread int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, 12 ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, 12 ); + compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION *tmp = op->buffer; - SYNC_CORES(threading) - vector_PRECISION_define( tmp[0], 0, start_odd, end_odd, l ); - vector_PRECISION_define( tmp[0], 0, start_even, end_even, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp[0], 0, l->inner_vector_size, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); @@ -1059,70 +1526,77 @@ void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISIO void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - int start_even, end_even, start_odd, end_odd; - compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &start_even, &end_even, l, threading, 12 ); - compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &start_odd, &end_odd, l, threading, 12 ); + int start_even = 0, end_even = op->num_even_sites*l->num_lattice_site_var, + start_odd = end_even, end_odd = l->inner_vector_size; + int thread_start_even, thread_end_even, thread_start_odd, thread_end_odd; + compute_core_start_end_custom(0, op->num_even_sites*l->num_lattice_site_var, &thread_start_even, &thread_end_even, l, threading, l->num_lattice_site_var ); + compute_core_start_end_custom(op->num_even_sites*l->num_lattice_site_var, l->inner_vector_size, &thread_start_odd, &thread_end_odd, l, threading, l->num_lattice_site_var ); vector_PRECISION tmp = op->buffer[0]; // odd to even PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( tmp, p->b, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( tmp, p->b, op, l, thread_start_odd, thread_end_odd ); PROF_PRECISION_STOP( _SC, 0, threading ); SYNC_CORES(threading) -// g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); -// vector_PRECISION_scale( tmp, tmp, -1, start_odd, end_odd, l ); - minus_g5_PRECISION( tmp, tmp, start_odd, end_odd, l ); + minus_g5_PRECISION( tmp, tmp, thread_start_odd, thread_end_odd, l ); 
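/* Editor's note -- illustrative sketch only, not part of the library or of this
 * patch: the odd-even solvers here (solve_oddeven_PRECISION and the g5D variant)
 * all rely on the same block elimination. Writing the operator in even/odd
 * ordering as D = [ D_ee D_eo ; D_oe D_oo ], the system D x = b is reduced to
 * the even sites via the Schur complement S = D_ee - D_eo D_oo^{-1} D_oe, and
 * the odd part is reconstructed afterwards. The scalar toy code below (all
 * names invented for the example) shows that identity; in the library the
 * scalars become the per-site blocks handled by diag_oo_inv_PRECISION,
 * hopping_term_PRECISION and the even-site FGMRES solve. */
#include <stdio.h>

static void oddeven_reduction_sketch( double Dee, double Deo, double Doe, double Doo,
                                      double be, double bo, double *xe, double *xo ) {
  double S      = Dee - Deo*(1.0/Doo)*Doe;  /* Schur complement on the even sites      */
  double be_hat = be  - Deo*(1.0/Doo)*bo;   /* odd-to-even reduction of the rhs        */
  *xe = be_hat/S;                           /* even-site solve (FGMRES in the library) */
  *xo = (bo - Doe*(*xe))/Doo;               /* even-to-odd back substitution           */
}

int main( void ) {
  double xe, xo;
  oddeven_reduction_sketch( 4.0, 1.0, 2.0, 3.0, 5.0, 6.0, &xe, &xo );
  /* residuals of the full 2x2 system should vanish */
  printf( "r_e = %e, r_o = %e\n", 4.0*xe + 1.0*xo - 5.0, 2.0*xe + 3.0*xo - 6.0 );
  return 0;
}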
SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); - vector_PRECISION_define( p->x, 0, start_even, end_even, l ); + vector_PRECISION_define_zero( p->x, start_even, end_even, l, threading ); hopping_term_PRECISION( p->x, tmp, op, _EVEN_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 0, threading ); SYNC_CORES(threading) - g5_PRECISION( p->x, p->x, start_even, end_even, l ); - vector_PRECISION_plus( p->b, p->b, p->x, start_even, end_even, l ); + g5_PRECISION( p->x, p->x, thread_start_even, thread_end_even, l ); + vector_PRECISION_plus( p->b, p->b, p->x, thread_start_even, thread_end_even, l ); SYNC_CORES(threading) ASSERT( g.method == 6 ); fgmres_PRECISION( p, l, threading ); - diag_oo_inv_PRECISION( p->x, p->b, op, l, start_odd, end_odd ); - g5_PRECISION( p->x, p->x, start_odd, end_odd, l ); + diag_oo_inv_PRECISION( p->x, p->b, op, l, thread_start_odd, thread_end_odd ); + g5_PRECISION( p->x, p->x, thread_start_odd, thread_end_odd, l ); // even to odd - SYNC_CORES(threading) - vector_PRECISION_define( tmp, 0, start_odd, end_odd, l ); + SYNC_CORES(threading); + vector_PRECISION_define_zero( tmp, start_odd, end_odd, l, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _NC, threading ); hopping_term_PRECISION( tmp, p->x, op, _ODD_SITES, l, threading ); PROF_PRECISION_STOP( _NC, 1, threading ); SYNC_CORES(threading) PROF_PRECISION_START( _SC, threading ); - diag_oo_inv_PRECISION( p->b, tmp, op, l, start_odd, end_odd ); + diag_oo_inv_PRECISION( p->b, tmp, op, l, thread_start_odd, thread_end_odd ); PROF_PRECISION_STOP( _SC, 1, threading ); SYNC_CORES(threading) - vector_PRECISION_minus( p->x, p->x, p->b, start_odd, end_odd, l ); + vector_PRECISION_minus( p->x, p->x, p->b, thread_start_odd, thread_end_odd, l ); SYNC_CORES(threading) } // ----- block odd even ----------------------------------------------------------- -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION -void schwarz_PRECISION_oddeven_setup( operator_PRECISION_struct *op, level_struct *l ) { +void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ) { - config_PRECISION clover_pt = op->clover, oe_clover_pt = op->oe_clover; - complex_double buffer[42]; int mu, i, d0, c0, b0, a0, d1, c1, b1, a1, t, z, y, x, agg_split[4], block_split[4], block_size[4]; + operator_PRECISION_struct *op = &(s->op); + int n1 = s->num_block_even_sites; #ifdef HAVE_TM config_PRECISION tm_term_pt = op->tm_term; #endif + + for ( mu=0; mu<4; mu++ ) { + agg_split[mu] = l->local_lattice[mu]/l->coarsening[mu]; + block_split[mu] = l->coarsening[mu]/l->block_lattice[mu]; + block_size[mu] = l->block_lattice[mu]; + } if ( g.csw ) { - for ( mu=0; mu<4; mu++ ) { - agg_split[mu] = l->local_lattice[mu]/l->coarsening[mu]; - block_split[mu] = l->coarsening[mu]/l->block_lattice[mu]; - block_size[mu] = l->block_lattice[mu]; - } - +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + config_PRECISION clover_pt = op->clover, clover_oo_inv_pt = op->clover_oo_inv; + complex_double buffer[42]; + int cs = 42; +#else + PRECISION *clover_pt = op->clover_vectorized, *clover_oo_inv_pt = op->clover_oo_inv_vectorized; + int cs = 144; +#endif for ( d0=0; d0oe_clover, op->clover, 0, l->inner_vector_size, l ); + } + +#ifdef HAVE_TM1p1 +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + complex_double buffer[66]; + config_PRECISION clover_oo_inv_pt = op->clover_doublet_oo_inv, clover_pt = op->clover; + int cs = g.csw ? 42:12; +#else + PRECISION *clover_pt = g.csw ? 
op->clover_doublet_vectorized:(PRECISION*)op->clover, *clover_oo_inv_pt = op->clover_doublet_oo_inv_vectorized; + int cs = g.csw ? 288:24; +#endif + config_PRECISION eps_term_pt = op->epsbar_term; #ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - vector_PRECISION_plus( op->oe_clover, op->oe_clover, op->tm_term, 0, l->inner_vector_size, l ); + tm_term_pt = op->tm_term; #endif - } -} + + for ( d0=0; d0num_block_even_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.clover+start:s->op.clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; + int n1 = s->num_block_even_sites, nv = l->num_lattice_site_var; + clover_PRECISION( eta, phi, &(s->op), start, start+nv*n1, l, threading ); - // diagonal blocks applied to the even sites of a block - clover_PRECISION( leta, lphi, clover, 12*n1, l, no_threading ); -#ifdef HAVE_TM - config_PRECISION tm_term = s->op.tm_term+start; - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) - add_diagonal_PRECISION( leta, lphi, tm_term, 12*n1 ); -#endif - END_UNTHREADED_FUNCTION(threading) } -#endif - -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + +// diagonal blocks applied to the odd sites of a block void block_diag_oo_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; -#ifndef HAVE_TM - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; + +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + //we don't have the LU decomposition here, for debugging only + int n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites, nv = l->num_lattice_site_var; + clover_PRECISION( eta, phi, &(s->op), start+nv*n1, start+nv*(n1+n2), l, threading ); + #else - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*72; + + int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + int block_num = start/24/(n1+n2); + // config_PRECISION clover = s->op.clover_doublet_oo_inv+n1*288+(start/24)*288; + config_PRECISION clover = s->op.clover_doublet_oo_inv+(start/24-block_num*n1)*288; + vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*42; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*72; + for ( i=0; iop.clover+n1*12+start; #ifndef HAVE_TM - LLH_multiply_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=42; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]*(clover[i]); #else - LU_multiply_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=72; + config_PRECISION tm_term = s->op.tm_term+n1*12+start; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]*(clover[i]+tm_term[i]); #endif } - } else { - leta += n1*12; lphi += n1*12; clover += n1*12; - for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*clover[i]; +#ifdef HAVE_TM1p1 } +#endif +#endif END_UNTHREADED_FUNCTION(threading) } -#endif -#ifndef OPTIMIZED_SELF_COUPLING_PRECISION -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { +// inverted diagonal blocks applied to the odd sites of a block +void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, + level_struct *l, 
struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; -#ifndef HAVE_TM - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + + vector_PRECISION lphi = phi+n1*24+start, leta = eta+n1*24+start; + int block_num = start/24/(n1+n2); +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + config_PRECISION clover = s->op.clover_doublet_oo_inv + (start/24-block_num*n1)*288; + for ( i=0; iop.oe_clover+start:s->op.oe_clover+(start/12)*72; + PRECISION *clover_vectorized = s->op.clover_doublet_oo_inv_vectorized + (start/24-block_num*n1)*2*288; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*42; + for ( i=0; iop.clover_oo_inv+(start/12-block_num*n1)*72; + for ( i=0; iop.clover_oo_inv_vectorized + (start/12-block_num*n1)*144; + for ( i=0; iop.clover+n1*12+start; #ifndef HAVE_TM - LLH_perform_fwd_bwd_subs_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=42; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]/(clover[i]); #else - LU_perform_fwd_bwd_subs_PRECISION( leta, lphi, clover ); - leta+=12; lphi+=12; clover+=72; + config_PRECISION tm_term = s->op.tm_term+n1*12+start; + for ( i=0; i<12*n2; i++ ) + leta[i] = lphi[i]/(clover[i]+tm_term[i]); #endif } - } else { - leta += n1*12; lphi += n1*12; clover += n1*12; - for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]/clover[i]; +#ifdef HAVE_TM1p1 } - +#endif + END_UNTHREADED_FUNCTION(threading) } -#endif -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void block_hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { START_UNTHREADED_FUNCTION(threading) int a1, a2, n1, n2, *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *ind, *neighbor = s->op.neighbor_table; - config_PRECISION D = s->op.D + (start/12)*36; - int i, j, k; - complex_PRECISION buf1[13] = {0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2 = buf1+6; + **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; + PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; + + for ( int mu=0; mu<4; mu++ ) { + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[mu]; + a2 = n1; n2 = a2 + length_odd[mu]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[mu]; n1 = a1 + length_odd[mu]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[mu]+length_odd[mu]; + a2 = 0; n2 = n1; + } + block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); + block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); + } + +#else + config_PRECISION D = s->op.D + (start/nv)*36; + int i, j, k, *ind; config_PRECISION D_pt; vector_PRECISION lphi = phi+start, leta = eta+start; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + complex_PRECISION buf1[25] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2 = buf1+12; + // T direction + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[T]; + a2 = n1; n2 = a2 + length_odd[T]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[T]; n1 = a1 + length_odd[T]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[T]+length_odd[T]; + a2 = 0; n2 = n1; + } + // 
"amount" of a block, +T coupling + ind = index[T]; + for ( i=a1; idir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *ind, *neighbor = s->op.neighbor_table; - complex_PRECISION buf1[12], *buf2 = buf1+6; - vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION D_pt, D = s->op.D + (start/12)*36; - - // T direction - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[T]; - a2 = n1; n2 = a2 + length_odd[T]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[T]; n1 = a1 + length_odd[T]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[T]+length_odd[T]; - a2 = 0; n2 = n1; - } - // "amount" of a block, +T coupling - ind = index[T]; - for ( i=a1; idir_length_even, *length_odd = s->dir_length_odd, + **index = s->oe_index, *neighbor = s->op.neighbor_table, nv = l->num_lattice_site_var; + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized + (start/nv)*96; + PRECISION *Dminus = s->op.D_transformed_vectorized + (start/nv)*96; + + for ( int mu=0; mu<4; mu++ ) { + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[mu]; + a2 = n1; n2 = a2 + length_odd[mu]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[mu]; n1 = a1 + length_odd[mu]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[mu]+length_odd[mu]; + a2 = 0; n2 = n1; + } + block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), mu, a1, n1, index[mu], neighbor ); + block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), mu, a2, n2, index[mu], neighbor ); } - - // X direction - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[X]; - a2 = n1; n2 = a2 + length_odd[X]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[X]; n1 = a1 + length_odd[X]; - a2 = 0; n2 = a1; + +#else + int i, j, k, *ind; + vector_PRECISION lphi = phi+start, leta = eta+start; + config_PRECISION D_pt, D = s->op.D + (start/nv)*36; + +#ifdef HAVE_TM1p1 + if ( g.n_flavours == 2 ) { + complex_PRECISION buf1[24], *buf2 = buf1+12; + // T direction + if ( amount == _EVEN_SITES ) { + a1 = 0; n1 = length_even[T]; + a2 = n1; n2 = a2 + length_odd[T]; + } else if ( amount == _ODD_SITES ) { + a1 = length_even[T]; n1 = a1 + length_odd[T]; + a2 = 0; n2 = a1; + } else { + a1 = 0; n1 = length_even[T]+length_odd[T]; + a2 = 0; n2 = n1; + } + // "amount" of a block, +T coupling + ind = index[T]; + for ( i=a1; ioe_buf; block_diag_ee_PRECISION( out, in, start, s, l, threading ); - START_LOCKED_MASTER(threading) - vector_PRECISION_define( tmp[0], 0, start + 12*s->num_block_even_sites, start + s->block_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_zero( tmp[0], start + l->num_lattice_site_var*s->num_block_even_sites, start + s->block_vector_size, l, threading ); block_hopping_term_PRECISION( tmp[0], in, start, _ODD_SITES, s, l, threading ); block_diag_oo_inv_PRECISION( tmp[1], tmp[0], start, s, l, threading ); block_n_hopping_term_PRECISION( out, tmp[1], start, _EVEN_SITES, s, l, threading ); @@ -1578,12 +2503,11 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve // odd to even vector_PRECISION_copy( tmp[3], r, start, end, l ); - block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); block_n_hopping_term_PRECISION( tmp[3], tmp[2], start, _EVEN_SITES, s, l, no_threading ); local_minres_PRECISION( NULL, tmp[3], tmp[2], start, s, l, no_threading ); - + // even to odd block_n_hopping_term_PRECISION( tmp[3], tmp[2], 
start, _ODD_SITES, s, l, no_threading ); block_diag_oo_inv_PRECISION( tmp[2], tmp[3], start, s, l, no_threading ); @@ -1592,45 +2516,50 @@ void block_solve_oddeven_PRECISION( vector_PRECISION phi, vector_PRECISION r, ve vector_PRECISION_copy( latest_iter, tmp[2], start, end, l ); vector_PRECISION_plus( phi, phi, tmp[2], start, end, l ); // update r - vector_PRECISION_copy( r, tmp[3], start, start+12*s->num_block_even_sites, l ); - vector_PRECISION_define( r, 0, start+12*s->num_block_even_sites, end, l ); + vector_PRECISION_copy( r, tmp[3], start, start+l->num_lattice_site_var*s->num_block_even_sites, l ); + vector_PRECISION_define_zero( r, start+l->num_lattice_site_var*s->num_block_even_sites, end, l, no_threading ); END_UNTHREADED_FUNCTION(threading) } void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { -#if !defined( OPTIMIZED_NEIGHBOR_COUPLING_PRECISION ) && !defined( OPTIMIZED_SELF_COUPLING_PRECISION ) START_UNTHREADED_FUNCTION(threading) schwarz_PRECISION_struct *s = &(l->s_PRECISION); vector_PRECISION b1 = NULL, b2 = NULL, b3 = NULL, b4 = NULL, b5 = NULL; - MALLOC( b1, complex_PRECISION, s->block_vector_size ); - MALLOC( b2, complex_PRECISION, s->block_vector_size ); - MALLOC( b3, complex_PRECISION, s->block_vector_size ); - MALLOC( b4, complex_PRECISION, s->block_vector_size ); - MALLOC( b5, complex_PRECISION, s->block_vector_size ); - - vector_PRECISION_define_random( b1, 0, s->block_vector_size, l ); - - block_diag_ee_PRECISION( b2, b1, 0, s, l, no_threading ); - block_diag_oo_PRECISION( b2, b1, 0, s, l, no_threading ); - block_hopping_term_PRECISION( b2, b1, 0, _FULL_SYSTEM, s, l, no_threading ); + PRECISION diff; + + int vs = s->block_vector_size * s->num_blocks; + + MALLOC( b1, complex_PRECISION, vs ); + MALLOC( b2, complex_PRECISION, vs ); + MALLOC( b3, complex_PRECISION, vs ); + MALLOC( b4, complex_PRECISION, vs ); + MALLOC( b5, complex_PRECISION, vs ); - block_d_plus_clover_PRECISION( b3, b1, 0, s, l, no_threading ); + vector_PRECISION_define_random( b1, 0, vs, l, no_threading ); + + for (int i = 0; i< s->num_blocks; i++ ) { + block_diag_ee_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_diag_oo_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + block_hopping_term_PRECISION( b2, b1, s->block[i].start*l->num_lattice_site_var, _FULL_SYSTEM, s, l, no_threading ); + + block_d_plus_clover_PRECISION( b3, b1, s->block[i].start*l->num_lattice_site_var, s, l, no_threading ); + } - vector_PRECISION_minus( b3, b3, b2, 0, s->block_vector_size, l ); - double diff = process_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / process_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + vector_PRECISION_minus( b3, b3, b2, 0, vs, l ); + diff = global_norm_PRECISION( b3, 0, vs, l, no_threading ) / global_norm_PRECISION( b2, 0, vs, l, no_threading ); - printf0("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); + test0_PRECISION("depth: %d, correctness of block odd even layout: %le\n", l->depth, diff ); vector_PRECISION_copy( b4, b1, 0, s->block_vector_size, l ); - vector_PRECISION_define( b3, 0, 12*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_define_zero( b3, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l, no_threading ); block_hopping_term_PRECISION( b3, b4, 0, _ODD_SITES, s, l, no_threading ); block_diag_oo_inv_PRECISION( b5, b3, 0, s, l, no_threading ); - vector_PRECISION_plus( b4, 
b4, b5, 12*s->num_block_even_sites, s->block_vector_size, l ); + vector_PRECISION_plus( b4, b4, b5, l->num_lattice_site_var*s->num_block_even_sites, s->block_vector_size, l ); apply_block_schur_complement_PRECISION( b3, b4, 0, s, l, no_threading ); block_diag_oo_PRECISION( b3, b4, 0, s, l, no_threading ); @@ -1639,21 +2568,19 @@ void block_oddeven_PRECISION_test( level_struct *l, struct Thread *threading ) { block_hopping_term_PRECISION( b3, b5, 0, _EVEN_SITES, s, l, no_threading ); vector_PRECISION_minus( b3, b2, b3, 0, s->block_vector_size, l ); - diff = process_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / process_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); + diff = global_norm_PRECISION( b3, 0, s->block_vector_size, l, no_threading ) / global_norm_PRECISION( b2, 0, s->block_vector_size, l, no_threading ); - printf0("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); + test0_PRECISION("depth: %d, correctness of block odd even schur complement: %le\n", l->depth, diff ); - FREE( b1, complex_PRECISION, s->block_vector_size ); - FREE( b2, complex_PRECISION, s->block_vector_size ); - FREE( b3, complex_PRECISION, s->block_vector_size ); - FREE( b4, complex_PRECISION, s->block_vector_size ); - FREE( b5, complex_PRECISION, s->block_vector_size ); + FREE( b1, complex_PRECISION, vs ); + FREE( b2, complex_PRECISION, vs ); + FREE( b3, complex_PRECISION, vs ); + FREE( b4, complex_PRECISION, vs ); + FREE( b5, complex_PRECISION, vs ); END_UNTHREADED_FUNCTION(threading) -#endif } - void oddeven_PRECISION_test( level_struct *l ) { /********************************************************************************* @@ -1666,7 +2593,7 @@ void oddeven_PRECISION_test( level_struct *l ) { vector_double d1=NULL, d2=NULL, d3=NULL; vector_PRECISION f1=NULL, f2=NULL, f3=NULL, f4=NULL, f5=NULL; - double norm; + double diff; MALLOC( d1, complex_double, l->inner_vector_size ); MALLOC( d2, complex_double, l->inner_vector_size ); @@ -1677,7 +2604,7 @@ void oddeven_PRECISION_test( level_struct *l ) { MALLOC( f4, complex_PRECISION, l->inner_vector_size ); MALLOC( f5, complex_PRECISION, l->inner_vector_size ); - vector_double_define_random( d1, 0, l->inner_vector_size, l ); + vector_double_define_random( d1, 0, l->inner_vector_size, l, no_threading ); serial_to_oddeven_PRECISION( f1, d1, l, no_threading ); diag_ee_PRECISION( f2, f1, &(l->oe_op_PRECISION), l, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var ); @@ -1689,10 +2616,9 @@ void oddeven_PRECISION_test( level_struct *l ) { oddeven_to_serial_PRECISION( d1, f2, l, no_threading ); vector_double_minus( d3, d1, d2, 0, l->num_inner_lattice_sites, l ); - norm = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); + diff = global_norm_double( d3, 0, l->num_inner_lattice_sites, l, no_threading )/global_norm_double( d1, 0, l->num_inner_lattice_sites, l, no_threading ); - printf0("depth: %d, correctness of odd even layout: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; + test0_PRECISION("depth: %d, correctness of odd even layout: %le\n", l->depth, diff ); // -------------- @@ -1701,16 +2627,15 @@ void oddeven_PRECISION_test( level_struct *l ) { diag_oo_inv_PRECISION( f4, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); vector_PRECISION_minus( f4, f4, f1, 0, l->inner_vector_size, l ); - norm = (PRECISION) 
(global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( f4, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )); - printf0("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; + test0_PRECISION("depth: %d, correctness of odd even diagonal term: %le\n", l->depth, diff ); // transformation part vector_PRECISION_copy( f4, f1, 0, l->inner_vector_size, l ); // even to odd // set odd part of f3 to 0. - vector_PRECISION_define( f3, 0, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l ); + vector_PRECISION_define_zero( f3, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size, l, no_threading ); hopping_term_PRECISION( f3, f4, &(l->oe_op_PRECISION), _ODD_SITES, l, no_threading ); diag_oo_inv_PRECISION( f5, f3, &(l->oe_op_PRECISION), l, l->oe_op_PRECISION.num_even_sites*l->num_lattice_site_var, l->inner_vector_size ); @@ -1724,10 +2649,9 @@ void oddeven_PRECISION_test( level_struct *l ) { hopping_term_PRECISION( f3, f5, &(l->oe_op_PRECISION), _EVEN_SITES, l, no_threading ); vector_PRECISION_minus( f1, f2, f3, 0, l->inner_vector_size, l ); - norm = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); + diff = (PRECISION) (global_norm_PRECISION( f1, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( f2, 0, l->inner_vector_size, l, no_threading )); - printf0("depth: %d, correctness of odd even schur complement: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; + test0_PRECISION("depth: %d, correctness of odd even schur complement: %le\n", l->depth, diff ); FREE( d1, complex_double, l->inner_vector_size ); FREE( d2, complex_double, l->inner_vector_size ); diff --git a/src/oddeven_generic.h b/src/oddeven_generic.h index 66cce02..4fac101 100644 --- a/src/oddeven_generic.h +++ b/src/oddeven_generic.h @@ -52,7 +52,7 @@ struct Thread; void g5D_apply_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); void g5D_solve_oddeven_PRECISION( gmres_PRECISION_struct *p, operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); - void schwarz_PRECISION_oddeven_setup( operator_PRECISION_struct *op, level_struct *l ); + void schwarz_PRECISION_oddeven_setup( schwarz_PRECISION_struct *s, level_struct *l ); void apply_block_schur_complement_PRECISION( vector_PRECISION out, vector_PRECISION in, int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ); diff --git a/src/operator_generic.c b/src/operator_generic.c index 41a3895..2c783ea 100644 --- a/src/operator_generic.c +++ b/src/operator_generic.c @@ -29,13 +29,29 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { op->backward_neighbor_table = NULL; op->translation_table = NULL; op->D = NULL; + op->D_vectorized = NULL; + op->D_transformed_vectorized = NULL; op->clover = NULL; - op->oe_clover = NULL; - op->oe_clover_vectorized = NULL; + op->clover_oo_inv = NULL; + op->clover_vectorized = NULL; + op->clover_oo_inv_vectorized = NULL; + op->m0 = 0; #ifdef HAVE_TM + op->mu = 0; + op->mu_even_shift = 0; + op->mu_odd_shift = 0; op->odd_proj = NULL; op->tm_term = NULL; #endif +#ifdef HAVE_TM1p1 + 
op->epsbar = 0; + op->epsbar_ig5_even_shift = 0; + op->epsbar_ig5_odd_shift = 0; + op->epsbar_term = NULL; + op->clover_doublet_oo_inv = NULL; + op->clover_doublet_vectorized = NULL; + op->clover_doublet_oo_inv_vectorized = NULL; +#endif for ( int mu=0; mu<4; mu++ ) op->config_boundary_table[mu] = NULL; @@ -47,11 +63,6 @@ void operator_PRECISION_init( operator_PRECISION_struct *op ) { } op->c.comm = 1; op->buffer = NULL; -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - op->D_vectorized = NULL; - op->D_transformed_vectorized = NULL; - op->clover_vectorized = NULL; -#endif } @@ -61,6 +72,9 @@ void operator_PRECISION_alloc_projection_buffers( operator_PRECISION_struct *op, // g.method >= 4: then oddeven_setup_float() is called in init.c, method_setup(). if ( l->depth == 0 ) { int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; +#ifdef HAVE_TM1p1 + its *= 2; +#endif MALLOC( op->prnT, complex_PRECISION, its*8 ); op->prnZ = op->prnT + its; op->prnY = op->prnZ + its; op->prnX = op->prnY + its; op->prpT = op->prnX + its; op->prpZ = op->prpT + its; op->prpY = op->prpZ + its; op->prpX = op->prpY + its; @@ -70,6 +84,9 @@ void operator_PRECISION_free_projection_buffers( operator_PRECISION_struct *op, if ( l->depth == 0 ) { int its = (l->num_lattice_site_var/2)*l->num_lattice_sites; +#ifdef HAVE_TM1p1 + its *= 2; +#endif FREE( op->prnT, complex_PRECISION, its*8 ); } } @@ -102,35 +119,56 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, le its *= (l->local_lattice[mu]+its_boundary); } - nls = (type==_ORDINARY)?l->num_inner_lattice_sites:2*l->num_lattice_sites-l->num_inner_lattice_sites; + nls = (type==_SCHWARZ) ? (2*l->num_lattice_sites-l->num_inner_lattice_sites):l->num_inner_lattice_sites; + MALLOC( op->D, complex_PRECISION, coupling_site_size*nls ); MALLOC( op->clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + + int block_site_size = ( l->depth == 0 ) ? 
12 : (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); + MALLOC( op->odd_proj, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #ifdef HAVE_TM - int tm_site_size; - if ( l->depth == 0 ) - tm_site_size = 12; - else - tm_site_size = (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); - - MALLOC( op->tm_term, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - MALLOC( op->odd_proj, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) //we use LU here - MALLOC( op->oe_clover, complex_PRECISION, 72*l->num_inner_lattice_sites ); -#else - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) - MALLOC( op->oe_clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + MALLOC( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif +#ifdef HAVE_TM1p1 + MALLOC( op->epsbar_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); +#endif + MALLOC( op->index_table, int, its ); - MALLOC( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - MALLOC( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); -#ifdef SSE - if ( l->depth == 0 ) { - MALLOC( op->oe_clover_vectorized, PRECISION, 144*l->num_inner_lattice_sites ); + if ( type ==_ODDEVEN ) { + MALLOC( op->neighbor_table, int, 5*its ); + MALLOC( op->backward_neighbor_table, int, 5*its ); + } else { + MALLOC( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); + MALLOC( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); } + MALLOC( op->translation_table, int, l->num_inner_lattice_sites ); + + if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + if( g.csw ) { +#ifdef HAVE_TM //we use LU here + MALLOC( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); +#else + MALLOC( op->clover_oo_inv, complex_PRECISION, clover_site_size*(l->num_inner_lattice_sites/2+1) ); #endif - - operator_PRECISION_alloc_projection_buffers( op, l ); + } +#ifdef HAVE_TM1p1 + MALLOC( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); +#endif + +#else + if( g.csw ) + MALLOC_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); +#ifdef HAVE_TM1p1 + MALLOC_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1), 4*SIMD_LENGTH_PRECISION ); +#endif + +#endif + } + + if ( type != _ODDEVEN ) + operator_PRECISION_alloc_projection_buffers( op, l ); ghost_alloc_PRECISION( 0, &(op->c), l ); @@ -176,35 +214,54 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev its *= (l->local_lattice[mu]+its_boundary); } - int nls = (type==_ORDINARY)?l->num_inner_lattice_sites:2*l->num_lattice_sites-l->num_inner_lattice_sites; + int nls = (type==_SCHWARZ) ? (2*l->num_lattice_sites-l->num_inner_lattice_sites) : l->num_inner_lattice_sites; FREE( op->D, complex_PRECISION, coupling_site_size*nls ); FREE( op->clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + + int block_site_size = ( l->depth == 0 ) ? 
12 : (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); + FREE( op->odd_proj, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #ifdef HAVE_TM - int tm_site_size; - if ( l->depth == 0 ) - tm_site_size = 12; - else - tm_site_size = (l->num_lattice_site_var/2*(l->num_lattice_site_var/2+1)); - - FREE( op->tm_term, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - FREE( op->odd_proj, complex_PRECISION, tm_site_size*l->num_inner_lattice_sites ); - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) //we use LU here - FREE( op->oe_clover, complex_PRECISION, 72*l->num_inner_lattice_sites ); + FREE( op->tm_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); +#endif + if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) { +#ifndef OPTIMIZED_SELF_COUPLING_PRECISION + + if( g.csw ) { +#ifdef HAVE_TM //we use LU here + FREE( op->clover_oo_inv, complex_PRECISION, 72*(l->num_inner_lattice_sites/2+1) ); #else - if ( type == _SCHWARZ && l->depth == 0 && g.odd_even ) - FREE( op->oe_clover, complex_PRECISION, clover_site_size*l->num_inner_lattice_sites ); + FREE( op->clover_oo_inv, complex_PRECISION, clover_site_size*(l->num_inner_lattice_sites/2+1) ); +#endif + } +#ifdef HAVE_TM1p1 + FREE( op->clover_doublet_oo_inv, complex_PRECISION, 12*12*2*(l->num_inner_lattice_sites/2+1) ); +#endif + +#else + if( g.csw ) + FREE_HUGEPAGES( op->clover_oo_inv_vectorized, PRECISION, 144*(l->num_inner_lattice_sites/2+1) ); +#ifdef HAVE_TM1p1 + FREE_HUGEPAGES( op->clover_doublet_oo_inv_vectorized, PRECISION, 2*2*144*(l->num_inner_lattice_sites/2+1) ); +#endif + +#endif + } + +#ifdef HAVE_TM1p1 + FREE( op->epsbar_term, complex_PRECISION, block_site_size*l->num_inner_lattice_sites ); #endif FREE( op->index_table, int, its ); - FREE( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - FREE( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); - FREE( op->translation_table, int, l->num_inner_lattice_sites ); -#ifdef SSE - if ( l->depth == 0 ) { - FREE( op->oe_clover_vectorized, PRECISION, 144*l->num_inner_lattice_sites ); + if ( type ==_ODDEVEN ) { + FREE( op->neighbor_table, int, 5*its ); + FREE( op->backward_neighbor_table, int, 5*its ); + } else { + FREE( op->neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); + FREE( op->backward_neighbor_table, int, (l->depth==0?4:5)*l->num_inner_lattice_sites ); } -#endif + FREE( op->translation_table, int, l->num_inner_lattice_sites ); - operator_PRECISION_free_projection_buffers( op, l ); + if ( type != _ODDEVEN ) + operator_PRECISION_free_projection_buffers( op, l ); ghost_free_PRECISION( &(op->c), l ); @@ -229,8 +286,8 @@ void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, lev void operator_PRECISION_define( operator_PRECISION_struct *op, level_struct *l ) { - int i, mu, t, z, y, x, *it = op->index_table, - ls[4], le[4], l_st[4], l_en[4], *dt = op->table_dim; + int i, mu, t, z, y, x, *it = op->index_table, + ls[4], le[4], l_st[4], l_en[4], *dt = op->table_dim; for ( mu=0; mu<4; mu++ ) { dt[mu] = l->local_lattice[mu]+1; @@ -273,6 +330,36 @@ void operator_PRECISION_define( operator_PRECISION_struct *op, level_struct *l ) define_nt_bt_tt( op->neighbor_table, op->backward_neighbor_table, op->c.boundary_table, op->translation_table, it, dt, l ); } +void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l ) { + + operator_PRECISION_set_self_couplings( op, l ); + operator_PRECISION_set_neighbor_couplings( op, l 
); + +} + +void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ) { + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + int i, n = 2*l->num_lattice_sites - l->num_inner_lattice_sites; + + for ( i=0; i<n; i++ ) { + PRECISION *D_vectorized = op->D_vectorized + 96*i; + PRECISION *D_transformed_vectorized = op->D_transformed_vectorized + 96*i; + complex_PRECISION *D_pt = op->D + 36*i; + for ( int mu=0; mu<4; mu++ ) + set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_pt+9*mu ); + } +#endif + +} + +void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ) { + +#ifdef OPTIMIZED_SELF_COUPLING_PRECISION + if ( g.csw != 0 ) + set_clover_vectorized_PRECISION( op, l, no_threading ); +#endif +} void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { @@ -294,11 +381,11 @@ void operator_PRECISION_test_routine( operator_PRECISION_struc PUBLIC_MALLOC( vd1, complex_double, 4*ivs ); PUBLIC_MALLOC( vp1, complex_PRECISION, 2*ivs ); - vd2 = vd1+ivs; vd3 = vd2+ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; + vd2 = vd1 + ivs; vd3 = vd2 + ivs; vd4 = vd3 + ivs; vp2 = vp1 + ivs; START_LOCKED_MASTER(threading) - vector_double_define_random( vd1, 0, l->inner_vector_size, l ); + vector_double_define_random( vd1, 0, l->inner_vector_size, l, no_threading ); apply_operator_double( vd2, vd1, &(g.p), l, no_threading ); trans_PRECISION( vp1, vd1, op->translation_table, l, no_threading ); @@ -306,9 +393,10 @@ void operator_PRECISION_test_routine( operator_PRECISION_struc trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/global_norm_double( vd3, 0, ivs, l, no_threading ); - printf0("depth: 0, correctness of schwarz PRECISION Dirac operator: %le\n", diff ); - if(diff > g.test) g.test = diff; + diff = global_norm_double( vd4, 0, ivs, l, no_threading )/ + global_norm_double( vd3, 0, ivs, l, no_threading ); + + test0_PRECISION("depth: %d, correctness of schwarz PRECISION Dirac operator: %le\n", l->depth, diff ); END_LOCKED_MASTER(threading) if(threading->n_core > 1) { @@ -319,15 +407,20 @@ void operator_PRECISION_test_routine( operator_PRECISION_struc START_LOCKED_MASTER(threading) trans_back_PRECISION( vd3, vp2, op->translation_table, l, no_threading ); - vector_double_minus( vd4, vd3, vd2, 0, l->inner_vector_size, l ); - diff = global_norm_double( vd4, 0, ivs, l, no_threading )/global_norm_double( vd3, 0, ivs, l, no_threading ); + diff = global_norm_double( vd4, 0, ivs, l, no_threading ) / + global_norm_double( vd3, 0, ivs, l, no_threading ); - printf0("depth: 0, correctness of schwarz PRECISION Dirac operator with threading: %le\n", diff ); + if ( diff > EPS_PRECISION ) + printf0("\x1b[31m"); + printf0("depth: %d, correctness of schwarz PRECISION Dirac operator with threading: %le\n", l->depth, diff ); + if ( diff > EPS_PRECISION ) + printf0("\x1b[0m"); if(diff > g.test) g.test = diff; + END_LOCKED_MASTER(threading) } - + PUBLIC_FREE( vd1, complex_double, 4*ivs ); PUBLIC_FREE( vp1, complex_PRECISION, 2*ivs ); diff --git a/src/operator_generic.h b/src/operator_generic.h index 966e56f..753cf04 100644 --- a/src/operator_generic.h +++ b/src/operator_generic.h @@ -28,6 +28,10 @@ void operator_PRECISION_alloc( operator_PRECISION_struct *op, const int type, level_struct *l ); void operator_PRECISION_define(
operator_PRECISION_struct *op, level_struct *l ); void operator_PRECISION_free( operator_PRECISION_struct *op, const int type, level_struct *l ); + + void operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l ); + void operator_PRECISION_set_self_couplings( operator_PRECISION_struct *op, level_struct *l ); + void operator_PRECISION_set_neighbor_couplings( operator_PRECISION_struct *op, level_struct *l ); void operator_PRECISION_test_routine( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ); diff --git a/src/schwarz_generic.c b/src/schwarz_generic.c index a9493ee..28be21b 100644 --- a/src/schwarz_generic.c +++ b/src/schwarz_generic.c @@ -27,6 +27,8 @@ void smoother_PRECISION_def( level_struct *l ) { schwarz_PRECISION_def( &(l->s_PRECISION), &(g.op_double), l ); l->p_PRECISION.op = &(l->s_PRECISION.op); + l->p_PRECISION.v_start = 0; + l->p_PRECISION.v_end = l->inner_vector_size; if ( g.method == 6 ) { l->p_PRECISION.eval_operator = (l->depth > 0)?g5D_apply_coarse_operator_PRECISION:g5D_plus_clover_PRECISION; } else { @@ -49,15 +51,12 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { s->index[T] = NULL; s->oe_index[T] = NULL; s->block = NULL; - s->bbuf1 = NULL; s->buf1 = NULL; s->buf2 = NULL; s->buf3 = NULL; s->buf4 = NULL; s->buf5 = NULL; l->sbuf_PRECISION[0] = NULL; - s->oe_bbuf[0] = NULL; - s->oe_bbuf[1] = NULL; s->oe_buf[0] = NULL; s->oe_buf[1] = NULL; s->oe_buf[2] = NULL; @@ -73,7 +72,7 @@ void schwarz_PRECISION_init( schwarz_PRECISION_struct *s, level_struct *l ) { void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { - int i, j, n, mu, nu, *bl = l->block_lattice, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int i, j, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 ) { fgmres_PRECISION_struct_alloc( l->block_iter, 1, (l->depth==0)?l->inner_vector_size:l->vector_size, @@ -140,22 +139,19 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { } MALLOC( s->block, block_struct, s->num_blocks ); - MALLOC( s->bbuf1, complex_PRECISION, (l->depth==0&&g.odd_even?9:3)*s->block_vector_size ); + + int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + +#ifdef HAVE_TM1p1 + svs *= 2; + vs *= 2; +#endif + if ( l->depth == 0 ) { - MALLOC( s->oe_buf[0], complex_PRECISION, 4*l->inner_vector_size ); - s->oe_buf[1] = s->oe_buf[0] + l->inner_vector_size; - s->oe_buf[2] = s->oe_buf[1] + l->inner_vector_size; - s->oe_buf[3] = s->oe_buf[2] + l->inner_vector_size; - } - s->bbuf2 = s->bbuf1 + s->block_vector_size; - s->bbuf3 = s->bbuf2 + s->block_vector_size; - if ( l->depth == 0 && g.odd_even ) { - s->oe_bbuf[0] = s->bbuf3 + s->block_vector_size; - s->oe_bbuf[1] = s->oe_bbuf[0] + s->block_vector_size; - s->oe_bbuf[2] = s->oe_bbuf[1] + s->block_vector_size; - s->oe_bbuf[3] = s->oe_bbuf[2] + s->block_vector_size; - s->oe_bbuf[4] = s->oe_bbuf[3] + s->block_vector_size; - s->oe_bbuf[5] = s->oe_bbuf[4] + s->block_vector_size; + MALLOC( s->oe_buf[0], complex_PRECISION, 4*vs ); + s->oe_buf[1] = s->oe_buf[0] + vs; + s->oe_buf[2] = s->oe_buf[1] + vs; + s->oe_buf[3] = s->oe_buf[2] + vs; } n = 0; @@ -176,24 +172,24 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { s->block[i].bt = NULL; MALLOC( s->block[i].bt, int, n ); } - - MALLOC( s->buf1, complex_PRECISION, vs+3*l->schwarz_vector_size ); + + MALLOC( s->buf1, complex_PRECISION, vs+3*svs ); s->buf2 = s->buf1 + vs; - s->buf3 = s->buf2 + 
l->schwarz_vector_size; - s->buf4 = s->buf3 + l->schwarz_vector_size; - + s->buf3 = s->buf2 + svs; + s->buf4 = s->buf3 + svs; + if ( g.method == 1 ) - MALLOC( s->buf5, complex_PRECISION, l->schwarz_vector_size ); + MALLOC( s->buf5, complex_PRECISION, svs ); MALLOC( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); l->sbuf_PRECISION[1] = l->sbuf_PRECISION[0] + vs; // these buffers are introduced to make local_minres_PRECISION thread-safe - MALLOC( s->local_minres_buffer[0], complex_PRECISION, l->schwarz_vector_size ); - MALLOC( s->local_minres_buffer[1], complex_PRECISION, l->schwarz_vector_size ); - MALLOC( s->local_minres_buffer[2], complex_PRECISION, l->schwarz_vector_size ); + MALLOC( s->local_minres_buffer[0], complex_PRECISION, svs ); + MALLOC( s->local_minres_buffer[1], complex_PRECISION, svs ); + MALLOC( s->local_minres_buffer[2], complex_PRECISION, svs ); -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float if ( l->depth == 0 ) { MALLOC_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); MALLOC_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size), 4*SIMD_LENGTH_PRECISION ); @@ -202,6 +198,9 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { #ifdef OPTIMIZED_SELF_COUPLING_PRECISION if ( l->depth == 0 ) { MALLOC_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); +#ifdef HAVE_TM1p1 + MALLOC_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size, 4*SIMD_LENGTH_PRECISION ); +#endif } #endif } @@ -209,7 +208,7 @@ void schwarz_PRECISION_alloc( schwarz_PRECISION_struct *s, level_struct *l ) { void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { - int i, n, mu, nu, *bl = l->block_lattice, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + int i, n, mu, nu, *bl = l->block_lattice; if ( g.method == 4 || g.method == 5 || g.method == 6 ) fgmres_PRECISION_struct_free( &(l->sp_PRECISION), l ); @@ -252,37 +251,42 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { } FREE( s->block, block_struct, s->num_blocks ); - FREE( s->bbuf1, complex_PRECISION, (l->depth==0&&g.odd_even?9:3)*s->block_vector_size ); - if ( l->depth == 0 ) { + + int svs = l->schwarz_vector_size, vs = (l->depth==0)?l->inner_vector_size:l->vector_size; + +#ifdef HAVE_TM1p1 + svs *= 2; + vs *= 2; +#endif + if ( l->depth == 0 ) { s->oe_buf[1] = NULL; s->oe_buf[2] = NULL; s->oe_buf[3] = NULL; - FREE( s->oe_buf[0], complex_PRECISION, 4*l->inner_vector_size ); + FREE( s->oe_buf[0], complex_PRECISION, 4*vs ); s->oe_buf[0] = NULL; } - s->bbuf2 = NULL; s->bbuf3 = NULL; s->oe_bbuf[0] = NULL; s->oe_bbuf[1] = NULL; - s->oe_bbuf[2] = NULL; s->oe_bbuf[3] = NULL; s->oe_bbuf[4] = NULL; s->oe_bbuf[5] = NULL; - FREE( s->buf1, complex_PRECISION, vs+3*l->schwarz_vector_size ); + + FREE( s->buf1, complex_PRECISION, vs+3*svs ); s->buf2 = NULL; s->buf3 = NULL; s->buf4 = NULL; if ( g.method == 1 ) - FREE( s->buf5, complex_PRECISION, l->schwarz_vector_size ); + FREE( s->buf5, complex_PRECISION, svs ); operator_PRECISION_free( &(s->op), _SCHWARZ, l ); FREE( l->sbuf_PRECISION[0], complex_PRECISION, 2*vs ); l->sbuf_PRECISION[1] = NULL; - FREE( s->local_minres_buffer[0], complex_PRECISION, l->schwarz_vector_size ); - FREE( s->local_minres_buffer[1], complex_PRECISION, l->schwarz_vector_size ); - FREE( s->local_minres_buffer[2], complex_PRECISION, 
l->schwarz_vector_size ); + FREE( s->local_minres_buffer[0], complex_PRECISION, svs ); + FREE( s->local_minres_buffer[1], complex_PRECISION, svs ); + FREE( s->local_minres_buffer[2], complex_PRECISION, svs ); s->local_minres_buffer[0] = NULL; s->local_minres_buffer[1] = NULL; s->local_minres_buffer[2] = NULL; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float if ( l->depth == 0 ) { FREE_HUGEPAGES( s->op.D_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); FREE_HUGEPAGES( s->op.D_transformed_vectorized, PRECISION, 2*4*(2*l->vector_size-l->inner_vector_size) ); @@ -291,6 +295,9 @@ void schwarz_PRECISION_free( schwarz_PRECISION_struct *s, level_struct *l ) { #ifdef OPTIMIZED_SELF_COUPLING_PRECISION if ( l->depth == 0 ) { FREE_HUGEPAGES( s->op.clover_vectorized, PRECISION, 2*6*l->inner_vector_size ); +#ifdef HAVE_TM1p1 + FREE_HUGEPAGES( s->op.clover_doublet_vectorized, PRECISION, 4*2*6*l->inner_vector_size ); +#endif } #endif } @@ -726,245 +733,547 @@ void schwarz_PRECISION_boundary_update( schwarz_PRECISION_struct *s, level_struc } } -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION + void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block - int i, mu, index, neighbor_index, *bbl = s->block_boundary_length; - complex_PRECISION buf1[12], *buf2=buf1+6; - config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; - - mu=T; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_T_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_T_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_T_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_T_PRECISION( buf2, eta_pt ); - } + int *bbl = s->block_boundary_length; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized; + PRECISION *Dminus = s->op.D_transformed_vectorized; - mu=Z; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Z_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Z_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_Z_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Z_PRECISION( buf2, eta_pt ); + for ( int mu=0; mu<4; mu++ ) { + boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, + mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); + boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, + mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); } +#else + int i, mu, index, neighbor_index; + config_PRECISION D_pt, D = s->op.D; + vector_PRECISION phi_pt, eta_pt; + +#ifdef 
HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION buf1[24], *buf2=buf1+12; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_T_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_T_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Z_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Z_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Y_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Y_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Y_PRECISION( buf2, eta_pt ); + } - mu=Y; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Y_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Y_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_Y_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Y_PRECISION( buf2, eta_pt ); - } + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; 
+ neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_X_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + mvm_PRECISION( buf2+6, D_pt, buf1+6 ); + mvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_X_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + mvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + mvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_X_PRECISION( buf2, eta_pt ); + } + } else { +#endif + complex_PRECISION buf1[12], *buf2=buf1+6; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_T_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_T_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Z_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Z_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Y_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Y_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Y_PRECISION( buf2, eta_pt ); + } - mu=X; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_X_PRECISION( buf1, phi_pt ); - mvm_PRECISION( buf2, D_pt, buf1 ); - mvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_X_PRECISION( buf2, eta_pt ); + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + 
phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_X_PRECISION( buf1, phi_pt ); + mvm_PRECISION( buf2, D_pt, buf1 ); + mvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_X_PRECISION( buf1, phi_pt ); + mvmh_PRECISION( buf2, D_pt, buf1 ); + mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_X_PRECISION( buf2, eta_pt ); + } +#ifdef HAVE_TM1p1 } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_X_PRECISION( buf1, phi_pt ); - mvmh_PRECISION( buf2, D_pt, buf1 ); - mvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_X_PRECISION( buf2, eta_pt ); - } -} #endif +#endif +} -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block - int i, mu, index, neighbor_index, *bbl = s->block_boundary_length; - complex_PRECISION buf1[12], *buf2=buf1+6; - config_PRECISION D_pt, D = s->op.D; - vector_PRECISION phi_pt, eta_pt; + int *bbl = s->block_boundary_length; +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float + PRECISION *Dplus = s->op.D_vectorized; + PRECISION *Dminus = s->op.D_transformed_vectorized; - mu=T; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_T_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_T_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_T_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_T_PRECISION( buf2, eta_pt ); - } - - mu=Z; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Z_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Z_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_Z_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Z_PRECISION( buf2, eta_pt ); - } - - mu=Y; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_Y_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_Y_PRECISION( buf2, eta_pt ); - } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 
12*neighbor_index; - eta_pt = eta + 12*index; - prn_Y_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_Y_PRECISION( buf2, eta_pt ); + for ( int mu=0; mu<4; mu++ ) { + boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, + mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); + boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, + mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); } +#else + int i, mu, index, neighbor_index; + config_PRECISION D_pt, D = s->op.D; + vector_PRECISION phi_pt, eta_pt; + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { + complex_PRECISION buf1[24], *buf2=buf1+12; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_T_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_T_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Z_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Z_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_Y_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_Y_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_Y_PRECISION( buf2, eta_pt ); + } - mu=X; - // plus mu direction - for ( i=bbl[2*mu]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*index + 
9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prp_X_PRECISION( buf1, phi_pt ); - nmvm_PRECISION( buf2, D_pt, buf1 ); - nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); - pbp_su3_X_PRECISION( buf2, eta_pt ); + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprp_X_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvm_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvm_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 24*neighbor_index; + eta_pt = eta + 24*index; + dprn_X_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + nmvmh_PRECISION( buf2+6, D_pt, buf1+6 ); + nmvmh_PRECISION( buf2+9, D_pt, buf1+9 ); + dpbn_su3_X_PRECISION( buf2, eta_pt ); + } + } else { +#endif + complex_PRECISION buf1[12], *buf2=buf1+6; + mu=T; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_T_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_T_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_T_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_T_PRECISION( buf2, eta_pt ); + } + + mu=Z; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Z_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Z_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Z_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Z_PRECISION( buf2, eta_pt ); + } + + mu=Y; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prp_Y_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_Y_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_Y_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_Y_PRECISION( buf2, eta_pt ); + } + + mu=X; + // plus mu direction + for ( i=bbl[2*mu]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt 
= eta + 12*index; + prp_X_PRECISION( buf1, phi_pt ); + nmvm_PRECISION( buf2, D_pt, buf1 ); + nmvm_PRECISION( buf2+3, D_pt, buf1+3 ); + pbp_su3_X_PRECISION( buf2, eta_pt ); + } + // minus mu direction + for ( i=bbl[2*mu+1]; iblock[k].bt[i]; + neighbor_index = s->block[k].bt[i+1]; + D_pt = D + 36*neighbor_index + 9*mu; + phi_pt = phi + 12*neighbor_index; + eta_pt = eta + 12*index; + prn_X_PRECISION( buf1, phi_pt ); + nmvmh_PRECISION( buf2, D_pt, buf1 ); + nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); + pbn_su3_X_PRECISION( buf2, eta_pt ); + } +#ifdef HAVE_TM1p1 } - // minus mu direction - for ( i=bbl[2*mu+1]; iblock[k].bt[i]; - neighbor_index = s->block[k].bt[i+1]; - D_pt = D + 36*neighbor_index + 9*mu; - phi_pt = phi + 12*neighbor_index; - eta_pt = eta + 12*index; - prn_X_PRECISION( buf1, phi_pt ); - nmvmh_PRECISION( buf2, D_pt, buf1 ); - nmvmh_PRECISION( buf2+3, D_pt, buf1+3 ); - pbn_su3_X_PRECISION( buf2, eta_pt ); - } -} #endif +#endif +} -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, schwarz_PRECISION_struct *s, level_struct *l ) { // k: number of current block int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; + +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; + + for ( int mu=0; mu<4; mu++ ) { + OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; + OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; + // plus mu direction + for ( int i=bbl[2*mu]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, +1, l ); + } + // minus mu direction + for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, +1, l ); + } + } +#else config_PRECISION D = s->op.D; - int link_size = SQUARE(l->num_lattice_site_var), site_size=4*link_size; + int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; for ( int mu=0; mu<4; mu++ ) { // plus mu direction @@ -974,7 +1283,7 @@ void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION vector_PRECISION phi_pt = phi + n*neighbor_index; vector_PRECISION eta_pt = eta + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_pn_hopp_PRECISION( eta_pt, phi_pt, D_pt, +1, l ); } // minus mu direction for ( int i=bbl[2*mu+1]; iblock_boundary_length, n = l->num_lattice_site_var; - int link_size = SQUARE(l->num_lattice_site_var), site_size=4*link_size; +#ifdef OPTIMIZED_COARSE_NEIGHBOR_COUPLING_PRECISION + int column_offset = 2*SIMD_LENGTH_PRECISION*((l->num_parent_eig_vect+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); + int vectorized_link_offset = 4*l->num_parent_eig_vect*column_offset; + + for ( int mu=0; mu<4; mu++ ) { + OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; + OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + 
mu*vectorized_link_offset; + // plus mu direction + for ( int i=bbl[2*mu]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, -1, l ); + } + // minus mu direction + for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; + int neighbor_index = s->block[k].bt[i+1]; + vector_PRECISION phi_pt = phi + n*neighbor_index; + vector_PRECISION eta_pt = eta + n*index; + coarse_pn_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, -1, l ); + } + } +#else + int link_size = SQUARE(2*l->num_parent_eig_vect), site_size=4*link_size; config_PRECISION D = s->op.D; for ( int mu=0; mu<4; mu++ ) { @@ -1005,7 +1339,7 @@ void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISIO vector_PRECISION phi_pt = phi + n*neighbor_index; vector_PRECISION eta_pt = eta + n*index; config_PRECISION D_pt = D + site_size*index + link_size*mu; - coarse_n_hopp_PRECISION( eta_pt, phi_pt, D_pt, l ); + coarse_pn_hopp_PRECISION( eta_pt, phi_pt, D_pt, -1, l ); } // minus mu direction for ( int i=bbl[2*mu+1]; inum_inner_lattice_sites, *tt = s->op.translation_table; - config_PRECISION D_out_pt, clover_out_pt; - config_double D_in_pt = op_in->D, clover_in_pt = op_in->clover; + config_PRECISION D_out_pt, clover_out_pt, odd_proj_out_pt; + config_double D_in_pt = op_in->D, clover_in_pt = op_in->clover, odd_proj_in_pt = op_in->odd_proj; + s->op.m0 = op_in->m0; + for ( i=0; iop.D + 36*index; - FOR36( *D_out_pt = (complex_PRECISION) *D_in_pt; D_out_pt++; D_in_pt++; ) + FOR36( *D_out_pt = (complex_PRECISION) *D_in_pt; D_out_pt++; D_in_pt++; ); } if ( g.csw != 0 ) { for ( i=0; iop.clover + 42*index; - FOR42( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) + FOR42( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ); } } else { for ( i=0; iop.clover + 12*index; - FOR12( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) + FOR12( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ); } } -#ifdef HAVE_TM - config_PRECISION tm_term_out_pt, odd_proj_out_pt; - config_double tm_term_in_pt = op_in->tm_term, odd_proj_in_pt = op_in->odd_proj; - for ( i=0; iop.tm_term + 12*index; - FOR12( *tm_term_out_pt = (complex_PRECISION) *tm_term_in_pt; tm_term_out_pt++; tm_term_in_pt++; ) - } - for ( i=0; iop.odd_proj + 12*index; - FOR12( *odd_proj_out_pt = (complex_PRECISION) *odd_proj_in_pt; odd_proj_out_pt++; odd_proj_in_pt++; ) + FOR12( *odd_proj_out_pt = (complex_PRECISION) *odd_proj_in_pt; odd_proj_out_pt++; odd_proj_in_pt++; ); } + +#ifdef HAVE_TM + tm_term_PRECISION_setup( (PRECISION) (g.mu_factor[l->depth]*op_in->mu), (PRECISION) (g.mu_factor[l->depth]*op_in->mu_even_shift), (PRECISION) (g.mu_factor[l->depth]*op_in->mu_odd_shift), &(s->op), l, no_threading ); #endif - - if ( g.odd_even ) - schwarz_PRECISION_oddeven_setup( &(s->op), l ); - + +#ifdef HAVE_TM1p1 + epsbar_term_PRECISION_setup( (PRECISION) (g.epsbar_factor[l->depth]*op_in->epsbar), (PRECISION) (g.epsbar_factor[l->depth]*op_in->epsbar_ig5_even_shift), (PRECISION) (g.epsbar_factor[l->depth]*op_in->epsbar_ig5_odd_shift), &(s->op), l, no_threading ); +#endif + schwarz_PRECISION_boundary_update( s, l ); + + operator_PRECISION_set_couplings( &(s->op), l ); + + if ( g.method >= 4 && g.odd_even ) + 
oddeven_setup_PRECISION( &(g.op_double), l ); + else + schwarz_PRECISION_oddeven_setup( s, l ); + } -#endif void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION eta, const int cycles, int res, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { @@ -1083,10 +1421,11 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v int k, mu, i, nb = s->num_blocks; vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3, latest_iter2 = s->buf5, swap = NULL; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, - (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + (* block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + int nb_thread_start; int nb_thread_end; @@ -1096,17 +1435,13 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); - vector_PRECISION_define( x, 0, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_define_zero( x, 0, nb*s->block_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); } else { vector_PRECISION_copy( x, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); vector_PRECISION_copy( latest_iter, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); } - START_MASTER(threading) - if ( res == _NO_RES ) { - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - } - END_MASTER(threading) SYNC_CORES(threading) @@ -1132,7 +1467,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v boundary_op( Dphi, latest_iter, i, s, l, no_threading ); vector_PRECISION_minus( r, eta, Dphi, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); - } else { + } else { n_boundary_op( r, latest_iter, i, s, l ); } } @@ -1176,7 +1511,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v for ( i=nb_thread_start; irelax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); else vector_PRECISION_copy( phi, x, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); } @@ -1198,7 +1533,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, 
s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } @@ -1217,7 +1552,7 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } } @@ -1248,9 +1583,9 @@ void additive_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, v } } } - double rnorm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); + PRECISION r_norm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); END_LOCKED_MASTER(threading) #endif @@ -1269,15 +1604,12 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRECISION Dphi = s->buf4; vector_PRECISION latest_iter = s->buf2; vector_PRECISION x = s->buf3; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; void (*communicate[2])() = {ghost_update_wait_PRECISION, ghost_update_PRECISION}; int commdir[8] = {+1,-1,-1,+1,-1,+1,+1,-1}; -#ifdef SCHWARZ_RES - int nb = s->num_blocks; -#endif SYNC_CORES(threading) @@ -1289,10 +1621,8 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, start, end, l ); - vector_PRECISION_define( x, 0, start, end, l ); - START_MASTER(threading) - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - END_MASTER(threading) + vector_PRECISION_define_zero( x, 0, l->inner_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); SYNC_CORES(threading) } else { vector_PRECISION_copy( x, phi, start, end, l ); @@ -1350,7 +1680,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, // copy phi = x if ( l->relax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, start, end, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, start, end, l, no_threading ); else vector_PRECISION_copy( phi, x, start, end, l ); @@ -1363,7 +1693,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, 
s->block[index].start*l->num_lattice_site_var, - s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } @@ -1380,7 +1710,7 @@ void red_black_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[index].start*l->num_lattice_site_var, - s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[index].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } START_MASTER(threading) PROF_PRECISION_STOP( _SM3, 1 ); @@ -1388,8 +1718,9 @@ } if ( step == 0 || step == 1 ) { START_LOCKED_MASTER(threading) - for ( mu=0; mu<4; mu++ ) + for ( mu=0; mu<4; mu++ ) { communicate[0]( latest_iter, mu, commdir[step], &(s->op.c), l ); + } END_LOCKED_MASTER(threading) } else { SYNC_CORES(threading) @@ -1399,6 +1730,7 @@ SYNC_CORES(threading) #ifdef SCHWARZ_RES + int nb = s->num_blocks; START_LOCKED_MASTER(threading) if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { @@ -1406,26 +1738,23 @@ ghost_update_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; i<nb; i++ ) { - if ( !s->block[i].no_comm ) { + for ( i=0; i<nb; i++ ) + if ( !s->block[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( latest_iter, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; i<nb; i++ ) { - if ( !s->block[i].no_comm ) { + for ( i=0; i<nb; i++ ) + if ( !s->block[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } + } - double rnorm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); + PRECISION r_norm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); END_LOCKED_MASTER(threading) #endif @@ -1443,8 +1772,8 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE vector_PRECISION Dphi = s->buf4; vector_PRECISION latest_iter = s->buf2; vector_PRECISION x = s->buf3; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; @@ -1456,17 +1785,12 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); - vector_PRECISION_define( x, 0, 
nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_define_zero( x, 0, nb*s->block_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); } else { vector_PRECISION_copy( x, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); } - START_MASTER(threading) - if ( res == _NO_RES ) { - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - } - END_MASTER(threading) - SYNC_CORES(threading) for ( k=0; krelax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); else vector_PRECISION_copy( phi, x, s->block[i].start*l->num_lattice_site_var, s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); } @@ -1581,7 +1905,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } if ( 1 == s->block[i].color ) { @@ -1589,7 +1913,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } } @@ -1608,7 +1932,7 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); if ( l->relax_fac != 1.0 ) { vector_PRECISION_scale( D_phi, D_phi, l->relax_fac, s->block[i].start*l->num_lattice_site_var, - s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l ); + s->block[i].start*l->num_lattice_site_var+s->block_vector_size, l, no_threading ); } } } @@ -1623,26 +1947,24 @@ void schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_phi, vector_PRE ghost_update_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; iblock[i].no_comm ) { + for ( i=0; iblock[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } + for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( latest_iter, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( latest_iter, mu, -1, &(s->op.c), l ); } - for ( i=0; iblock[i].no_comm ) { + for ( i=0; iblock[i].no_comm ) n_boundary_op( r, latest_iter, i, s, l ); - } - } } - double rnorm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); + + PRECISION r_norm = global_norm_PRECISION( r, 0, l->inner_vector_size, l, no_threading ); char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); 
printf0("\033[0m\n"); fflush(0); END_LOCKED_MASTER(threading) #endif @@ -1661,10 +1983,10 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p else { int color, k, mu, i, nb = s->num_blocks; vector_PRECISION r = s->buf1, Dphi = s->buf4, latest_iter = s->buf2, x = s->buf3; - void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION, - (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, - (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, - (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op, + (*n_boundary_op)() = (l->depth==0)?n_block_PRECISION_boundary_op:n_coarse_block_PRECISION_boundary_op, + (*block_solve)() = (l->depth==0&&g.odd_even)?block_solve_oddeven_PRECISION:local_minres_PRECISION; int color_to_comm[16][2] = { {T,-1}, {X,+1}, {Y,+1}, {X,-1}, {Z,+1}, {Y,-1}, {X,+1}, {Y,+1}, {T,+1}, {X,-1}, {Y,-1}, {X,+1}, {Z,-1}, {Y,+1}, {X,-1}, {Y,-1} }; @@ -1677,16 +1999,12 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p if ( res == _NO_RES ) { vector_PRECISION_copy( r, eta, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); - vector_PRECISION_define( x, 0, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_define_zero( x, 0, nb*s->block_vector_size, l, threading ); + vector_PRECISION_define_zero( x, l->inner_vector_size, l->schwarz_vector_size, l, threading ); } else { vector_PRECISION_copy( x, phi, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); } - START_MASTER(threading) - if ( res == _NO_RES ) { - vector_PRECISION_define( x, 0, l->inner_vector_size, l->schwarz_vector_size, l ); - } - END_MASTER(threading) SYNC_CORES(threading) @@ -1764,7 +2082,7 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p SYNC_CORES(threading) if ( l->relax_fac != 1.0 ) - vector_PRECISION_scale( phi, x, l->relax_fac, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); + vector_PRECISION_scale( phi, x, l->relax_fac, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l, no_threading ); else vector_PRECISION_copy( phi, x, nb_thread_start*s->block_vector_size, nb_thread_end*s->block_vector_size, l ); @@ -1773,8 +2091,11 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p #ifdef SCHWARZ_RES START_LOCKED_MASTER(threading) vector_PRECISION true_r = NULL; + PUBLIC_MALLOC( true_r, complex_PRECISION, l->vector_size ); - vector_PRECISION_define( true_r, 0, 0, l->inner_vector_size, l ); + vector_PRECISION_define_zero( true_r, 0, l->inner_vector_size, l, no_threading ); + + if ( D_phi == NULL ) { for ( mu=0; mu<4; mu++ ) { ghost_update_PRECISION( x, mu, +1, &(s->op.c), l ); @@ -1791,11 +2112,12 @@ void sixteen_color_schwarz_PRECISION( vector_PRECISION phi, vector_PRECISION D_p boundary_op( true_r, x, i, s, l ); } } - vector_PRECISION_saxpy( true_r, eta, true_r, -1, 0, l->inner_vector_size, l ); - double rnorm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ) - / global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading ); + 
vector_PRECISION_saxpy( true_r, eta, true_r, -1, 0, l->inner_vector_size, l, no_threading ); + PRECISION r_norm = global_norm_PRECISION( true_r, 0, l->inner_vector_size, l, no_threading ), + den = global_norm_PRECISION( eta, 0, l->inner_vector_size, l, no_threading ); + r_norm/=den; char number[3]; sprintf( number, "%2d", 31+l->depth ); printf0("\033[1;%2sm|", number ); - printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, rnorm ); + printf0(" ---- depth: %d, c: %d, schwarz iter %2d, norm: %11.6le |", l->depth, s->num_colors, k, r_norm ); printf0("\033[0m\n"); fflush(0); PUBLIC_FREE( true_r, complex_PRECISION, l->vector_size ); END_LOCKED_MASTER(threading) @@ -1816,6 +2138,16 @@ void trans_PRECISION( vector_PRECISION out, vector_double in, int *tt, level_str // this function seems to do some data reordering, barriers ensure that everything is in sync SYNC_CORES(threading) START_NO_HYPERTHREADS(threading) +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + for ( i=start; inum_blocks; + int svs = l->schwarz_vector_size; + int ivs = l->inner_vector_size; + int vs = l->vector_size; + void (*block_op)() = (l->depth==0)?block_d_plus_clover_PRECISION:coarse_block_operator_PRECISION; - void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op; void (*op)() = (l->depth==0)?d_plus_clover_PRECISION:apply_coarse_operator_PRECISION; + void (*boundary_op)() = (l->depth==0)?block_PRECISION_boundary_op:coarse_block_PRECISION_boundary_op; vector_PRECISION v1 = NULL, v2 = NULL, v3 = NULL; - PRECISION norm; - - MALLOC( v1, complex_PRECISION, l->schwarz_vector_size ); - MALLOC( v2, complex_PRECISION, l->vector_size ); - MALLOC( v3, complex_PRECISION, l->vector_size ); - - vector_PRECISION_define_random( v1, 0, l->inner_vector_size, l ); + PRECISION diff; + MALLOC( v1, complex_PRECISION, svs ); + MALLOC( v2, complex_PRECISION, vs ); + MALLOC( v3, complex_PRECISION, vs ); + + vector_PRECISION_define_random( v1, 0, ivs, l, no_threading ); + op( v3, v1, &(s->op), l, no_threading ); - + for ( mu=0; mu<4; mu++ ) { ghost_update_PRECISION( v1, mu, +1, &(s->op.c), l ); ghost_update_PRECISION( v1, mu, -1, &(s->op.c), l ); } - + for ( mu=0; mu<4; mu++ ) { ghost_update_wait_PRECISION( v1, mu, +1, &(s->op.c), l ); ghost_update_wait_PRECISION( v1, mu, -1, &(s->op.c), l ); } - + for ( i=0; iblock[i].start*l->num_lattice_site_var, s, l, no_threading ); boundary_op( v2, v1, i, s, l, no_threading ); } - + vector_PRECISION_minus( v3, v3, v2, 0, l->inner_vector_size, l ); - norm = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading ); + diff = global_norm_PRECISION( v3, 0, l->inner_vector_size, l, no_threading ) / + global_norm_PRECISION( v2, 0, l->inner_vector_size, l, no_threading ); - printf0("depth: %d, correctness of local residual vector: %le\n", l->depth, norm ); - if(norm > g.test) g.test = norm; - + test0_PRECISION("depth: %d, correctness of local residual vector: %le\n", l->depth, diff ); + FREE( v1, complex_PRECISION, l->schwarz_vector_size ); FREE( v2, complex_PRECISION, l->vector_size ); FREE( v3, complex_PRECISION, l->vector_size ); diff --git a/src/schwarz_generic.h b/src/schwarz_generic.h index 2834fc5..fab1613 100644 --- a/src/schwarz_generic.h +++ b/src/schwarz_generic.h @@ -73,5 +73,23 @@ struct Thread; return site_index( coord[T], coord[Z], coord[Y], coord[X], dt, it ); } } - + +#ifdef OPTIMIZED_NEIGHBOR_COUPLING_float +static inline void 
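/*
 * The SCHWARZ_RES block above reports the relative residual of the current
 * Schwarz iterate: true_r accumulates D*x, the saxpy with factor -1 turns it
 * into eta - D*x, and the two global norms give ||eta - D*x|| / ||eta||.
 * A scalar illustration of the same quantity (hypothetical helper, not part
 * of the patch):
 */
#include <complex.h>
#include <math.h>

static double relative_residual_example( const double complex *eta, const double complex *Dx, int n ) {
  double num = 0.0, den = 0.0;
  for ( int i = 0; i < n; i++ ) {
    double complex r = eta[i] - Dx[i];                               // residual component
    num += creal( r )*creal( r ) + cimag( r )*cimag( r );           // ||eta - D*x||^2
    den += creal( eta[i] )*creal( eta[i] ) + cimag( eta[i] )*cimag( eta[i] );  // ||eta||^2
  }
  return sqrt( num / den );
}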
set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { + // out1: column major, out2: row major + for ( int i=0; i<3; i++ ) { // column + for ( int j=0; j<3; j++ ) { // row + out1[8*i +j] = creal_PRECISION(in[3*j+i]); + out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); + out2[8*i +j] = creal_PRECISION(in[j+3*i]); + out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); + } + out1[8*i+3] = 0.0; + out1[8*i+7] = 0.0; + out2[8*i+3] = 0.0; + out2[8*i+7] = 0.0; + } +} +#endif + #endif diff --git a/src/setup_generic.c b/src/setup_generic.c index 6280f98..61013da 100644 --- a/src/setup_generic.c +++ b/src/setup_generic.c @@ -32,7 +32,7 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr START_LOCKED_MASTER(threading) coarse_operator_PRECISION_alloc( l ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); END_LOCKED_MASTER(threading) #else @@ -53,16 +53,6 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr } else { interpolation_PRECISION_dummy_alloc( l->next_level ); } - -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; - - if( g.tm_mu_factor[l->next_level->depth]!=g.tm_mu_factor[l->depth] ) - tm_term_PRECISION_setup( l->next_level->op_PRECISION.tm_term, l->next_level->op_PRECISION.odd_proj, l->next_level, no_threading ); -#endif - conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) @@ -73,8 +63,9 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { START_LOCKED_MASTER(threading) - coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level ); + coarse_oddeven_alloc_PRECISION( l->next_level ); END_LOCKED_MASTER(threading) + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); START_LOCKED_MASTER(threading) @@ -83,8 +74,9 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { START_LOCKED_MASTER(threading) - coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level ); + coarse_oddeven_alloc_PRECISION( l->next_level ); END_LOCKED_MASTER(threading) + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -101,13 +93,12 @@ void coarse_grid_correction_PRECISION_setup( level_struct *l, struct Thread *thr for ( int i=0; inext_level->num_eig_vect,l->num_eig_vect); i++ ) { restrict_PRECISION( l->next_level->is_PRECISION.test_vector[i], l->is_PRECISION.test_vector[i], l, threading ); } - START_LOCKED_MASTER(threading) for ( int i=MIN(l->next_level->num_eig_vect,l->num_eig_vect); inext_level->num_eig_vect; i++ ) { if ( !l->next_level->idle ) vector_PRECISION_define_random( 
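/*
 * Reference for the layout produced by set_PRECISION_D_vectorized above:
 * block i of out1 holds column i of the 3x3 link matrix as 4 padded reals
 * followed by 4 padded imaginaries (the 4th entry of each group of 4 is the
 * zero padding). The helper below is only a scalar illustration of that
 * layout; packed_colmajor_mvm_float is a hypothetical name, not part of the
 * library or of this patch.
 */
static inline void packed_colmajor_mvm_float( float *y_re, float *y_im, const float *out1,
                                              const float *x_re, const float *x_im ) {
  for ( int j=0; j<3; j++ ) { y_re[j] = 0.0f; y_im[j] = 0.0f; }
  for ( int i=0; i<3; i++ )      // column index, as in the packing loop above
    for ( int j=0; j<3; j++ ) {  // row index
      float m_re = out1[8*i  +j];   // Re of element (row j, column i)
      float m_im = out1[8*i+4+j];   // Im of element (row j, column i)
      y_re[j] += m_re*x_re[i] - m_im*x_im[i];  // complex multiply-accumulate y += M*x
      y_im[j] += m_re*x_im[i] + m_im*x_re[i];
    }
}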
l->next_level->is_PRECISION.test_vector[i], 0, - l->next_level->inner_vector_size, l->next_level ); + l->next_level->inner_vector_size, l->next_level, threading ); } - END_LOCKED_MASTER(threading) + SYNC_CORES(threading); } if ( !l->next_level->idle ) interpolation_PRECISION_define( NULL, l->next_level, threading ); @@ -226,19 +217,14 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T for ( k=0; kdepth == 0 ) { - START_LOCKED_MASTER(threading) - vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_PRECISION_define_random( l->is_PRECISION.test_vector[k], 0, l->inner_vector_size, l, threading ); } - - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], - 1, _NO_RES, _NO_SHIFT, l, threading ); + SYNC_CORES(threading) + smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], 1, _NO_RES, l, threading ); vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], - g.method>=4?1:2, _NO_RES, _NO_SHIFT, l, threading ); + smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:2, _NO_RES, l, threading ); vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); - smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], - g.method>=4?1:3, _NO_RES, _NO_SHIFT, l, threading ); + smoother_PRECISION( buffer[0], NULL, l->is_PRECISION.test_vector[k], g.method>=4?1:3, _NO_RES, l, threading ); vector_PRECISION_copy( l->is_PRECISION.test_vector[k], buffer[0], start, end, l ); pc += 6; @@ -270,16 +256,16 @@ void interpolation_PRECISION_define( vector_double *V, level_struct *l, struct T } } -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifndef OPTIMIZED_INTERPOLATION_SETUP_PRECISION for ( k=0; kis_PRECISION.interpolation[k], l->is_PRECISION.test_vector[k], start, end, l ); #endif testvector_analysis_PRECISION( l->is_PRECISION.test_vector, l, threading ); -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, n, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, n, l, threading ); #else gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, n, l, threading ); define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); @@ -292,17 +278,17 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { if ( l->level > 0 ) { if ( !l->idle ) { -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); START_LOCKED_MASTER(threading) #else for ( int i=0; inum_eig_vect; i++ ) { 
vector_PRECISION_copy( l->is_PRECISION.interpolation[i], l->is_PRECISION.test_vector[i], - threading->start_index[l->depth], threading->end_index[l->depth], l ); + threading->start_index[l->depth], threading->end_index[l->depth], l ); } gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.interpolation, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) @@ -310,15 +296,6 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { define_interpolation_PRECISION_operator( l->is_PRECISION.interpolation, l, threading ); START_LOCKED_MASTER(threading) coarse_operator_PRECISION_setup( l->is_PRECISION.interpolation, l ); -#endif -#ifdef HAVE_TM - l->next_level->tm_shift = g.tm_mu*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_even_shift = g.tm_mu_even_shift*g.tm_mu_factor[l->next_level->depth]; - l->next_level->tm_odd_shift = g.tm_mu_odd_shift*g.tm_mu_factor[l->next_level->depth]; - - if( g.tm_mu_factor[l->next_level->depth]!=g.tm_mu_factor[l->depth] ) - tm_term_PRECISION_setup( l->next_level->op_PRECISION.tm_term, l->next_level->op_PRECISION.odd_proj, - l->next_level, no_threading ); #endif conf_PRECISION_gather( &(l->next_level->s_PRECISION.op), &(l->next_level->op_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) @@ -327,13 +304,13 @@ void re_setup_PRECISION( level_struct *l, struct Thread *threading ) { schwarz_PRECISION_boundary_update( &(l->next_level->s_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } else { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -379,14 +356,15 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s fgmres_PRECISION( &gmres, l->next_level, threading ); } } - interpolate3_PRECISION( buf1, gmres.x, l, threading ); - smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, _NO_SHIFT, l, threading ); + vector_PRECISION_define_zero( buf1, 0, l->inner_vector_size, l, threading ); + interpolate_PRECISION( buf1, gmres.x, l, threading ); + smoother_PRECISION( buf1, NULL, l->is_PRECISION.test_vector[i], l->post_smooth_iter, _RES, l, threading ); vector_PRECISION_real_scale( l->is_PRECISION.test_vector[i], buf1, 1.0/global_norm_PRECISION( buf1, 0, l->inner_vector_size, l, threading ), threading->start_index[l->depth], threading->end_index[l->depth], l ); pc += l->post_smooth_iter; #ifdef DEBUG - START_MASTER(threading) + START_MASTER(threading) if ( pc >= 0.2*pi*pn ) { printf0("%4d%% |", 20*pi); fflush(0); pi++; } END_MASTER(threading) #endif @@ -397,11 +375,11 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s END_MASTER(threading) #endif -#ifdef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION +#ifdef OPTIMIZED_INTERPOLATION_SETUP_PRECISION 
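/*
 * The setup routines here repeatedly call gram_schmidt_on_aggregates_PRECISION:
 * within every aggregate, the test vectors are orthonormalized against each
 * other, so the interpolation built from them has orthonormal columns per
 * aggregate. A minimal scalar sketch of that idea (hypothetical helper,
 * assuming the n vectors are already restricted to one aggregate of length len):
 */
#include <complex.h>
#include <math.h>

static void gram_schmidt_one_aggregate( float complex **v, int n, int len ) {
  for ( int i=0; i<n; i++ ) {
    // project out the previously orthonormalized vectors
    for ( int j=0; j<i; j++ ) {
      float complex ip = 0;
      for ( int k=0; k<len; k++ ) ip += conjf( v[j][k] ) * v[i][k];
      for ( int k=0; k<len; k++ ) v[i][k] -= ip * v[j][k];
    }
    // normalize the remainder
    float nrm = 0.0f;
    for ( int k=0; k<len; k++ )
      nrm += crealf( v[i][k] )*crealf( v[i][k] ) + cimagf( v[i][k] )*cimagf( v[i][k] );
    nrm = sqrtf( nrm );
    for ( int k=0; k<len; k++ ) v[i][k] /= nrm;
  }
}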
define_interpolation_PRECISION_operator( l->is_PRECISION.test_vector, l, threading ); - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); if ( l->depth > 0 ) - gram_schmidt_on_aggregates_PRECISION_vectorized( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); + gram_schmidt_on_aggregates_PRECISION( l->is_PRECISION.operator, l->num_eig_vect, l, threading ); coarse_operator_PRECISION_setup_vectorized( l->is_PRECISION.operator, l, threading ); START_LOCKED_MASTER(threading) #else @@ -422,13 +400,13 @@ void inv_iter_2lvl_extension_setup_PRECISION( int setup_iter, level_struct *l, s schwarz_PRECISION_boundary_update( &(l->next_level->s_PRECISION), l->next_level ); END_LOCKED_MASTER(threading) if ( g.method >= 4 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _REORDER, l->next_level, threading ); } else { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } } if ( !l->next_level->idle && l->next_level->level == 0 && g.odd_even ) { - coarse_oddeven_re_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); + coarse_oddeven_setup_PRECISION( &(l->next_level->s_PRECISION.op), _NO_REORDERING, l->next_level, threading ); } else if ( !l->next_level->idle && l->next_level->level == 0 ) { coarse_operator_PRECISION_set_couplings( &(l->next_level->s_PRECISION.op), l->next_level, threading ); } @@ -552,7 +530,7 @@ void testvector_analysis_PRECISION( vector_PRECISION *test_vectors, level_struct coarse_gamma5_PRECISION( l->vbuf_PRECISION[0], l->vbuf_PRECISION[3], 0, l->inner_vector_size, l ); lambda = global_inner_product_PRECISION( test_vectors[i], l->vbuf_PRECISION[0], 0, l->inner_vector_size, l, no_threading ); lambda /= global_inner_product_PRECISION( test_vectors[i], test_vectors[i], 0, l->inner_vector_size, l, no_threading ); - vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l ); + vector_PRECISION_saxpy( l->vbuf_PRECISION[1], l->vbuf_PRECISION[0], test_vectors[i], -lambda, 0, l->inner_vector_size, l, no_threading ); mu = global_norm_PRECISION( l->vbuf_PRECISION[1], 0, l->inner_vector_size, l, no_threading )/global_norm_PRECISION( test_vectors[i], 0, l->inner_vector_size, l, no_threading ); printf0("singular value: %+lf%+lfi, singular vector precision: %le\n", (double)creal(lambda), (double)cimag(lambda), (double)mu ); } diff --git a/src/simd_avx_intrinsic.h b/src/simd_avx_intrinsic.h new file mode 100644 index 0000000..f017e80 --- /dev/null +++ b/src/simd_avx_intrinsic.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2016 Simone Bacchio. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef SIMD_AVX_INTRINSIC_HEADER +#define SIMD_AVX_INTRINSIC_HEADER + +#include "immintrin.h" +#include "xmmintrin.h" +#include "emmintrin.h" +#include "pmmintrin.h" + +#define SIMD _AVX +#define SIMD_LENGTH_float 8 +#define SIMD_LENGTH_double 4 +#define mm_FOR_float(e) { e e e e e e e e } +#define mm_FOR_double(e) { e e e e } + +#define mm_float __m256 +#define mm_double __m256d + +#define mm_mul_float _mm256_mul_ps +#define mm_mul_double _mm256_mul_pd +#define mm_add_float _mm256_add_ps +#define mm_add_double _mm256_add_pd +#define mm_sub_float _mm256_sub_ps +#define mm_sub_double _mm256_sub_pd +#define mm_and_float _mm256_and_ps +#define mm_and_double _mm256_and_pd + +#define mm_setzero_float _mm256_setzero_ps +#define mm_setzero_double _mm256_setzero_pd +#define mm_setr_float _mm256_setr_ps +#define mm_setr_double _mm256_setr_pd +#define mm_set1_float _mm256_set1_ps +#define mm_set1_double _mm256_set1_pd +#define mm_load_float _mm256_load_ps +#define mm_load_double _mm256_load_pd +#define mm_unpacklo_float _mm256_unpacklo_ps +#define mm_unpacklo_double _mm256_unpacklo_pd +#define mm_unpackhi_float _mm256_unpackhi_ps +#define mm_unpackhi_double _mm256_unpackhi_pd +#define mm_store_float _mm256_store_ps +#define mm_store_double _mm256_store_pd + +#ifdef _FMA_ + +#define mm_fmadd_float _mm256_fmadd_ps +#define mm_fmadd_double _mm256_fmadd_pd +#define mm_fnmadd_float _mm256_fnmadd_ps +#define mm_fnmadd_double _mm256_fnmadd_pd +#define mm_fmsub_float _mm256_fmsub_ps +#define mm_fmsub_double _mm256_fmsub_pd +#define mm_fnmsub_float _mm256_fnmsub_ps +#define mm_fnmsub_double _mm256_fnmsub_pd + +#endif + +// Load even components +static inline mm_float mm_seti_float( float *data, const int i ) { + return mm_setr_float( data[0*i], data[1*i], data[2*i], data[3*i], data[4*i], data[5*i], data[6*i], data[7*i] ); +} +static inline mm_double mm_seti_double( double *data, const int i ) { + return mm_setr_double( data[0*i], data[1*i], data[2*i], data[3*i] ); +} + +// Loading 6 time the same component and then jumping 12 components +static inline void mm_set1_6times_float( float *data, mm_float *pack1of3, mm_float *pack2of3, + mm_float *pack3of3, const int skip ) { + *pack1of3 = mm_setr_float( data[0*i+0*skip], data[1*i+0*skip], data[2*i+0*skip], data[3*i+0*skip], + data[4*i+0*skip], data[5*i+0*skip], data[0*i+1*skip], data[1*i+1*skip] ); + *pack2of3 = mm_setr_float( data[2*i+1*skip], data[3*i+1*skip], data[4*i+1*skip], data[5*i+1*skip], + data[0*i+2*skip], data[1*i+2*skip], data[2*i+2*skip], data[3*i+2*skip] ); + *pack3of3 = mm_setr_float( data[4*i+2*skip], data[5*i+2*skip], data[0*i+3*skip], data[1*i+3*skip], + data[2*i+3*skip], data[3*i+3*skip], data[4*i+3*skip], data[5*i+3*skip] ); +} +static inline void mm_loadi_6times_double( double *data, mm_double *pack1of3, mm_double *pack2of3, + mm_double *pack3of3, const int i, const int skip ) { + *pack1of3 = mm_setr_double( data[0*i+0*skip], data[1*i+0*skip], data[2*i+0*skip], data[3*i+0*skip] ); + *pack2of3 = mm_setr_double( data[4*i+0*skip], data[5*i+0*skip], data[0*i+1*skip], data[1*i+1*skip] ); + *pack3of3 = mm_setr_double( data[2*i+1*skip], data[3*i+1*skip], data[4*i+1*skip], data[5*i+1*skip] ); +} + +static inline mm_float mm_set_from_list_float( float *data, float *alpha, int *list ) { + return mm_setr_float( alpha[0]*data[list[0]], alpha[1]*data[list[1]], 
alpha[2]*data[list[2]], alpha[3]*data[list[3]], + alpha[4]*data[list[4]], alpha[5]*data[list[5]], alpha[6]*data[list[6]], alpha[7]*data[list[7]] ); +} +static inline mm_double mm_set_from_list_double( double *data, double *alpha, int *list ) { + return mm_setr_double( alpha[0]*data[list[0]], alpha[1]*data[list[1]], alpha[2]*data[list[2]], alpha[3]*data[list[3]] ); +} + +// Sum all components of mm_PRECISION +static inline float mm_reduce_add_float( mm_float v) { + __m128 vlow = _mm256_castps256_ps128(v); + __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 + vlow = _mm_add_ps(vlow, vhigh); // add the low 128 + // same of SSE + __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 + __m128 sums = _mm_add_ps(v, shuf); + shuf = _mm_movehl_ps(shuf, sums); // high half -> low half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} +static inline double mm_reduce_add_double( mm_double v ) { + __m128d vlow = _mm256_castpd256_pd128(v); + __m128d vhigh = _mm256_extractf128_pd(v, 1); + vlow = _mm_add_pd(vlow, vhigh); + // same of SSE + double tmp; + _mm_storeh_pd(&tmp, vlow); // store the high half + return _mm_cvtsd_f64(vlow) + tmp; // cast the low half and sum +} + +// Transpose a block of SIMD_LENGTH * SIMD_LENGTH +static inline void mm_transpose_float( mm_float *data ) { + mm_float __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; + mm_float __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; + __t0 = _mm256_unpacklo_ps(data[0], data[1]); + __t1 = _mm256_unpackhi_ps(data[0], data[1]); + __t2 = _mm256_unpacklo_ps(data[2], data[3]); + __t3 = _mm256_unpackhi_ps(data[2], data[3]); + __t4 = _mm256_unpacklo_ps(data[4], data[5]); + __t5 = _mm256_unpackhi_ps(data[4], data[5]); + __t6 = _mm256_unpacklo_ps(data[6], data[7]); + __t7 = _mm256_unpackhi_ps(data[6], data[7]); + __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); + __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); + __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); + __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); + __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); + __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); + __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); + __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); + data[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); + data[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); + data[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); + data[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); + data[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); + data[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); + data[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); + data[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); +} +static inline void mm_transpose_double( mm_double *data) +{ + mm_double tmp[4]; + + tmp[0] = _mm256_unpacklo_pd( data[0], data[1] ); + tmp[1] = _mm256_unpacklo_pd( data[2], data[3] ); + tmp[2] = _mm256_unpackhi_pd( data[0], data[1] ); + tmp[3] = _mm256_unpackhi_pd( data[2], data[3] ); + //TODO + data[0] = _mm256_movelh_pd( tmp[0], tmp[1] ); + data[1] = _mm256_movehl_pd( tmp[1], tmp[0] ); + data[2] = _mm256_movelh_pd( tmp[2], tmp[3] ); + data[3] = _mm256_movehl_pd( tmp[3], tmp[2] ); +} + +#endif diff --git a/src/simd_blas_generic.h b/src/simd_blas_generic.h new file mode 100644 index 0000000..d54b807 --- /dev/null +++ b/src/simd_blas_generic.h @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. 
+ * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef SIMD_BLAS_PRECISION_HEADER +#define SIMD_BLAS_PRECISION_HEADER + +static inline void cgem_inverse_PRECISION( const int N, PRECISION *A_inverse, PRECISION *A, int lda ) { + + // generate LU decomp in A + int i, j, k; + complex_PRECISION alpha; + + complex_PRECISION tmpA[N*N]; + complex_PRECISION tmpA_inverse[N*N]; + + for ( j=0; j0 ) + b[k-1] = 0; + + for ( i=0; i=0; i-- ) { + for ( j=i+1; j= j*offset; i -= SIMD_LENGTH_PRECISION ) { + ip = i%offset + 2*(i/offset)*padded; + cstore_PRECISION( C+2*ip, C1_re[i/SIMD_LENGTH_PRECISION], C1_im[i/SIMD_LENGTH_PRECISION] ); + cstore_PRECISION( C+2*(ip+padded), C2_re[i/SIMD_LENGTH_PRECISION], C2_im[i/SIMD_LENGTH_PRECISION] ); + } + } + } else { +#endif + mm_PRECISION A_re; + mm_PRECISION A_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION C_re[lda/SIMD_LENGTH_PRECISION]; + mm_PRECISION C_im[lda/SIMD_LENGTH_PRECISION]; + + // deinterleaved load + for ( i=0; i= j*offset; i -= SIMD_LENGTH_PRECISION ) { + ip = i%offset + 2*(i/offset)*padded; + cstore_PRECISION( C+2*ip, C1_re[i/SIMD_LENGTH_PRECISION], C1_im[i/SIMD_LENGTH_PRECISION] ); + cstore_PRECISION( C+2*(ip+padded), C2_re[i/SIMD_LENGTH_PRECISION], C2_im[i/SIMD_LENGTH_PRECISION] ); + } + } + } else { +#endif + mm_PRECISION A_re; + mm_PRECISION A_im; + mm_PRECISION B_re; + mm_PRECISION B_im; + mm_PRECISION C_re[lda/SIMD_LENGTH_PRECISION]; + mm_PRECISION C_im[lda/SIMD_LENGTH_PRECISION]; + + // deinterleaved load + for ( i=0; i low half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} +static inline double mm_reduce_add_double( mm_double v ) { + double tmp; + _mm_storeh_pd(&tmp, v); // store the high half + return _mm_cvtsd_f64(v) + tmp; // cast the low half and sum +} + +// Transpose a block of SIMD_LENGTH * SIMD_LENGTH +static inline void mm_transpose_float( mm_float *data ) { + _MM_TRANSPOSE4_PS(data[0],data[1],data[2],data[3]); +} +static inline void mm_transpose_double( mm_double *data ) { + double tmp01, tmp10 = _mm_cvtsd_f64(data[1]); + _mm_storeh_pd(&tmp01, data[0]); + _mm_loadl_pd(data[1], &tmp01); + _mm_loadh_pd(data[0], &tmp10); +} + +#endif diff --git a/src/simd_vectorization_control.h b/src/simd_vectorization_control.h new file mode 100644 index 0000000..0fbee29 --- /dev/null +++ b/src/simd_vectorization_control.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. + * + * This file is part of the DDalphaAMG solver library. + * + * The DDalphaAMG solver library is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
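/*
 * cgem_inverse_PRECISION above follows the classic small-matrix scheme:
 * factor A = LU in place (no pivoting) and then solve L*y = e_k, U*x = y for
 * every unit vector e_k to obtain the columns of A^{-1}. A compact scalar
 * reference of the same scheme (hypothetical helper, complex float,
 * row-major storage; A is overwritten by its LU factors):
 */
#include <complex.h>

static void lu_inverse_cf( int N, float complex *A_inv, float complex *A ) {
  // in-place LU factorization, multipliers stored in the strict lower triangle
  for ( int k=0; k<N; k++ )
    for ( int i=k+1; i<N; i++ ) {
      A[i*N+k] /= A[k*N+k];
      for ( int j=k+1; j<N; j++ )
        A[i*N+j] -= A[i*N+k]*A[k*N+j];
    }
  for ( int k=0; k<N; k++ ) {           // column k of the inverse
    float complex x[N];
    for ( int i=0; i<N; i++ ) {         // forward solve L*x = e_k (L has unit diagonal)
      x[i] = ( i==k ) ? 1.0f : 0.0f;
      for ( int j=0; j<i; j++ ) x[i] -= A[i*N+j]*x[j];
    }
    for ( int i=N-1; i>=0; i-- ) {      // backward solve U*x = x
      for ( int j=i+1; j<N; j++ ) x[i] -= A[i*N+j]*x[j];
      x[i] /= A[i*N+i];
    }
    for ( int i=0; i<N; i++ ) A_inv[i*N+k] = x[i];
  }
}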
+ * + * The DDalphaAMG solver library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * You should have received a copy of the GNU General Public License + * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. + * + */ + +#ifndef SIMD_VECTORIZATION_CONTROL_HEADER +#define SIMD_VECTORIZATION_CONTROL_HEADER + +#ifdef NOT_YET//__AVX__ +#include "simd_avx_intrinsic.h" +#elif defined SSE //__SSE__ +#include "simd_sse_intrinsic.h" +#endif + +#ifdef SIMD + +#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float +#define OPTIMIZED_COARSE_SELF_COUPLING_float +#define OPTIMIZED_INTERPOLATION_OPERATOR_float +#define OPTIMIZED_INTERPOLATION_SETUP_float +#define OPTIMIZED_NEIGHBOR_COUPLING_double +#define OPTIMIZED_NEIGHBOR_COUPLING_float +//#define OPTIMIZED_SELF_COUPLING_double +#define OPTIMIZED_SELF_COUPLING_float +#define OPTIMIZED_LINALG_float +#define OPTIMIZED_LINALG_double + +#define OPERATOR_COMPONENT_OFFSET_float (SIMD_LENGTH_float *((l->num_eig_vect+SIMD_LENGTH_float -1)/SIMD_LENGTH_float )) +#define OPERATOR_COMPONENT_OFFSET_double (SIMD_LENGTH_double*((l->num_eig_vect+SIMD_LENGTH_double-1)/SIMD_LENGTH_double)) + +#define OPERATOR_TYPE_float float +#define OPERATOR_TYPE_double double + +#endif + +#ifndef __FMA__ +// a*b + c +static inline mm_double mm_fmadd_double( mm_double a, mm_double b, mm_double c ) { + return mm_add_double( mm_mul_double( a, b ), c ); +} +static inline mm_float mm_fmadd_float( mm_float a, mm_float b, mm_float c ) { + return mm_add_float( mm_mul_float( a, b ), c ); +} + +// -a*b + c +static inline mm_double mm_fnmadd_double( mm_double a, mm_double b, mm_double c ) { + return mm_sub_double( c, mm_mul_double( a, b ) ); +} +static inline mm_float mm_fnmadd_float( mm_float a, mm_float b, mm_float c ) { + return mm_sub_float( c, mm_mul_float( a, b ) ); +} + +// a*b - c +static inline mm_double mm_fmsub_double( mm_double a, mm_double b, mm_double c ) { + return mm_sub_double( mm_mul_double( a, b ), c ); +} +static inline mm_float mm_fmsub_float( mm_float a, mm_float b, mm_float c ) { + return mm_sub_float( mm_mul_float( a, b ), c ); +} + +// res = -a*b - c +static inline mm_double mm_fnmsub_double( mm_double a, mm_double b, mm_double c ) { + mm_double na = mm_sub_double( mm_setzero_double(), a ); + return mm_sub_double( mm_mul_double( na, b ), c ); +} +static inline mm_float mm_fnmsub_float( mm_float a, mm_float b, mm_float c ) { + mm_float na = mm_sub_float( mm_setzero_float(), a ); + return mm_sub_float( mm_mul_float( na, b ), c ); +} +#endif + +#endif // SIMD_VECTORIZATION_CONTROL_HEADER diff --git a/src/solver_analysis.c b/src/solver_analysis.c index ab07aa8..325165e 100644 --- a/src/solver_analysis.c +++ b/src/solver_analysis.c @@ -24,8 +24,22 @@ void test_routine( level_struct *l, struct Thread *threading ) { - g.test = 0; - if ( g.method > 0 ) { + if ( g.method >= 0 ) { + START_MASTER(threading) + g.test = 0; + if ( l->depth == 0 ) { +#ifdef HAVE_TM1p1 + if( g.n_flavours==2 ) + printf0("\nRunning tests with D = TM doublet operator:\n"); + else +#endif +#ifdef HAVE_TM + printf0("\nRunning tests with D = TM Wilson operator:\n"); +#else + printf0("\nRunning tests with D = Wilson operator:\n"); +#endif + } + END_MASTER(threading) if ( g.mixed_precision ) { operator_float_test_routine( &(l->s_float.op), l, threading ); if ( g.method > 0 && g.method < 4 ) 
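/*
 * The mm_fmadd_* / mm_fnmadd_* wrappers above (native FMA when available,
 * mul+add fallback otherwise) exist so that complex multiply-accumulates can
 * be written on split real/imaginary registers:
 *   c += a*b       :  c_re += a_re*b_re - a_im*b_im,  c_im += a_re*b_im + a_im*b_re
 *   c += conj(a)*b :  c_re += a_re*b_re + a_im*b_im,  c_im += a_re*b_im - a_im*b_re
 * The sketches below use hypothetical names; the library's own cfmadd /
 * cfmadd_conj helpers used in the SSE coarse-operator code are assumed to
 * implement the same arithmetic, possibly with a different interface.
 */
static inline void cfmadd_sketch_float( mm_float a_re, mm_float a_im,
                                        mm_float b_re, mm_float b_im,
                                        mm_float *c_re, mm_float *c_im ) {
  *c_re = mm_fmadd_float ( a_re, b_re, *c_re );
  *c_re = mm_fnmadd_float( a_im, b_im, *c_re );   // c_re -= a_im*b_im
  *c_im = mm_fmadd_float ( a_re, b_im, *c_im );
  *c_im = mm_fmadd_float ( a_im, b_re, *c_im );
}
static inline void cfmadd_conj_sketch_float( mm_float a_re, mm_float a_im,
                                             mm_float b_re, mm_float b_im,
                                             mm_float *c_re, mm_float *c_im ) {
  *c_re = mm_fmadd_float ( a_re, b_re, *c_re );
  *c_re = mm_fmadd_float ( a_im, b_im, *c_re );   // + a_im*b_im (first operand conjugated)
  *c_im = mm_fmadd_float ( a_re, b_im, *c_im );
  *c_im = mm_fnmadd_float( a_im, b_re, *c_im );   // - a_im*b_re
}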
schwarz_float_mvm_testfun( &(l->s_float), l, threading ); @@ -36,25 +50,69 @@ void test_routine( level_struct *l, struct Thread *threading ) { if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); } - if ( g.interpolation ) { + if ( g.interpolation && g.method > 0 ) { if ( g.mixed_precision ) coarse_operator_float_test_routine( l, threading ); else coarse_operator_double_test_routine( l, threading ); } + START_MASTER(threading) + if (g.test < 1e-5) + printf0("TESTS passed, highest error %e < 1e-5\n", g.test); + else + warning0("some TESTS not passed, highest error %e > 1e-5\n", g.test); + printf0("\n"); + END_MASTER(threading) } - START_LOCKED_MASTER(threading) - if (g.test < 1e-5) - printf0("TESTS passed, max error %e < 1e-5", g.test); - else - warning0("some TEST not passed, max error %e > 1e-5", g.test); - printf0("\n"); - prof_init( l ); - END_LOCKED_MASTER(threading) - if ( g.restart > 0 ) - rhs_define( g.p.b, l, threading ); +#ifdef HAVE_TM1p1 + if( g.n_flavours==1 && + (g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_odd_shift != 0) ) { + + if ( g.method >= 0 ) { + START_MASTER(threading) + g.test = 0; + printf0("Running tests with D = TM doublet operator:\n"); + END_MASTER(threading) + + data_layout_n_flavours( 2, l, threading ); + + if ( g.mixed_precision ) + two_flavours_test_float( &(l->s_float.op), l, threading ); + else + two_flavours_test_double( &(l->s_double.op), l, threading ); + + if ( g.mixed_precision ) { + operator_float_test_routine( &(l->s_float.op), l, threading ); + if ( g.method > 0 && g.method < 4 ) schwarz_float_mvm_testfun( &(l->s_float), l, threading ); + if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_float_test( l, threading ); + } else { + operator_double_test_routine( &(l->s_double.op), l, threading ); + if ( g.method > 0 && g.method < 4 ) schwarz_double_mvm_testfun( &(l->s_double), l, threading ); + if ( g.method > 0 && g.method < 4 && g.odd_even ) block_oddeven_double_test( l, threading ); + } + + if ( g.interpolation && g.method > 0 ) { + if ( g.mixed_precision ) + coarse_operator_float_test_routine( l, threading ); + else + coarse_operator_double_test_routine( l, threading ); + } + + START_MASTER(threading) + if (g.test < 1e-5) + printf0("TESTS passed, highest error %e < 1e-5\n", g.test); + else + warning0("some TESTS not passed, highest error %e > 1e-5\n", g.test); + printf0("\n"); + END_MASTER(threading) + + data_layout_n_flavours( 1, l, threading ); + } + } +#endif + } diff --git a/src/sse_blas_vectorized.h b/src/sse_blas_vectorized.h deleted file mode 100644 index a23a9a5..0000000 --- a/src/sse_blas_vectorized.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. 
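/*
 * The test0_PRECISION call used in the Schwarz test function above replaces
 * the former "print the deviation and remember the worst case" pattern, so
 * the final check in test_routine (g.test < 1e-5) covers all sub-tests. Its
 * actual definition is not shown in this patch; an assumed, purely
 * illustrative equivalent would be:
 */
#define test0_PRECISION( format, depth, diff ) do { \
    printf0( (format), (depth), (diff) );           \
    if ( (diff) > g.test ) g.test = (diff);         \
  } while (0)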
If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_BLAS_VECTORIZED_H -#define SSE_BLAS_VECTORIZED_H -#ifdef SSE - -static inline void sse_cgem_inverse( const int N, float *A_inverse, float *A, int lda ) { - // generate LU decomp in A - - int i, j, k; - complex_float alpha; - - complex_float tmpA[N*N]; - complex_float tmpA_inverse[N*N]; - - for ( j=0; j0 ) - b[k-1] = 0; - - for ( i=0; i=0; i-- ) { - for ( j=i+1; jnext_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nvector_size + fine_components*SIMD_LENGTH_float*site); - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - - // index k used for vectorization - for ( k=0; kvector_size + fine_components*component_offset*site); - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline 
void sse_set_coarse_neighbor_coupling_float( complex_float *spin_0_1, complex_float *spin_2_3, - complex_float *V, const int mu, level_struct *l, int site, const int n_rhs, complex_float *tmp ) { - -#ifdef SSE - int k, k1, k2, m, num_eig_vect = l->next_level->num_lattice_site_var/2, - offset = l->num_lattice_site_var/2; - - float *spin_0_1_pt; - float *spin_2_3_pt; - float *interpolation_data; - - int component_offset = SIMD_LENGTH_float; - int fine_components = l->num_lattice_site_var; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; nvector_size + fine_components*component_offset*site); - - k1 = (n+0*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+1*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // A - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // C - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_0_1 is the same for all k => broadcast - __m128 spin_0_1_re = _mm_set1_ps(spin_0_1_pt[(2*m+0)*component_offset]); - __m128 spin_0_1_im = _mm_set1_ps(spin_0_1_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_0_1_re, spin_0_1_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - - k1 = (n+2*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - k2 = (n+3*num_eig_vect)*OPERATOR_COMPONENT_OFFSET_float; - - // B - buffer_re = _mm_load_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = _mm_load_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=0; m broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k1)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k1)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - - // D - buffer_re = _mm_load_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float); - buffer_im = 
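/*
 * In essence, the (removed) set_coarse_*_coupling routines here assemble the
 * aggregate-wise Galerkin product: a coarse coupling entry is
 *   A_{mn} = sum_{x in aggregate} conj( v_m(x) ) * (D v_n)(x),
 * split into spin-0/1 and spin-2/3 halves. Scalar illustration of one such
 * accumulation (hypothetical helper; eta_n is assumed to already hold D
 * applied to test vector n on this aggregate):
 */
#include <complex.h>

static void accumulate_coupling_entry( float complex *A_mn, const float complex *v_m,
                                       const float complex *eta_n, int len ) {
  for ( int x = 0; x < len; x++ )
    *A_mn += conjf( v_m[x] ) * eta_n[x];   // conj(test vector) times D*(test vector)
}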
_mm_load_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float); - for ( m=offset; m<2*offset; m++ ) { - // spin_2_3 is the same for all k => broadcast - __m128 spin_2_3_re = _mm_set1_ps(spin_2_3_pt[(2*m+0)*component_offset]); - __m128 spin_2_3_im = _mm_set1_ps(spin_2_3_pt[(2*m+1)*component_offset]); - __m128 interpolation_data_re = _mm_load_ps(interpolation_data + (2*m+0)*component_offset); - __m128 interpolation_data_im = _mm_load_ps(interpolation_data + (2*m+1)*component_offset); - - cfmadd_conj(interpolation_data_re, interpolation_data_im, spin_2_3_re, spin_2_3_im, &buffer_re, &buffer_im); - } - _mm_store_ps((float *)(tmp+k2)+k+0*OPERATOR_COMPONENT_OFFSET_float, buffer_re); - _mm_store_ps((float *)(tmp+k2)+k+1*OPERATOR_COMPONENT_OFFSET_float, buffer_im); - } - } -#endif -} - - -static inline void sse_coarse_spinwise_site_self_couplings_float( complex_float *eta1, complex_float *eta2, - complex_float *phi, config_float clover, int elements, level_struct *l ) { - -#ifdef SSE - int num_eig_vect = l->num_lattice_site_var/2; - int clover_step_size1 = (num_eig_vect * (num_eig_vect+1))/2; - complex_float *eta[2] = {eta1, eta2}; - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - - __m128 clover_re; - __m128 clover_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - - // zero output matrices - __m128 zero = _mm_setzero_ps(); - for(int s=0; s<2; s++) { - for(int i=0; ieta1) or 2and3 (->eta2) - eta[1] += num_eig_vect*elements; - for(int s=0; s<2; s++) { - // A and D: column major hermitian, stored as upper triangular - for(int i=0; inum_eig_vect, j, num_aggregates = l->is_PRECISION.num_agg, - aggregate_sites = l->num_inner_lattice_sites / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2, - D_link_size = 4*l->num_eig_vect*l->num_eig_vect*4, // size of links in all 4 directions - fine_components = l->num_lattice_site_var; - - - - START_LOCKED_MASTER(threading) - operator_PRECISION_define( &(l->next_level->op_PRECISION), l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - // each thread loops overs its aggregates and then over internal d.o.f. 
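/*
 * The aggregate loops in this (removed) routine distribute aggregates
 * round-robin over all hyperthreads: each thread starts at its global id and
 * strides by the total number of threads. Minimal sketch of the same pattern
 * (assuming the usual meaning of the threading fields used above; the helper
 * name is hypothetical):
 */
static inline void aggregate_range_example( struct Thread *threading, int num_aggregates ) {
  int tid    = threading->n_core * threading->thread + threading->core;  // global hyperthread id
  int stride = threading->n_core * threading->n_thread;                  // total number of hyperthreads
  for ( int a = tid; a < num_aggregates; a += stride ) {
    /* process aggregate a: zero the tmp buffer, loop over its sites, finalize */
  }
}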
- for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - for ( j=0; jnext_level->op_PRECISION.D[j+a*D_link_size] = _COMPLEX_PRECISION_ZERO; - for ( j=0; jnext_level->op_PRECISION.clover[j+a*clover_site_size] = _COMPLEX_PRECISION_ZERO; - } - - complex_PRECISION *mpi_buffer = NULL; - START_MASTER(threading) - MALLOC_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size), 64 ); - END_MASTER(threading) - - int direction_flags[8*l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X]]; - - // set up table for direction flags - int *flags = direction_flags; - if(l->depth == 0) { - // even sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]/2; x++) { - flags[2*X+0] = 1; - flags[2*X+1] = 1; - if((y+z+t)%2 == 0) { - if(x == 0) - flags[2*X+0] = 0; - } else { - if(x == l->block_lattice[X]/2-1) - flags[2*X+1] = 0; - } - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - // odd sites - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]/2; x++) { - flags[2*X+0] = 1; - flags[2*X+1] = 1; - if((y+z+t)%2 == 1) { - if(x == 0) - flags[2*X+0] = 0; - } else { - if(x == l->block_lattice[X]/2-1) - flags[2*X+1] = 0; - } - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } else { - for(int t=0; t < l->block_lattice[T]; t++) { - for(int z=0; z < l->block_lattice[Z]; z++) { - for(int y=0; y < l->block_lattice[Y]; y++) { - for(int x=0; x < l->block_lattice[X]; x++) { - flags[2*X+0] = (x == 0)?0:1; - flags[2*X+1] = (x == l->block_lattice[X]-1)?0:1; - flags[2*Y+0] = (y == 0)?0:1; - flags[2*Y+1] = (y == l->block_lattice[Y]-1)?0:1; - flags[2*Z+0] = (z == 0)?0:1; - flags[2*Z+1] = (z == l->block_lattice[Z]-1)?0:1; - flags[2*T+0] = (t == 0)?0:1; - flags[2*T+1] = (t == l->block_lattice[T]-1)?0:1; - flags += 8; - } - } - } - } - } - - complex_PRECISION eta1[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION eta2[fine_components*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - complex_PRECISION tmp[4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION] __attribute__((aligned(64))); - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - if(l->depth == 0) { - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_plus_clover_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } else { - for ( int 
c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_self_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, &(l->s_PRECISION), l, site, - direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])) ); - } - set_coarse_self_coupling_PRECISION_vectorized( eta1, eta2, operator, l, site, n, tmp ); - } - - // aggregate is done, finalize - set_coarse_self_coupling_PRECISION_vectorized_finalize( l, a*aggregate_sites, n, tmp ); - - } - - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - // neighbors - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) { - for ( mu=0; mu<4; mu++ ) { - // determine start of buffer for this mu - int start = 0; - for ( int j=0; js_PRECISION.op.c.num_boundary_sites[2*j]; - - // update ghost cells of V[i] - negative_sendrecv_PRECISION_vectorized( operator+c*l->vector_size, mu, &(l->s_PRECISION.op.c), l, - SIMD_LENGTH_PRECISION, mpi_buffer+c*(l->vector_size-l->inner_vector_size)+fine_components*start*SIMD_LENGTH_PRECISION ); - } - for ( mu=0; mu<4; mu++ ) { - // finish updating ghostcells of V[i] - negative_wait_PRECISION( mu, &(l->s_PRECISION.op.c), l ); - } - } - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - - for ( int a=threading->n_core*threading->thread+threading->core; an_core*threading->n_thread ) { - - // new aggregate is starting, zero out tmp - for(int i=0; i<4*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION; i++) - tmp[i] = 0.0; - - for ( int site=a*aggregate_sites; site<(a+1)*aggregate_sites; site++ ) { - for ( mu=0; mu<4; mu++ ) { - if( (direction_flags+8*(site%(l->block_lattice[T]*l->block_lattice[Z]*l->block_lattice[Y]*l->block_lattice[X])))[2*mu+1] != 0) - continue; - - if(l->depth == 0) - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - d_neighbor_aggregate_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - else - for ( int c=0; cnum_eig_vect; c+=SIMD_LENGTH_PRECISION ) - coarse_aggregate_neighbor_couplings_PRECISION_vectorized( eta1+c*fine_components, eta2+c*fine_components, - operator+c*l->vector_size, mu, &(l->s_PRECISION), l, site ); - set_coarse_neighbor_coupling_PRECISION_vectorized( eta1, eta2, operator, mu, l, site, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - } - } - - // aggregate is done, finalize - for ( mu=0; mu<4; mu++ ) - set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( mu, l, a*aggregate_sites, n, tmp+mu*4*n*OPERATOR_COMPONENT_OFFSET_PRECISION ); - - } - START_MASTER(threading) - FREE_HUGEPAGES( mpi_buffer, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*(l->vector_size-l->inner_vector_size) ); - - t1 = MPI_Wtime(); - if ( g.print > 0 ) printf0("depth: %d, time spent for setting up next coarser operator: %lf seconds\n", l->depth, t1-t0 ); - END_MASTER(threading) - - SYNC_HYPERTHREADS(threading) - SYNC_CORES(threading) -} - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION -void coarse_operator_PRECISION_set_couplings( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - - int n = l->num_inner_lattice_sites; - int sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - int nc_size = SQUARE(l->num_lattice_site_var); - int n1, n2; - if ( l->depth > 0 ) { - n1 = l->num_lattice_sites; - n2 = 2*l->num_lattice_sites-l->num_inner_lattice_sites; - } else { - n1 = l->num_inner_lattice_sites; - n2 = l->num_inner_lattice_sites; - } - - 
START_LOCKED_MASTER(threading) - if( op->D_vectorized == NULL ) { - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - // 2 is for complex, 4 is for 4 directions - MALLOC_HUGEPAGES( op->D_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n2, 64 ); - MALLOC_HUGEPAGES( op->D_transformed_vectorized, OPERATOR_TYPE_PRECISION, 2*4*l->num_lattice_site_var*column_offset*n2, 64 ); - MALLOC_HUGEPAGES( op->clover_vectorized, OPERATOR_TYPE_PRECISION, 2*l->num_lattice_site_var*column_offset*n, 64 ); - } - END_LOCKED_MASTER(threading) - - int start, end; - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*l->num_lattice_site_var*column_offset; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#endif - SYNC_CORES(threading) - - // vectorize negative boundary - if ( l->depth > 0 ) { - compute_core_start_end_custom(n1, n2, &start, &end, l, threading, 1); - n_per_core = end-start; - copy_coarse_operator_to_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - copy_coarse_operator_to_transformed_vectorized_layout_PRECISION( - op->D + 4*start*nc_size, - op->D_transformed_vectorized + 4*start*offset_v, - n_per_core, l->num_lattice_site_var/2); - SYNC_CORES(threading) - } -} - -void coarse_operator_PRECISION_set_couplings_clover( operator_PRECISION_struct *op, level_struct *l, struct Thread *threading ) { - - if(op->D_vectorized == 0) - coarse_operator_PRECISION_set_couplings(op, l, threading); - - int n = l->num_inner_lattice_sites; - int sc_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var+1); - int start, end; - - compute_core_start_end_custom(0, n, &start, &end, l, threading, 1); - int n_per_core = end-start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int offset_v = 2*l->num_lattice_site_var*column_offset; - - copy_coarse_operator_clover_to_vectorized_layout_PRECISION( - op->clover + start*sc_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#ifdef HAVE_TM - int tm_size = (l->num_lattice_site_var/2)*(l->num_lattice_site_var/2+1); - add_tm_term_to_vectorized_layout_PRECISION( - op->tm_term + start*tm_size, - op->clover_vectorized + start*offset_v, - n_per_core, l->num_lattice_site_var/2); -#endif - -} -#endif - - -void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION 
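/*
 * The recurring expression
 *   column_offset = SIMD_LENGTH_PRECISION*((n+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION)
 * (and likewise OPERATOR_COMPONENT_OFFSET_*) simply rounds n up to the next
 * multiple of the SIMD width, so every column of the vectorized operator
 * starts on a full SIMD lane. Illustration (hypothetical helper): with a
 * SIMD length of 4, n = 6 -> 8, n = 8 -> 8, n = 9 -> 12.
 */
static inline int round_up_to_simd_length( int n, int simd_length ) {
  return simd_length * ( (n + simd_length - 1) / simd_length );  // ceil(n/simd_length)*simd_length
}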
*tmp ) { - - sse_set_coarse_self_coupling_PRECISION( spin_0_1, spin_2_3, V, l, site, n_rhs, tmp ); -} - - -void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ) { - - int k, k1, k2, num_aggregates = l->is_PRECISION.num_agg, - num_eig_vect = l->next_level->num_lattice_site_var/2, - aggregate_size = l->inner_vector_size / num_aggregates, - clover_site_size = (l->next_level->num_lattice_site_var*(l->next_level->num_lattice_site_var+1))/2; - int t1, t2; - - config_PRECISION clover_pt, clover = l->next_level->op_PRECISION.clover; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/aggregate_size; - clover_pt = clover + aggregate*clover_site_size; - - // U(x) = [ A B , A=A*, D=D*, C = -B* - // C D ] - // storage order: upper triangle of A, upper triangle of D, B, columnwise - // diagonal coupling - for ( int n=0; nnext_level->num_lattice_site_var/2, - D_link_size = num_eig_vect*num_eig_vect*4; - int t1, t2; - - config_PRECISION D_pt, D = l->next_level->op_PRECISION.D; - - // just an abbreviation - int component_offset = OPERATOR_COMPONENT_OFFSET_PRECISION; - int fine_components = l->num_lattice_site_var; - - int aggregate = (fine_components*site)/(l->inner_vector_size / l->is_PRECISION.num_agg); - D_pt = D + (4*aggregate+mu)*D_link_size; - - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D, each column wise - for ( int n=0; n i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] = creal(clover[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] = sign*cimag(clover[offset_to_column+jp]); - // C = -B^dagger - out_tmp[(2*i+0)*column_offset + j + vecs] = -creal(clover[offset_to_B + j*vecs+i]); - out_tmp[(2*i+1)*column_offset + j + vecs] = cimag(clover[offset_to_B + j*vecs+i]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] = creal(clover[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] = sign*cimag(clover[offset_to_D + offset_to_column+jp]); - } - // zero - for(int j=2*vecs; j i) { - ip = j; - jp = i; - sign = -1.0; - } - int offset_to_column = (ip*ip+ip)/2; // upper triangle including diagonal - out_tmp[(2*i+0)*column_offset + j] += sign*creal(tm_term[offset_to_column+jp]); - out_tmp[(2*i+1)*column_offset + j] += cimag(tm_term[offset_to_column+jp]); - out_tmp[(2*(i+vecs)+0)*column_offset + j + vecs] += sign*creal(tm_term[offset_to_D + offset_to_column+jp]); - out_tmp[(2*(i+vecs)+1)*column_offset + j + vecs] += cimag(tm_term[offset_to_D + offset_to_column+jp]); - } - } - - tm_term += 2*offset_to_D; -#ifndef STORE_COARSE_OPERATOR_AS_FLOAT16 - // out_tmp is an alias for the actual output - out_tmp += 2*column_offset*2*vecs; -#else - //TODO - error0("STORE_COARSE_OPERATOR_AS_FLOAT16 not implemented for HAVE_TM") - convert_PRECISION_to_half(2*column_offset*2*vecs, out_tmp, clover_vectorized); - clover_vectorized += 2*column_offset*2*vecs; -#endif - } -#endif -} - -void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, - level_struct *l, int 
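/*
 * The self-coupling blocks A and D are hermitian, so only their upper
 * triangles are stored, column-wise: column i starts at offset i*(i+1)/2 and
 * element (row j, column i) with j <= i sits at i*(i+1)/2 + j, which is the
 * (ip*ip+ip)/2 indexing used above; entries below the diagonal follow by
 * hermiticity. Scalar illustration (hypothetical helper):
 */
#include <complex.h>

static inline float complex packed_hermitian_entry( const float complex *U, int j, int i ) {
  if ( j <= i )
    return U[ (i*(i+1))/2 + j ];          // stored explicitly in the upper triangle
  return conjf( U[ (j*(j+1))/2 + i ] );   // reconstructed by hermiticity
}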
site, int *direction_flags ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_bw; - int index_fw; - int *neighbor = s->op.neighbor_table; - int *backward_neighbor = s->op.backward_neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - int clover_offset = (n*(n+1))/2*site; - - coarse_spinwise_site_self_couplings_PRECISION_vectorized( eta1, eta2, phi+site_offset*site, s->op.clover+clover_offset, offset, l ); - - for(int mu=0; mu<4; mu++) { - index_fw = neighbor[5*site+1 + mu]; - index_bw = backward_neighbor[5*site+1 + mu]; - - // from backward - if ( direction_flags[2*mu+0] == 1 ) { - D_pt = D + D_site_offset*index_bw + D_link_offset*mu; - phi_pt = phi + site_offset*index_bw; - coarse_spinwise_n_daggered_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - - // from forward - if ( direction_flags[2*mu+1] == 1 ) { - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_n_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); - } - } -} - - -void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, - schwarz_PRECISION_struct *s, level_struct *l, int site ) { - - int offset = SIMD_LENGTH_PRECISION; - int site_offset = l->num_lattice_site_var*offset; - int index_fw; - int *neighbor = s->op.neighbor_table; - complex_PRECISION *phi_pt; - config_PRECISION D_pt; - config_PRECISION D = s->op.D; - int n = l->num_lattice_site_var; - int D_site_offset = 4*n*n; - int D_link_offset = n*n; - - vector_PRECISION_define( eta1, 0, 0, n*offset, l ); - vector_PRECISION_define( eta2, 0, 0, n*offset, l ); - - // requires the positive boundaries of phi to be communicated before - index_fw = neighbor[5*site+1 + mu]; - D_pt = D + D_site_offset*site + D_link_offset*mu; - phi_pt = phi + site_offset*index_fw; - coarse_spinwise_hopp_PRECISION_vectorized( eta1, eta2, phi_pt, D_pt, offset, l ); -} - - -void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ) { - - sse_coarse_spinwise_site_self_couplings_PRECISION( eta1, eta2, phi, clover, elements, l ); -} - -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION -void coarse_block_operator_PRECISION( vector_PRECISION eta, vector_PRECISION phi, int start, schwarz_PRECISION_struct *s, - level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - - int n = s->num_block_sites, *length = s->dir_length, **index = s->index, - *ind, *neighbor = s->op.neighbor_table, m = l->num_lattice_site_var; - vector_PRECISION lphi = phi+start, leta = eta+start; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - // site-wise self coupling - coarse_self_couplings_PRECISION_vectorized( eta, phi, s->op.clover_vectorized, (start/m), (start/m)+n, l ); - - // inner block couplings - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + - (start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + - 
(start/m)*4*vectorized_link_offset + mu*vectorized_link_offset; - ind = index[mu]; // mu direction - for ( int i=0; inum_inner_lattice_sites, &start, &end, l, threading, 1); - coarse_self_couplings_PRECISION_vectorized( eta, phi, op->clover_vectorized, start, end, l ); - SYNC_CORES(threading) - PROF_PRECISION_STOP( _SC, 1, threading ); - PROF_PRECISION_START( _NC, threading ); - coarse_hopping_term_PRECISION( eta, phi, op, _FULL_SYSTEM, l, threading ); - PROF_PRECISION_STOP( _NC, 1, threading ); -} -#endif - -#endif // SSE diff --git a/src/sse_coarse_operator_generic.h b/src/sse_coarse_operator_generic.h deleted file mode 100644 index b805944..0000000 --- a/src/sse_coarse_operator_generic.h +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_COARSE_OPERATOR_PRECISION_HEADER - #define SSE_COARSE_OPERATOR_PRECISION_HEADER - - #ifdef SSE - - #include "blas_vectorized.h" - - void coarse_operator_PRECISION_setup_vectorized( complex_PRECISION *operator, level_struct *l, struct Thread *threading ); - void set_coarse_self_coupling_PRECISION_vectorized( complex_PRECISION *spin_0_1, complex_PRECISION *spin_2_3, - complex_PRECISION *V, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_self_coupling_PRECISION_vectorized_finalize( level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - // here we do not check whether site is really on boundary, caller is responsible for that - // tmp is used to store coarse operator with padding, until sum over all sites has been done - void set_coarse_neighbor_coupling_PRECISION_vectorized( complex_PRECISION *buffer1, complex_PRECISION *buffer2, - complex_PRECISION *V, const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - void set_coarse_neighbor_coupling_PRECISION_vectorized_finalize( const int mu, level_struct *l, int site, const int n_rhs, complex_PRECISION *tmp ); - - void copy_coarse_operator_to_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - // fw and bw links have a symmetry that allows constructing one from another, see, e.g., coarse_hopp_PRECISION - // for vectorization we store the operator for both cases, the "daggered" links need this transformed layout - void copy_coarse_operator_to_transformed_vectorized_layout_PRECISION(config_PRECISION D, - OPERATOR_TYPE_PRECISION *D_vectorized, int num_aggregates, int num_eig_vect); - void copy_coarse_operator_clover_to_vectorized_layout_PRECISION(config_PRECISION clover, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - void 
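
[Editor's sketch, an assumption drawn from the deleted header's comment that forward and backward coarse links are related, so the "daggered" links can be built rather than stored independently. It follows the block comment U_mu(x) = [ A B ; C D ], U_{-mu}(x+muhat) = [ A* -C* ; -B* D* ]; dense row-major blocks are used purely for illustration.]

#include <complex.h>

/* out = sign * in^dagger for a dense v-by-v block (row-major) */
static void block_dagger( const float complex *in, float complex *out, int v, float sign ) {
  for ( int r=0; r<v; r++ )
    for ( int c=0; c<v; c++ )
      out[r*v+c] = sign*conjf( in[c*v+r] );
}

/* given the blocks A, C, B, D of U_mu(x), build the blocks of U_{-mu}(x+muhat) */
void coarse_link_dagger( const float complex *A, const float complex *C,
                         const float complex *B, const float complex *D,
                         float complex *Am, float complex *Cm,
                         float complex *Bm, float complex *Dm, int v ) {
  block_dagger( A, Am, v, +1.0f );   /* A-block of U_{-mu}:  A^dagger */
  block_dagger( C, Bm, v, -1.0f );   /* B-block of U_{-mu}: -C^dagger */
  block_dagger( B, Cm, v, -1.0f );   /* C-block of U_{-mu}: -B^dagger */
  block_dagger( D, Dm, v, +1.0f );   /* D-block of U_{-mu}:  D^dagger */
}
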
add_tm_term_to_vectorized_layout_PRECISION(config_PRECISION tm_term, - OPERATOR_TYPE_PRECISION *clover_vectorized, int num_aggregates, int num_eig_vect); - - void coarse_spinwise_site_self_couplings_PRECISION_vectorized( - complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, config_PRECISION clover, int elements, level_struct *l ); - - void coarse_aggregate_self_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, - int site, int *direction_flags ); - - void coarse_aggregate_neighbor_couplings_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, - complex_PRECISION *phi, const int mu, schwarz_PRECISION_struct *s, level_struct *l, - int site ); - - - static inline void coarse_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int lda = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgenmv(l->num_lattice_site_var, D, lda, (float *)phi, (float *)eta); -#endif - } - static inline void coarse_n_hopp_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *D, level_struct *l ) { -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int lda = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - cgemv(l->num_lattice_site_var, D, lda, (float *)phi, (float *)eta); -#endif - } - - static inline void coarse_self_couplings_PRECISION_vectorized( vector_PRECISION eta, vector_PRECISION phi, - OPERATOR_TYPE_PRECISION *clover, int start, int end, level_struct *l ) { -#ifdef VECTORIZE_COARSE_OPERATOR_PRECISION - int site_size = l->num_lattice_site_var; - int lda = SIMD_LENGTH_PRECISION*((site_size+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - - for(int i=start; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // s refers to "spin" components 0and1 (->eta1) or 2and3 (->eta2) - for(int s=0; s<2; s++) { - // t is the row of the input matrix (in 2x2 block form) - for(int t=0; t<2; t++) { - for(int i=0; inum_lattice_site_var/2; - int num_eig_vect2 = num_eig_vect*num_eig_vect; - complex_PRECISION *eta[2] = {eta1, eta2}; - // U_mu(x) = [ A B , U_-mu(x+muhat) = [ A* -C* - // C D ] -B* D* ] - // storage order: A, C, B, D - // note: minus sign of D = self_coupling - hopping_term is added here - - __m128 D_re; - __m128 D_im; - __m128 in_re; - __m128 in_im; - __m128 out_re; - __m128 out_im; - // A* - for(int i=0; i1?((k)*3+6):((k)*3)) +#define index_d_re(phi,mu,spin) (gamma_re_sign[mu][spin]) * (phi)[ 
6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) + gamma_offset[mu][spin] ] +#define index_d_im(phi,mu,spin) (gamma_im_sign[mu][spin]) * (phi)[ 6*gamma_co[mu][spin] + 12*(gamma_co[mu][spin]/2) - gamma_offset[mu][spin] +1 ] + #define neighbor_coupling_file "sse_dirac_su3local.h" void prp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { @@ -129,6 +133,137 @@ void prp_float( complex_float *prn[4], complex_float *phi, int start, int end ) } +void dprp_double( complex_double *prn[4], complex_double *phi, int start, int end ) { + + double *phi_pt = (double*)(phi+start); + double *phi_end = (double*)(phi+end); + double *pr[4] = {(double*)(prn[0]+start/2),(double*)(prn[1]+start/2),(double*)(prn[2]+start/2),(double*)(prn[3]+start/2)}; + + while ( phi_pt < phi_end ) { + + __m128d phi_pt1_re; __m128d phi_pt1_im; + + sse_complex_deinterleaved_load_pd( phi_pt, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+4, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+8, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+12, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+16, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + sse_complex_deinterleaved_load_pd( phi_pt+20, &phi_pt1_re, &phi_pt1_im ); + for ( int mu=0; mu<4; mu++) { + __m128d phi_pt2_re = _mm_setr_pd( index_d_re(phi_pt+14,mu,1), 
index_d_re(phi_pt+16,mu,1) ); + __m128d phi_pt2_im = _mm_setr_pd( index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); + __m128d res_re = _mm_sub_pd( phi_pt1_re, phi_pt2_re ); + __m128d res_im = _mm_sub_pd( phi_pt1_im, phi_pt2_im ); + sse_complex_interleaved_store_pd( res_re, res_im, pr[mu] ); + pr[mu] += 2*SIMD_LENGTH_double; + } + + phi_pt += 48; + } +} + + +void dprp_float( complex_float *prn[4], complex_float *phi, int start, int end ) { + + float *phi_pt = (float*)(phi+start); + float *phi_end = (float*)(phi+end); + float *pr[4] = {(float*)(prn[0]+start/2),(float*)(prn[1]+start/2),(float*)(prn[2]+start/2),(float*)(prn[3]+start/2)}; + + while ( phi_pt < phi_end ) { + + __m128 phi_pt1_re = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], phi_pt[6] ); + __m128 phi_pt1_im = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], phi_pt[7] ); + for ( int mu=0; mu<4; mu++) { + __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), + index_d_re(phi_pt+4,mu,0), index_d_re(phi_pt,mu,1) ); + __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), + index_d_im(phi_pt+4,mu,0), index_d_im(phi_pt,mu,1) ); + + __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); + __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); + + sse_complex_interleaved_store( res_re, res_im, pr[mu] ); + pr[mu] += 8; + } + + phi_pt1_re = _mm_setr_ps( phi_pt[8], phi_pt[10], phi_pt[12], phi_pt[14] ); + phi_pt1_im = _mm_setr_ps( phi_pt[9], phi_pt[11], phi_pt[13], phi_pt[15] ); + for ( int mu=0; mu<4; mu++) { + __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), + index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0) ); + __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), + index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0) ); + __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); + __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); + + sse_complex_interleaved_store( res_re, res_im, pr[mu] ); + pr[mu] += 8; + } + + phi_pt1_re = _mm_setr_ps( phi_pt[16], phi_pt[18], phi_pt[20], phi_pt[22] ); + phi_pt1_im = _mm_setr_ps( phi_pt[17], phi_pt[19], phi_pt[21], phi_pt[23] ); + for ( int mu=0; mu<4; mu++) { + __m128 phi_pt2_re = _mm_setr_ps( index_d_re(phi_pt+16,mu,0), index_d_re(phi_pt+12,mu,1), + index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1) ); + __m128 phi_pt2_im = _mm_setr_ps( index_d_im(phi_pt+16,mu,0), index_d_im(phi_pt+12,mu,1), + index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1) ); + __m128 res_re = _mm_sub_ps( phi_pt1_re, phi_pt2_re ); + __m128 res_im = _mm_sub_ps( phi_pt1_im, phi_pt2_im ); + + sse_complex_interleaved_store( res_re, res_im, pr[mu] ); + pr[mu] += 8; + } + + phi_pt+=48; + } +} + + void prn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { double *phi_pt = (double*)(phi+start); @@ -305,6 +440,267 @@ void prn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_st } +void dprn_su3_double( complex_double *prp[4], complex_double *phi, operator_double_struct *op, int *neighbor, int start, int end ) { + + double *phi_pt = (double*)(phi+start); + double *phi_end_pt = (double*)(phi+end); + double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; + double *D_pt = ((double*)(op->D))+2*(start/24*36); + int *nb_pt = neighbor+((start/24)*4); + + while ( phi_pt < phi_end_pt ) { + + __m128d in_re[6]; + __m128d in_im[6]; + + for ( int i=0; i<3; i++ ) { + in_re[i] = 
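
[Editor's note, an inference not stated in the patch: the new index_d_re / index_d_im macros differ from the single-flavour versions by the extra 12*(gamma_co[mu][spin]/2) term, and together with the 48-real stride of dprp_double/dprp_float this suggests a doublet site packed as [ f0 spin0/1 | f1 spin0/1 | f0 spin2/3 | f1 spin2/3 ]. A tiny self-check of the offsets that layout implies.]

#include <assert.h>

/* real-valued offset of flavour 0, colour 0, for a given spin in the assumed
 * doublet layout; the 12*(spin/2) term skips flavour 1's upper-spin block */
static int doublet_spin_offset( int spin ) {
  return 6*spin + 12*(spin/2);
}

int main( void ) {
  assert( doublet_spin_offset( 0 ) ==  0 );
  assert( doublet_spin_offset( 1 ) ==  6 );
  assert( doublet_spin_offset( 2 ) == 24 );
  assert( doublet_spin_offset( 3 ) == 30 );
  return 0;
}
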
_mm_setr_pd( phi_pt[2*i+0], phi_pt[2*i+6] ); + in_im[i] = _mm_setr_pd( phi_pt[2*i+1], phi_pt[2*i+7] ); + } + for ( int i=3; i<6; i++ ) { + in_re[i] = _mm_setr_pd( phi_pt[2*i+6], phi_pt[2*i+12] ); + in_im[i] = _mm_setr_pd( phi_pt[2*i+7], phi_pt[2*i+13] ); + } + + for ( int mu=0; mu<4; mu++ ) { + + __m128d v_re[6]; + __m128d v_im[6]; + + // calc spin projection + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( index_d_re(phi_pt+2*i,mu,0), index_d_re(phi_pt+2*i,mu,1) ); + v_im[i] = _mm_setr_pd( index_d_im(phi_pt+2*i,mu,0), index_d_im(phi_pt+2*i,mu,1) ); + v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); + v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); + } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( index_d_re(phi_pt+6+2*i,mu,0), index_d_re(phi_pt+6+2*i,mu,1) ); + v_im[i] = _mm_setr_pd( index_d_im(phi_pt+6+2*i,mu,0), index_d_im(phi_pt+6+2*i,mu,1) ); + v_re[i] = _mm_add_pd( in_re[i], v_re[i] ); + v_im[i] = _mm_add_pd( in_im[i], v_im[i] ); + } + + { + __m128d res_re[6]; + __m128d res_im[6]; + // load su(3) matrix and multiply + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+2*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+2*i] ); + cmul_conj_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_conj_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[6+2*i] ); + buf_im = _mm_set1_pd( D_pt[7+2*i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[12+2*i] ); + buf_im = _mm_set1_pd( D_pt[13+2*i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_conj_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); + } + + { + double *pr_pt = pr[mu]+2*12*(*(nb_pt)); + for ( int i=0; i<3; i++ ) { + __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); + __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); + _mm_storeu_pd( pr_pt+0+2*i, out1 ); + _mm_storeu_pd( pr_pt+6+2*i, out2 ); + } + for ( int i=3; i<6; i++ ) { + __m128d out1 = _mm_unpacklo_pd( res_re[i], res_im[i] ); + __m128d out2 = _mm_unpackhi_pd( res_re[i], res_im[i] ); + _mm_storeu_pd( pr_pt+ 6+2*i, out1 ); + _mm_storeu_pd( pr_pt+12+2*i, out2 ); + } + } + } + + D_pt += 18; + nb_pt++; + } + + phi_pt += 24*2; + } + +} + + +void dprn_su3_float( complex_float *prp[4], complex_float *phi, operator_float_struct *op, int *neighbor, int start, int end ) { + + float *phi_pt = (float*)(phi+start); + float *phi_end_pt = (float*)(phi+end); + float *pr[4] = {(float*)(prp[0]),(float*)(prp[1]),(float*)(prp[2]),(float*)(prp[3])}; + float *D_pt = (float*)(op->D_transformed_vectorized+2*(start/24*48)); + int *nb_pt = neighbor+((start/24)*4); + + while ( phi_pt < phi_end_pt ) { + + __m128 in11[2]; + __m128 in21[2]; + __m128 in12[2]; + __m128 in22[2]; + + in11[0] = _mm_setr_ps( phi_pt[0], phi_pt[2], phi_pt[4], 0 ); + in11[1] = _mm_setr_ps( phi_pt[1], phi_pt[3], phi_pt[5], 0 ); + in21[0] = _mm_setr_ps( phi_pt[6], phi_pt[8], phi_pt[10], 0 ); + in21[1] = _mm_setr_ps( phi_pt[7], phi_pt[9], phi_pt[11], 0 ); + in12[0] = _mm_setr_ps( phi_pt[12], phi_pt[14], phi_pt[16], 0 ); + in12[1] = _mm_setr_ps( phi_pt[13], phi_pt[15], phi_pt[17], 0 ); + in22[0] = _mm_setr_ps( phi_pt[18], phi_pt[20], phi_pt[22], 0 ); + in22[1] = _mm_setr_ps( phi_pt[19], phi_pt[21], phi_pt[23], 0 ); + + for ( int mu=0; mu<4; mu++ ) { + __m128 res11[2]; + __m128 res21[2]; + __m128 res12[2]; + __m128 res22[2]; + + { + // calc spin0 
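
[Editor's sketch: the cmul_conj / cfmadd_conj sequences in dprn_su3_double gather one column of the SU(3) link at a time and accumulate against its conjugate, i.e. they apply the adjoint link to the projected colour vector. A scalar reference of that operation, assuming the 18-real row-major link storage visible in the loads above.]

#include <complex.h>

/* y = U^dagger x for a 3x3 complex link stored as 18 reals, re/im interleaved,
 * element (r,c) at U[6*r+2*c] (re) and U[6*r+2*c+1] (im) */
void su3_dagger_mvm( double complex y[3], const double U[18], const double complex x[3] ) {
  for ( int c=0; c<3; c++ ) {
    y[c] = 0.0;
    for ( int r=0; r<3; r++ ) {
      double complex u = U[6*r+2*c] + U[6*r+2*c+1]*I;
      y[c] += conj( u )*x[r];   /* (U^dagger x)_c = sum_r conj(U_{r,c}) x_r */
    }
  }
}
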
projection + res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,0), index_d_re(phi_pt+2,mu,0), index_d_re(phi_pt+4,mu,0), 0 ); + res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,0), index_d_im(phi_pt+2,mu,0), index_d_im(phi_pt+4,mu,0), 0 ); + __m128 in11_re = _mm_add_ps( in11[0], res11[0] ); + __m128 in11_im = _mm_add_ps( in11[1], res11[1] ); + + // calc spin1 projection + res11[0] = _mm_setr_ps( index_d_re(phi_pt,mu,1), index_d_re(phi_pt+2,mu,1), index_d_re(phi_pt+4,mu,1), 0 ); + res11[1] = _mm_setr_ps( index_d_im(phi_pt,mu,1), index_d_im(phi_pt+2,mu,1), index_d_im(phi_pt+4,mu,1), 0 ); + __m128 in21_re = _mm_add_ps( in21[0], res11[0] ); + __m128 in21_im = _mm_add_ps( in21[1], res11[1] ); + + // calc spin0 projection + res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,0), index_d_re(phi_pt+14,mu,0), index_d_re(phi_pt+16,mu,0), 0 ); + res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,0), index_d_im(phi_pt+14,mu,0), index_d_im(phi_pt+16,mu,0), 0 ); + __m128 in12_re = _mm_add_ps( in12[0], res12[0] ); + __m128 in12_im = _mm_add_ps( in12[1], res12[1] ); + + // calc spin1 projection + res12[0] = _mm_setr_ps( index_d_re(phi_pt+12,mu,1), index_d_re(phi_pt+14,mu,1), index_d_re(phi_pt+16,mu,1), 0 ); + res12[1] = _mm_setr_ps( index_d_im(phi_pt+12,mu,1), index_d_im(phi_pt+14,mu,1), index_d_im(phi_pt+16,mu,1), 0 ); + __m128 in22_re = _mm_add_ps( in22[0], res12[0] ); + __m128 in22_im = _mm_add_ps( in22[1], res12[1] ); + + // load 1st part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt ); + __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(0,0,0,0) ); + __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(0,0,0,0) ); + cmul_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); + } + } + // load 2nd part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(1,1,1,1) ); + __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(1,1,1,1) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); + } + } + // load 3rd part of su(3) matrix and multiply + { + __m128 buf1 = 
_mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_shuffle_ps( in11_re, in11_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in11_im, in11_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res11[0], &res11[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in21_re, in21_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in21_im, in21_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res21[0], &res21[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in12_re, in12_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in12_im, in12_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res12[0], &res12[1] ); + } + { + __m128 buf3 = _mm_shuffle_ps( in22_re, in22_re, _MM_SHUFFLE(2,2,2,2) ); + __m128 buf4 = _mm_shuffle_ps( in22_im, in22_im, _MM_SHUFFLE(2,2,2,2) ); + cfmadd_conj( buf1, buf2, buf3, buf4, &res22[0], &res22[1] ); + } + } + } + + float *pr_pt = pr[mu]+2*12*(*nb_pt); + { + __m128 buf1 = _mm_unpacklo_ps( res11[0], res11[1] ); + __m128 buf2 = _mm_unpackhi_ps( res11[0], res11[1] ); + __m128 buf3 = _mm_unpacklo_ps( res21[0], res21[1] ); + + { + __m128 buf4 = _mm_unpackhi_ps( res21[0], res21[1] ); + buf2 = _mm_movelh_ps( buf2, buf3 ); + buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + } + { + _mm_storeu_ps( pr_pt, buf1 ); + _mm_storeu_ps( pr_pt+4, buf2 ); + _mm_storeu_ps( pr_pt+8, buf3 ); + } + } + { + __m128 buf1 = _mm_unpacklo_ps( res12[0], res12[1] ); + __m128 buf2 = _mm_unpackhi_ps( res12[0], res12[1] ); + __m128 buf3 = _mm_unpacklo_ps( res22[0], res22[1] ); + + { + __m128 buf4 = _mm_unpackhi_ps( res22[0], res22[1] ); + buf2 = _mm_movelh_ps( buf2, buf3 ); + buf3 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + } + { + _mm_storeu_ps( pr_pt+12, buf1 ); + _mm_storeu_ps( pr_pt+16, buf2 ); + _mm_storeu_ps( pr_pt+20, buf3 ); + } + } + nb_pt++; + D_pt += 24; + } + + phi_pt += 48; + } +} + + void pbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; @@ -483,15 +879,283 @@ void pbn_float( complex_float *eta, complex_float *prp[4], int start, int end ) } } - - -void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, - int *neighbor, int start, int end ) { +void dpbn_double( complex_double *eta, complex_double *prp[4], int start, int end ) { - double *D_pt = ((double*)(op->D))+2*(start*3); + double *pr[4] = {(double*)(prp[0]),(double*)(prp[1]),(double*)(prp[2]),(double*)(prp[3])}; double *eta_pt = (double*)(eta+start); - double *eta_end_pt = (double*)(eta+end); - double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; + + __m128d gamma0[4]; + __m128d gamma1[4]; + + for ( int mu=0; mu<4; mu++ ) { + gamma0[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][0]], gamma_im_sign[mu][gamma_co[mu][0]] ); + gamma1[mu] = _mm_setr_pd( gamma_re_sign[mu][gamma_co[mu][1]], gamma_im_sign[mu][gamma_co[mu][1]] ); + } + + for ( int i=start; iD))+2*(start*3); + double *eta_pt = (double*)(eta+start); + double *eta_end_pt = (double*)(eta+end); + double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; int *nb_pt = neighbor+((start/12)*4); __m128d gamma0[4]; @@ -606,41 +1270,477 @@ void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_doubl res[3*gamma_co[Z][0]+i] = _mm_sub_pd( 
res[3*gamma_co[Z][0]+i], buf1 ); } for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); + __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[3+i]) ); + res[3*gamma_co[Z][1]+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+i], buf1 ); + } + } + } + // --------------- + // mu = Y + { + __m128d res_re[3]; + __m128d res_im[3]; + { + __m128d v_re[3]; + __m128d v_im[3]; + int j = 2*6*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[6]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + for ( int i=0; i<6; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); + res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); + res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); + } + } + } + // --------------- + // mu = X + { + __m128d res_re[3]; + __m128d res_im[3]; + { + __m128d v_re[3]; + __m128d v_im[3]; + int j = 2*6*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[6]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + for ( int i=0; i<6; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); + res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); + res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); + } + } + } + 
// --------------- + + for ( int i=0; i<12; i++ ) { + _mm_storeu_pd( eta_pt + 2*i, res[i] ); + } + eta_pt+=24; + } + +} + + +void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, + int *neighbor, int start, int end ) { + + float *D_pt = (float*)(op->D_vectorized+2*(start*4)); + float *eta_pt = (float*)(eta+start); + float *eta_end_pt = (float*)(eta+end); + float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; + int *nb_pt = neighbor+((start/12)*4); + + __m128 gamma0[4][2]; + __m128 gamma1[4][2]; + + for ( int mu=0; mu<4; mu++ ) { + gamma0[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][0]] ); + gamma0[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][0]] ); + gamma1[mu][0] = _mm_set1_ps( -gamma_re_sign[mu][gamma_co[mu][1]] ); + gamma1[mu][1] = _mm_set1_ps( -gamma_im_sign[mu][gamma_co[mu][1]] ); + } + + while( eta_pt < eta_end_pt ) { + + __m128 eta_lo1 = _mm_loadu_ps( eta_pt ); + __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); + __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); + __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); + + __m128 eta2_lo[2]; + __m128 eta2_hi[2]; + + eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); + eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); + eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); + eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); + + for ( int mu=0; mu<4; mu++ ) { + __m128 res1[2]; + __m128 res2[2]; + + { + int j = 2*6*(*nb_pt); + // load 1st part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt ); + __m128 buf2 = _mm_loadu_ps( D_pt+SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+0) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+1) ); + cmul( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+6) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); + cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); + } + } + // load 2nd part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt+2*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+3*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+2) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+3) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+8) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); + } + } + // load 3rd part of su(3) matrix and multiply + { + __m128 buf1 = _mm_loadu_ps( D_pt+4*SIMD_LENGTH_float ); + __m128 buf2 = _mm_loadu_ps( D_pt+5*SIMD_LENGTH_float ); + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+4) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+5) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[0], &res1[1] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+10) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); + } + } + } + + { + // store spin0 contribution + { + __m128 buf1 = _mm_unpacklo_ps( res1[0], res1[1] ); + __m128 buf2 = _mm_unpackhi_ps( res1[0], res1[1] ); + eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); + eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); + } + + // store contribution from 1st SU(3) multiplication to either spin2 or spin3 + __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); + __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( 
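
[Editor's sketch: in su3_pbp / su3_dpbp the U-multiplied half spinor is subtracted from the upper spin components, and a gamma-sign-weighted copy is subtracted from the coupled lower components (the shuffles implement multiplication by +-1 or +-i). A scalar version of that lift, with the library's gamma tables left as parameters rather than reproduced.]

#include <complex.h>

/* eta: one site, 4 spins x 3 colours; h: U-multiplied half spinor, 2 spins x 3 colours.
 * gco[mu][s] is the lower spin coupled to projected spin s, gs[mu][s] its phase (+-1, +-i). */
void lift_minus_dir( double complex eta[12], const double complex h[6],
                     int mu, const int gco[4][2], const double complex gs[4][2] ) {
  for ( int s=0; s<2; s++ )
    for ( int c=0; c<3; c++ ) {
      eta[3*s+c]          -= h[3*s+c];             /* upper spin components        */
      eta[3*gco[mu][s]+c] -= gs[mu][s]*h[3*s+c];   /* coupled lower spin component */
    }
}
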
eta2_lo[gamma_co[mu][2]], buf3 ); + eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); + } + + { + // store spin1 contribution + { + __m128 buf1 = _mm_unpacklo_ps( res2[0], res2[1] ); + __m128 buf2 = _mm_unpackhi_ps( res2[0], res2[1] ); + eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); + eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); + } + + // store contribution from 1st SU(3) multiplication to either spin2 or spin3 + __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); + __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); + eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); + } + + nb_pt++; + D_pt += 24; + } + + _mm_storeu_ps( eta_pt, eta_lo1 ); + _mm_storeu_ps( eta_pt+4, eta_lo2 ); + _mm_storeu_ps( eta_pt+6, eta_hi1 ); + _mm_storeu_ps( eta_pt+10, eta_hi2 ); + _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); + _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); + _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); + _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); + + eta_pt += 24; + } + +} + + +void su3_dpbp_double( complex_double* eta, complex_double *prn[4], operator_double_struct *op, + int *neighbor, int start, int end ) { + + double *D_pt = ((double*)(op->D))+2*(start/24*36); + double *eta_pt = (double*)(eta+start); + double *eta_end_pt = (double*)(eta+end); + double *pr[4] = {(double*)(prn[0]),(double*)(prn[1]),(double*)(prn[2]),(double*)(prn[3])}; + int *nb_pt = neighbor+((start/24)*4); + + __m128d gamma0[4]; + __m128d gamma1[4]; + + for ( int mu=0; mu<4; mu++ ) { + gamma0[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][0]], -gamma_im_sign[mu][gamma_co[mu][0]] ); + gamma1[mu] = _mm_setr_pd( -gamma_re_sign[mu][gamma_co[mu][1]], -gamma_im_sign[mu][gamma_co[mu][1]] ); + } + + while( eta_pt < eta_end_pt ) { + + __m128d res[24]; + for ( int i=0; i<24; i++ ) { + res[i] = _mm_loadu_pd( eta_pt + 2*i ); + } + + // --------------- + // mu = T + { + __m128d res_re[6]; + __m128d res_im[6]; + { + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[T]+j+0+2*i), *(pr[T]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[T]+j+1+2*i), *(pr[T]+j+7+2*i) ); + } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[T]+j+6+2*i), *(pr[T]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[T]+j+7+2*i), *(pr[T]+j+13+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[12]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( 
res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i]) ); + res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+3]) ); + res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[T], GAMMA_T_SHUFFLE(in[i+6]) ); + res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][0]+6*(gamma_co[T][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[T], GAMMA_T_SHUFFLE(in[i+9]) ); + res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[T][1]+6*(gamma_co[T][1]/2)+6+i], buf1 ); + } + } + } + // --------------- + // mu = Z + { + __m128d res_re[6]; + __m128d res_im[6]; + { + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); + + for ( int i=0; i<3; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Z]+j+0+2*i), *(pr[Z]+j+6+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Z]+j+1+2*i), *(pr[Z]+j+7+2*i) ); + } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Z]+j+6+2*i), *(pr[Z]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Z]+j+7+2*i), *(pr[Z]+j+13+2*i) ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); + __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); + cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[2+6*i] ); + buf_im = _mm_set1_pd( D_pt[3+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); + buf_re = _mm_set1_pd( D_pt[4+6*i] ); + buf_im = _mm_set1_pd( D_pt[5+6*i] ); + cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); + } + D_pt += 18; + nb_pt++; + } + { + __m128d in[12]; + in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); + in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); + in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); + + in[3] = _mm_unpackhi_pd( res_re[0], res_im[0] ); + in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); + in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); + + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { + res[i] = _mm_sub_pd( res[i], in[i] ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i]) ); + res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+i], buf1 
); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+3]) ); + res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Z], GAMMA_Z_SHUFFLE(in[i+6]) ); + res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][0]+6*(gamma_co[Z][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Z], GAMMA_Z_SHUFFLE(in[i+9]) ); + res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Z][1]+6*(gamma_co[Z][1]/2)+6+i], buf1 ); } } } // --------------- // mu = Y { - __m128d res_re[3]; - __m128d res_im[3]; + __m128d res_re[6]; + __m128d res_im[6]; { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); for ( int i=0; i<3; i++ ) { v_re[i] = _mm_setr_pd( *(pr[Y]+j+0+2*i), *(pr[Y]+j+6+2*i) ); v_im[i] = _mm_setr_pd( *(pr[Y]+j+1+2*i), *(pr[Y]+j+7+2*i) ); } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[Y]+j+6+2*i), *(pr[Y]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[Y]+j+7+2*i), *(pr[Y]+j+13+2*i) ); + } for ( int i=0; i<3; i++ ) { __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[2+6*i] ); buf_im = _mm_set1_pd( D_pt[3+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[4+6*i] ); buf_im = _mm_set1_pd( D_pt[5+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); } D_pt += 18; nb_pt++; } { - __m128d in[6]; + __m128d in[12]; in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); @@ -649,49 +1749,72 @@ void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_doubl in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - for ( int i=0; i<6; i++ ) { + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { res[i] = _mm_sub_pd( res[i], in[i] ); } for ( int i=0; i<3; i++ ) { __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i]) ); - res[3*gamma_co[Y][0]+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+i], buf1 ); + res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+i], buf1 ); } for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[3+i]) ); - res[3*gamma_co[Y][1]+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+i], buf1 ); + __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+3]) ); + res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[Y], GAMMA_Y_SHUFFLE(in[i+6]) ); + res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i] = _mm_sub_pd( 
res[3*gamma_co[Y][0]+6*(gamma_co[Y][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[Y], GAMMA_Y_SHUFFLE(in[i+9]) ); + res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[Y][1]+6*(gamma_co[Y][1]/2)+6+i], buf1 ); } } } - // --------------- + // --------------- // mu = X { - __m128d res_re[3]; - __m128d res_im[3]; + __m128d res_re[6]; + __m128d res_im[6]; { - __m128d v_re[3]; - __m128d v_im[3]; - int j = 2*6*(*nb_pt); + __m128d v_re[6]; + __m128d v_im[6]; + int j = 2*12*(*nb_pt); for ( int i=0; i<3; i++ ) { v_re[i] = _mm_setr_pd( *(pr[X]+j+0+2*i), *(pr[X]+j+6+2*i) ); v_im[i] = _mm_setr_pd( *(pr[X]+j+1+2*i), *(pr[X]+j+7+2*i) ); } + for ( int i=3; i<6; i++ ) { + v_re[i] = _mm_setr_pd( *(pr[X]+j+6+2*i), *(pr[X]+j+12+2*i) ); + v_im[i] = _mm_setr_pd( *(pr[X]+j+7+2*i), *(pr[X]+j+13+2*i) ); + } for ( int i=0; i<3; i++ ) { __m128d buf_re = _mm_set1_pd( D_pt[0+6*i] ); __m128d buf_im = _mm_set1_pd( D_pt[1+6*i] ); cmul_pd( buf_re, buf_im, v_re[0], v_im[0], &res_re[i], &res_im[i] ); + cmul_pd( buf_re, buf_im, v_re[3], v_im[3], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[2+6*i] ); buf_im = _mm_set1_pd( D_pt[3+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[1], v_im[1], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[4], v_im[4], &res_re[i+3], &res_im[i+3] ); buf_re = _mm_set1_pd( D_pt[4+6*i] ); buf_im = _mm_set1_pd( D_pt[5+6*i] ); cfmadd_pd( buf_re, buf_im, v_re[2], v_im[2], &res_re[i], &res_im[i] ); + cfmadd_pd( buf_re, buf_im, v_re[5], v_im[5], &res_re[i+3], &res_im[i+3] ); } D_pt += 18; nb_pt++; } { - __m128d in[6]; + __m128d in[12]; in[0] = _mm_unpacklo_pd( res_re[0], res_im[0] ); in[1] = _mm_unpacklo_pd( res_re[1], res_im[1] ); in[2] = _mm_unpacklo_pd( res_re[2], res_im[2] ); @@ -700,38 +1823,54 @@ void su3_pbp_double( complex_double* eta, complex_double *prn[4], operator_doubl in[4] = _mm_unpackhi_pd( res_re[1], res_im[1] ); in[5] = _mm_unpackhi_pd( res_re[2], res_im[2] ); - for ( int i=0; i<6; i++ ) { + in[6] = _mm_unpacklo_pd( res_re[3], res_im[3] ); + in[7] = _mm_unpacklo_pd( res_re[4], res_im[4] ); + in[8] = _mm_unpacklo_pd( res_re[5], res_im[5] ); + + in[9] = _mm_unpackhi_pd( res_re[3], res_im[3] ); + in[10] = _mm_unpackhi_pd( res_re[4], res_im[4] ); + in[11] = _mm_unpackhi_pd( res_re[5], res_im[5] ); + + for ( int i=0; i<12; i++ ) { res[i] = _mm_sub_pd( res[i], in[i] ); } for ( int i=0; i<3; i++ ) { __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i]) ); - res[3*gamma_co[X][0]+i] = _mm_sub_pd( res[3*gamma_co[X][0]+i], buf1 ); + res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+i], buf1 ); } for ( int i=0; i<3; i++ ) { - __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[3+i]) ); - res[3*gamma_co[X][1]+i] = _mm_sub_pd( res[3*gamma_co[X][1]+i], buf1 ); + __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+3]) ); + res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma0[X], GAMMA_X_SHUFFLE(in[i+6]) ); + res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][0]+6*(gamma_co[X][0]/2)+6+i], buf1 ); + } + for ( int i=0; i<3; i++ ) { + __m128d buf1 = _mm_mul_pd( gamma1[X], GAMMA_X_SHUFFLE(in[i+9]) ); + res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i] = _mm_sub_pd( res[3*gamma_co[X][1]+6*(gamma_co[X][1]/2)+6+i], buf1 ); } } } // --------------- - for ( int i=0; i<12; i++ ) { + for ( int i=0; i<24; i++ ) { 
_mm_storeu_pd( eta_pt + 2*i, res[i] ); } - eta_pt+=24; + eta_pt+=48; } } -void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, +void su3_dpbp_float( complex_float* eta, complex_float *prn[4], operator_float_struct *op, int *neighbor, int start, int end ) { - float *D_pt = (float*)(op->D_vectorized+2*(start*4)); + float *D_pt = (float*)(op->D_vectorized+2*(start/24*48)); float *eta_pt = (float*)(eta+start); float *eta_end_pt = (float*)(eta+end); float *pr[4] = {(float*)(prn[0]),(float*)(prn[1]),(float*)(prn[2]),(float*)(prn[3])}; - int *nb_pt = neighbor+((start/12)*4); + int *nb_pt = neighbor+((start/24)*4); __m128 gamma0[4][2]; __m128 gamma1[4][2]; @@ -749,21 +1888,29 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 eta_lo2 = _mm_loadu_ps( eta_pt + 4 ); __m128 eta_hi1 = _mm_loadu_ps( eta_pt + 6 ); __m128 eta_hi2 = _mm_loadu_ps( eta_pt + 10 ); + __m128 eta_lo3 = _mm_loadu_ps( eta_pt + 12 ); + __m128 eta_lo4 = _mm_loadu_ps( eta_pt + 16 ); + __m128 eta_hi3 = _mm_loadu_ps( eta_pt + 18 ); + __m128 eta_hi4 = _mm_loadu_ps( eta_pt + 22 ); - __m128 eta2_lo[2]; - __m128 eta2_hi[2]; + __m128 eta2_lo[4]; + __m128 eta2_hi[4]; - eta2_lo[0] = _mm_loadu_ps( eta_pt + 12 ); - eta2_hi[0] = _mm_loadu_ps( eta_pt + 14 ); - eta2_lo[1] = _mm_loadu_ps( eta_pt + 18 ); - eta2_hi[1] = _mm_loadu_ps( eta_pt + 20 ); + eta2_lo[0] = _mm_loadu_ps( eta_pt + 24 ); + eta2_hi[0] = _mm_loadu_ps( eta_pt + 26 ); + eta2_lo[1] = _mm_loadu_ps( eta_pt + 30 ); + eta2_hi[1] = _mm_loadu_ps( eta_pt + 32 ); + eta2_lo[2] = _mm_loadu_ps( eta_pt + 36 ); + eta2_hi[2] = _mm_loadu_ps( eta_pt + 38 ); + eta2_lo[3] = _mm_loadu_ps( eta_pt + 42 ); + eta2_hi[3] = _mm_loadu_ps( eta_pt + 44 ); for ( int mu=0; mu<4; mu++ ) { - __m128 res1[2]; - __m128 res2[2]; + __m128 res1[4]; + __m128 res2[4]; { - int j = 2*6*(*nb_pt); + int j = 2*12*(*nb_pt); // load 1st part of su(3) matrix and multiply { __m128 buf1 = _mm_loadu_ps( D_pt ); @@ -778,6 +1925,16 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+7) ); cmul( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+12) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+13) ); + cmul( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+18) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+19) ); + cmul( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); + } } // load 2nd part of su(3) matrix and multiply { @@ -793,6 +1950,16 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+9) ); cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+14) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+15) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+20) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+21) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); + } } // load 3rd part of su(3) matrix and multiply { @@ -808,6 +1975,16 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+11) ); cfmadd( buf1, buf2, buf3, buf4, &res2[0], &res2[1] ); } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+16) ); + __m128 buf4 = _mm_set1_ps( *(pr[mu]+j+17) ); + cfmadd( buf1, buf2, buf3, buf4, &res1[2], &res1[3] ); + } + { + __m128 buf3 = _mm_set1_ps( *(pr[mu]+j+22) ); + __m128 buf4 = _mm_set1_ps( 
*(pr[mu]+j+23) ); + cfmadd( buf1, buf2, buf3, buf4, &res2[2], &res2[3] ); + } } } @@ -819,17 +1996,33 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st eta_lo1 = _mm_sub_ps( eta_lo1, buf1 ); eta_lo2 = _mm_sub_ps( eta_lo2, buf2 ); } + { + __m128 buf1 = _mm_unpacklo_ps( res1[2], res1[3] ); + __m128 buf2 = _mm_unpackhi_ps( res1[2], res1[3] ); + eta_lo3 = _mm_sub_ps( eta_lo3, buf1 ); + eta_lo4 = _mm_sub_ps( eta_lo4, buf2 ); + } // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); - __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); - eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); + { + __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[gamma_offset[mu][0]] ); + __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[1-gamma_offset[mu][0]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[gamma_co[mu][2]], buf3 ); + eta2_hi[gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[gamma_co[mu][2]], buf4 ); + } + { + __m128 buf1 = _mm_mul_ps( gamma0[mu][0], res1[2+gamma_offset[mu][0]] ); + __m128 buf2 = _mm_mul_ps( gamma0[mu][1], res1[3-gamma_offset[mu][0]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][2]], buf3 ); + eta2_hi[2+gamma_co[mu][2]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][2]], buf4 ); + } } - { // store spin1 contribution { @@ -838,17 +2031,33 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st eta_hi1 = _mm_sub_ps( eta_hi1, buf1 ); eta_hi2 = _mm_sub_ps( eta_hi2, buf2 ); } + { + __m128 buf1 = _mm_unpacklo_ps( res2[2], res2[3] ); + __m128 buf2 = _mm_unpackhi_ps( res2[2], res2[3] ); + eta_hi3 = _mm_sub_ps( eta_hi3, buf1 ); + eta_hi4 = _mm_sub_ps( eta_hi4, buf2 ); + } // store contribution from 1st SU(3) multiplication to either spin2 or spin3 - __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); - __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); - __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); - __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); - buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) - eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); - eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); + { + __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[gamma_offset[mu][1]] ); + __m128 buf2 = _mm_mul_ps( gamma1[mu][1], res2[1-gamma_offset[mu][1]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[gamma_co[mu][3]], buf3 ); + eta2_hi[gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[gamma_co[mu][3]], buf4 ); + } + { + __m128 buf1 = _mm_mul_ps( gamma1[mu][0], res2[2+gamma_offset[mu][1]] ); + __m128 buf2 = _mm_mul_ps( gamma1[mu][1], 
res2[3-gamma_offset[mu][1]] ); + __m128 buf3 = _mm_unpacklo_ps( buf1, buf2 ); + __m128 buf4 = _mm_unpackhi_ps( buf1, buf2 ); + buf4 = _mm_shuffle_ps( buf3, buf4, _MM_SHUFFLE(1,0,3,2) ); // hi(buf3), lo(buf4) + eta2_lo[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_lo[2+gamma_co[mu][3]], buf3 ); + eta2_hi[2+gamma_co[mu][3]] = _mm_sub_ps( eta2_hi[2+gamma_co[mu][3]], buf4 ); + } } - nb_pt++; D_pt += 24; } @@ -857,18 +2066,25 @@ void su3_pbp_float( complex_float* eta, complex_float *prn[4], operator_float_st _mm_storeu_ps( eta_pt+4, eta_lo2 ); _mm_storeu_ps( eta_pt+6, eta_hi1 ); _mm_storeu_ps( eta_pt+10, eta_hi2 ); - _mm_storeu_ps( eta_pt+12, eta2_lo[0] ); - _mm_storeu_ps( eta_pt+14, eta2_hi[0] ); - _mm_storeu_ps( eta_pt+18, eta2_lo[1] ); - _mm_storeu_ps( eta_pt+20, eta2_hi[1] ); + _mm_storeu_ps( eta_pt+12, eta_lo3 ); + _mm_storeu_ps( eta_pt+16, eta_lo4 ); + _mm_storeu_ps( eta_pt+18, eta_hi3 ); + _mm_storeu_ps( eta_pt+22, eta_hi4 ); + _mm_storeu_ps( eta_pt+24, eta2_lo[0] ); + _mm_storeu_ps( eta_pt+26, eta2_hi[0] ); + _mm_storeu_ps( eta_pt+30, eta2_lo[1] ); + _mm_storeu_ps( eta_pt+32, eta2_hi[1] ); + _mm_storeu_ps( eta_pt+36, eta2_lo[2] ); + _mm_storeu_ps( eta_pt+38, eta2_hi[2] ); + _mm_storeu_ps( eta_pt+42, eta2_lo[3] ); + _mm_storeu_ps( eta_pt+44, eta2_hi[3] ); - eta_pt += 24; + eta_pt += 48; } } - void block_oddeven_plus_coupling_double( double *eta, double *D, double *phi, int mu, int start, int end, int *ind, int *neighbor ) { } void block_oddeven_pT_coupling_float( float *eta, float *D, float *phi, int start, int end, int *ind, int *neighbor ) { #define UPD _mm_sub_ps @@ -1266,7 +2482,7 @@ static inline int sse_clover_imag_index( int i, int j ) { void sse_set_clover_double( double *out, complex_double *in ) { } -void sse_set_clover_float( float *out, complex_double *in ) { +void sse_set_clover_float( float *out, complex_float *in ) { int index; float sign = 0.0; @@ -1274,7 +2490,7 @@ void sse_set_clover_float( float *out, complex_double *in ) { for ( int j=0; j<6; j++ ) { for ( int i=0; iclover+start:op->clover+(start/12)*42; - - clover_double( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - add_diagonal_double( leta, lphi, op->tm_term+start, end-start ); -#endif +void sse_set_clover_doublet_double( double *out, complex_double *in ) { } +void sse_set_clover_doublet_float( float *out, complex_float *in ) { + + int index, d; + float sign = 0.0; + for ( int k=0; k<12; k+=SIMD_LENGTH_float ) { + for ( int j=0; j<6; j++ ) { + for ( int i=0; i i+k ) { + // upper triangle + index = 12 + ( 30 - (5-(k+i))*(6-(k+i)) )/2 + (j-(i+k+1)); + sign = 1.0; + } else { + // lower triangle, j < i+k + index = 12 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k)-(j+1)); + sign = -1.0; + } + } else { + // i+k >= 6 + // second 6-by-6 matrix + if ( j > i+k-6 ) { + // upper triangle + index = 12 + 15 + ( 30 - (5-(k+i-6))*(6-(k+i-6)) )/2 + (j-(i+k-6+1)); + sign = 1.0; + } else { + // j < i+k-6 + // lower triangle + index = 12 + 15 + ( 30 - (5-(j))*(6-(j)) )/2 + ((i+k-6)-(j+1)); + sign = -1.0; + } + } + d=(i+k<6)?0:6; + out[ sse_clover_real_index(i+k+d,j) ] = creal_float( in[index] ); + out[ sse_clover_imag_index(i+k+d,j) ] = sign*cimag_float( in[index] ); + out[ sse_clover_real_index(i+k+d+6,j) ] = creal_float( in[index] ); + out[ sse_clover_imag_index(i+k+d+6,j) ] = sign*cimag_float( in[index] ); + } + } + } } +void sse_add_diagonal_clover_double( double *out, complex_double *diag ) { } -void sse_clover_float( vector_float eta, vector_float phi, operator_float_struct *op, int start, int end, - level_struct *l, struct 
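
[Editor's sketch restating the indexing above, not new patch code: sse_set_clover_doublet_float addresses the packed clover term through index = 12 + (30-(5-(k+i))*(6-(k+i)))/2 + (j-(i+k+1)), i.e. 12 diagonal entries followed by the strictly-upper triangles of the two 6x6 chirality blocks, row by row; each value is written twice because the doublet's two flavours share the same clover.]

/* packed index of the off-diagonal element (row, col), col > row, of chirality
 * block 0 or 1; rows 0..row-1 occupy 5+4+... = (30-(5-row)*(6-row))/2 slots */
static int clover_offdiag_index( int block, int row, int col ) {
  return 12 + 15*block + ( 30 - (5-row)*(6-row) )/2 + ( col - (row+1) );
}
/* examples: block 0, (0,1) -> 12;  block 0, (1,2) -> 17;  block 1, (0,1) -> 27 */
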
Thread *threading ) { +void sse_add_diagonal_clover_float( float *out, complex_float *diag ) { + for ( int k=0; k<12; k++ ) { + out[ sse_clover_real_index(k,k%6) ] += creal_float( diag[k] ); + out[ sse_clover_imag_index(k,k%6) ] += cimag_float( diag[k] ); + } +} - if ( g.csw == 0.0 ) { - vector_float lphi = phi+start, leta = eta+start; - config_float clover = (g.csw==0.0)?op->clover+start:op->clover+(start/12)*42; +void sse_add_diagonal_clover_doublet_double( double *out, complex_double *diag ) { } - clover_float( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - add_diagonal_float( leta, lphi, op->tm_term+start, end-start ); -#endif - } else { - float *clov = op->clover_vectorized; - for ( int i=start; inum_block_sites, *length = s->dir_length, **index = s->index, *neighbor = s->op.neighbor_table; - vector_PRECISION lphi = phi+start, leta = eta+start; -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - PRECISION *Dplus = s->op.D_vectorized + (start/12)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/12)*96; -#else - int j, k, *ind; - complex_PRECISION buf1[25]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, *buf2=buf1+6, *buf3=buf2+6, *buf4=buf3+6; - config_PRECISION D_pt; - config_PRECISION D = s->op.D + (start/12)*36; -#endif - - // clover term -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - sse_clover_PRECISION(eta, phi, &(s->op), start, start+12*n, l, no_threading ); -#else - config_PRECISION clover = (g.csw==0.0)?s->op.clover+start:s->op.clover+(start/12)*42; - clover_PRECISION( leta, lphi, clover, 12*n, l, no_threading ); -#ifdef HAVE_TM - add_diagonal_PRECISION( leta, lphi, s->op.tm_term+start, 12*n ); -#endif -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - for ( int mu=0; mu<4; mu++ ) { - block_oddeven_plus_coupling_PRECISION( (PRECISION*)leta, Dplus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)leta, Dminus, (PRECISION*)lphi, mu, 0, length[mu], index[mu], neighbor ); - } -#else - // inner block couplings - ind = index[T]; // T direction - for ( i=0; inum_inner_lattice_sites, *neighbor = op->neighbor_table, start, end; -#ifndef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int i, j, *nb_pt; - complex_PRECISION pbuf[6]; - vector_PRECISION phi_pt, eta_pt, end_pt; - config_PRECISION D_pt; -#endif - - compute_core_start_end(0, 12*n, &start, &end, l, threading ); - - SYNC_MASTER_TO_ALL(threading) - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - sse_clover_PRECISION(eta, phi, op, start, end, l, threading ); -#else - vector_PRECISION lphi = phi+start, leta = eta+start; - config_PRECISION clover = (g.csw==0.0)?op->clover+start:op->clover+(start/12)*42; - clover_PRECISION( leta, lphi, clover, end-start, l, threading ); -#ifdef HAVE_TM - add_diagonal_PRECISION( leta, lphi, op->tm_term+start, end-start ); -#endif -#endif - - START_MASTER(threading) - PROF_PRECISION_START( _NC ); - END_MASTER(threading) - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - prp_PRECISION( prn, phi, start, end ); -#else - for ( i=start/2, phi_pt=phi+start; iprnT+i, phi_pt ); - prp_Z_PRECISION( op->prnZ+i, phi_pt ); - prp_Y_PRECISION( op->prnY+i, phi_pt ); - prp_X_PRECISION( op->prnX+i, phi_pt ); - } -#endif - // start communication in negative direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); - 
ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); - END_LOCKED_MASTER(threading) - - // project plus dir and multiply with U dagger -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; - prn_su3_PRECISION( prp, phi, op, neighbor, start, end ); -#else - for ( phi_pt=phi+start, end_pt=phi+end, D_pt = op->D+(start*3), nb_pt=neighbor+((start/12)*4); phi_ptprpT+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpT+j+3, D_pt, pbuf+3 ); D_pt += 9; - // Z dir - j = 6*(*nb_pt); nb_pt++; - prn_Z_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpZ+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpZ+j+3, D_pt, pbuf+3 ); D_pt += 9; - // Y dir - j = 6*(*nb_pt); nb_pt++; - prn_Y_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpY+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpY+j+3, D_pt, pbuf+3 ); D_pt += 9; - // X dir - j = 6*(*nb_pt); nb_pt++; - prn_X_PRECISION( pbuf, phi_pt ); - mvmh_PRECISION( op->prpX+j, D_pt, pbuf ); - mvmh_PRECISION( op->prpX+j+3, D_pt, pbuf+3 ); D_pt += 9; - } -#endif - - // start communication in positive direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); - // wait for communication in negative direction - ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), _FULL_SYSTEM, l ); - END_LOCKED_MASTER(threading) - - // multiply with U and lift up minus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - su3_pbp_PRECISION( eta, prn, op, neighbor, start, end ); -#else - for ( eta_pt=eta+start, end_pt=eta+end, D_pt = op->D+start*3, nb_pt=neighbor+(start/12)*4; eta_ptprnT+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnT+j+3 ); - pbp_su3_T_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Z dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnZ+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnZ+j+3 ); - pbp_su3_Z_PRECISION( pbuf, eta_pt ); D_pt += 9; - // Y dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnY+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnY+j+3 ); - pbp_su3_Y_PRECISION( pbuf, eta_pt ); D_pt += 9; - // X dir - j = 6*(*nb_pt); nb_pt++; - mvm_PRECISION( pbuf, D_pt, op->prnX+j ); - mvm_PRECISION( pbuf+3, D_pt, op->prnX+j+3 ); - pbp_su3_X_PRECISION( pbuf, eta_pt ); D_pt += 9; - } -#endif - - // wait for communication in positive direction - START_LOCKED_MASTER(threading) - ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), _FULL_SYSTEM, l ); - ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), _FULL_SYSTEM, l ); - END_LOCKED_MASTER(threading) - - // lift up plus dir -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - pbn_PRECISION( eta, prp, start, end ); -#else - for ( i=start/2, eta_pt=eta+start; iprpT+i, eta_pt ); - pbn_su3_Z_PRECISION( op->prpZ+i, eta_pt ); - pbn_su3_Y_PRECISION( op->prpY+i, eta_pt ); - pbn_su3_X_PRECISION( op->prpX+i, eta_pt ); - } -#endif - - START_MASTER(threading) - PROF_PRECISION_STOP( 
_NC, 1 ); - END_MASTER(threading) - - SYNC_MASTER_TO_ALL(threading) -} -#endif - - -#endif - diff --git a/src/sse_dirac_su3local.h b/src/sse_dirac_su3local.h index 437eae6..8e1f8ad 100644 --- a/src/sse_dirac_su3local.h +++ b/src/sse_dirac_su3local.h @@ -19,6 +19,297 @@ * */ +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) { +#ifdef BOUNDARY + for ( int i=start; inum_eig_vect; - - MALLOC( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, n ); - -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, n ); - l->is_PRECISION.interpolation[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size, 128 ); - for ( k=1; kis_PRECISION.interpolation[k] = l->is_PRECISION.interpolation[0] + k*l->vector_size; -#endif - // ghost shell is communicated in coarse_operator_setup, so we need size=vector_size, not inner_vector_size - MALLOC_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, - ((size_t)OPERATOR_COMPONENT_OFFSET_PRECISION)*((size_t)l->vector_size), 128 ); - - l->is_PRECISION.test_vector[0] = NULL; - MALLOC_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size, 128 ); - for ( k=1; kis_PRECISION.test_vector[k] = l->is_PRECISION.test_vector[0] + k*l->inner_vector_size; - } -} - - -void interpolation_PRECISION_dummy_alloc( level_struct *l ) { - - MALLOC( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - MALLOC( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_dummy_free( level_struct *l ) { - - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, l->num_eig_vect ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, l->num_eig_vect ); -} - - -void interpolation_PRECISION_free( level_struct *l ) { - - int n = l->num_eig_vect; - - FREE_HUGEPAGES( l->is_PRECISION.test_vector[0], complex_PRECISION, n*l->inner_vector_size ); - FREE( l->is_PRECISION.eigenvalues, complex_PRECISION, n ); - FREE( l->is_PRECISION.test_vector, complex_PRECISION*, n ); -#ifndef INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_PRECISION - FREE_HUGEPAGES( l->is_PRECISION.interpolation[0], complex_PRECISION, n*l->vector_size ); - FREE( l->is_PRECISION.interpolation, complex_PRECISION*, n ); -#endif - FREE_HUGEPAGES( l->is_PRECISION.operator, complex_PRECISION, OPERATOR_COMPONENT_OFFSET_PRECISION*l->vector_size ); -} - - -void swap8_PRECISION( PRECISION* data ) { - - int i; - PRECISION tmp[8]; - - for ( i=0; i<4; i++ ) { - tmp[i] = data[2*i]; - tmp[i+4] = data[2*i+1]; - } - - for ( i=0; i<8; i++ ) { - data[i] = tmp[i]; - } -} - - -void define_interpolation_PRECISION_operator( complex_PRECISION **interpolation, level_struct *l, struct Thread *threading ) { - - int j, num_eig_vect = l->num_eig_vect; - complex_PRECISION *operator = l->is_PRECISION.operator; - - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - - SYNC_CORES(threading) - int offset = SIMD_LENGTH_PRECISION; - for ( j=0; j num_eig_vect) - j_end = num_eig_vect; - - operator = l->is_PRECISION.operator + j*l->vector_size + start*offset; - - for ( int i=start; iis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - 
START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - START_LOCKED_MASTER(threading) - vector_PRECISION_distribute( phi_c_pt, phi_c, l->next_level ); - END_LOCKED_MASTER(threading) - SYNC_HYPERTHREADS(threading) - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - float tmp_phi_c_re[2*OPERATOR_COMPONENT_OFFSET_float]; - float tmp_phi_c_im[2*OPERATOR_COMPONENT_OFFSET_float]; - __m128 zero = _mm_setzero_ps(); - for ( j=0; j<2*OPERATOR_COMPONENT_OFFSET_PRECISION; j+=SIMD_LENGTH_PRECISION ) { - _mm_store_ps(tmp_phi_c_re+j, zero); - _mm_store_ps(tmp_phi_c_im+j, zero); - } - // copy phi_c into temporary - for ( j=0; jis_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - for ( k=0; kis_PRECISION.num_agg, num_eig_vect = l->num_eig_vect, - num_parent_eig_vect = l->num_lattice_site_var/2, aggregate_sites = l->num_inner_lattice_sites / num_aggregates; - complex_PRECISION *operator = l->is_PRECISION.operator, *phi_pt = phi, - *phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer; - - for ( i=threading->n_thread*threading->core + threading->thread; in_core*threading->n_thread ) { - - phi_pt = phi + i*2*num_parent_eig_vect*aggregate_sites; - phi_c_pt = l->next_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - - int offset = SIMD_LENGTH_PRECISION; - // loop over blocks of SIMD_LENGTH_PRECISION vectors - for ( j=0; jnext_level->gs_PRECISION.transfer_buffer + i*2*num_eig_vect; - operator = l->is_PRECISION.operator + j*l->vector_size + i*2*offset*num_parent_eig_vect*aggregate_sites; - - // temporary, so we can used aligned load/store, and don't have to mess around with deinterleaving - // complex components and masking - // factor 2 is for low/high (refers to spin components being split up for preserving gamma5-symmetry of coarse operator) - float tmp_phi_c_re[2*offset]; - float tmp_phi_c_im[2*offset]; - __m128 zero = _mm_setzero_ps(); - for ( k1=0; k1<2*offset; k1+=offset ) { - _mm_store_ps(tmp_phi_c_re+k1, zero); - _mm_store_ps(tmp_phi_c_im+k1, zero); - } - - for ( k=0; k broadcast - __m128 phi_re = _mm_set1_ps(((float *)phi_pt)[0]); - __m128 phi_im = _mm_set1_ps(((float *)phi_pt)[1]); - - __m128 operator_re = _mm_load_ps((float *)operator); - __m128 operator_im = _mm_load_ps((float 
*)operator+offset); - __m128 phi_c_re = _mm_load_ps(tmp_phi_c_re+low_high_offset); - __m128 phi_c_im = _mm_load_ps(tmp_phi_c_im+low_high_offset); - - cfmadd_conj(operator_re, operator_im, phi_re, phi_im, &phi_c_re, &phi_c_im); - - _mm_store_ps(tmp_phi_c_re+low_high_offset, phi_c_re); - _mm_store_ps(tmp_phi_c_im+low_high_offset, phi_c_im); - // skip to next real line of matrix - operator += offset; - phi_pt++; - } - low_high_offset = offset; - } - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+j+m))[0] = tmp_phi_c_re[m]; - ((float*)(phi_c_pt+j+m))[1] = tmp_phi_c_im[m]; - } - - for ( int m=0; m= num_eig_vect ) break; - ((float*)(phi_c_pt+num_eig_vect+j+m))[0] = tmp_phi_c_re[m+offset]; - ((float*)(phi_c_pt+num_eig_vect+j+m))[1] = tmp_phi_c_im[m+offset]; - } - } - } - - SYNC_HYPERTHREADS(threading) - START_LOCKED_MASTER(threading) - vector_PRECISION_gather( phi_c, l->next_level->gs_PRECISION.transfer_buffer, l->next_level ); - END_LOCKED_MASTER(threading) - PROF_PRECISION_STOP( _PR, 1, threading ); -} - -#endif // defined( SSE ) && defined( INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_PRECISION ) diff --git a/src/sse_interpolation_generic.h b/src/sse_interpolation_generic.h deleted file mode 100644 index 2db7a86..0000000 --- a/src/sse_interpolation_generic.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_INTERPOLATION_PRECISION_HEADER - #define SSE_INTERPOLATION_PRECISION_HEADER - - #ifdef SSE - void interpolation_PRECISION_alloc( level_struct *l ); - void interpolation_PRECISION_free( level_struct *l ); - void interpolation_PRECISION_dummy_alloc( level_struct *l ); - void interpolation_PRECISION_dummy_free( level_struct *l ); - - void interpolate_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void interpolate3_PRECISION( vector_PRECISION phi, vector_PRECISION phi_c, level_struct *l, Thread *threading ); - void restrict_PRECISION( vector_PRECISION phi_c, vector_PRECISION phi, level_struct *l, Thread *threading ); -#endif - -#endif \ No newline at end of file diff --git a/src/sse_linalg.c b/src/sse_linalg.c deleted file mode 100644 index bf0f9d6..0000000 --- a/src/sse_linalg.c +++ /dev/null @@ -1,795 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
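The deleted vector_double_scale kernel below broadcasts creal(alpha) and cimag(alpha) into separate registers once and then sweeps the interleaved complex data. A minimal stand-alone sketch of that idea; it uses an SSE3 addsub variant rather than the library's exact shuffle sequence, and the function name is illustrative only.

#include <pmmintrin.h>   /* SSE3: _mm_addsub_pd (also pulls in SSE2) */
#include <complex.h>

/* z[i] = alpha * x[i] for interleaved complex doubles (re,im pairs).
 * alpha is broadcast once into separate real/imag registers; n is the
 * number of complex entries. */
static void complex_scale_sse(double complex *z, const double complex *x,
                              double complex alpha, int n) {
  __m128d a_re = _mm_set1_pd(creal(alpha));
  __m128d a_im = _mm_set1_pd(cimag(alpha));
  for (int i = 0; i < n; i++) {
    __m128d v   = _mm_loadu_pd((const double *)(x + i));   /* (re, im)           */
    __m128d vsw = _mm_shuffle_pd(v, v, 1);                  /* (im, re)           */
    __m128d t   = _mm_mul_pd(a_re, v);                      /* (a_re*re, a_re*im) */
    __m128d u   = _mm_mul_pd(a_im, vsw);                    /* (a_im*im, a_im*re) */
    /* addsub: lane0 = t0-u0, lane1 = t1+u1 -> (re', im') of alpha*x[i] */
    _mm_storeu_pd((double *)(z + i), _mm_addsub_pd(t, u));
  }
}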
- * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_LINALG_double -void vector_double_scale( vector_double z, vector_double x, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_double_START( _LA6 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - double *zd = (double*)(z+start); - double *xd = (double*)(x+start); - - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_scale( vector_float z, vector_float x, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if(thread == 0 && start != end) - PROF_float_START( _LA6 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - float *zf = (float*)(z+start); - float *xf = (float*)(x+start); - - if ( l->depth == 0 ) { - for( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void vector_float_saxpy( vector_float z, vector_float x, vector_float y, complex_float alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_float_START( _LA8 ); - - __m128 alpha_re = _mm_set1_ps( creal_float(alpha) ); - __m128 alpha_im = _mm_set1_ps( cimag_float(alpha) ); - - if ( l->depth == 0 ) { - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_saxpy( vector_double z, vector_double x, vector_double y, complex_double alpha, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - __m128d alpha_re = _mm_set1_pd( creal_double(alpha) ); - __m128d alpha_im = _mm_set1_pd( cimag_double(alpha) ); - - for ( int i=start; iinner_vector_size ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -complex_double global_inner_product_double( vector_double phi, vector_double psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - complex_double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128d alpha_re = _mm_setzero_pd(); - __m128d alpha_im = _mm_setzero_pd(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_double *)threading->workspace)[0] += ((complex_double *)threading->workspace)[i]; - local_alpha = ((complex_double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - 
PROF_double_STOP( _ALLR, 1 ); - ((complex_double *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -complex_float global_inner_product_float( vector_float phi, vector_float psi, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - complex_float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha_re = _mm_setzero_ps(); - __m128 alpha_im = _mm_setzero_ps(); - - float *phif = (float*)(phi+thread_start); - float *psif = (float*)(psi+thread_start); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((complex_float *)threading->workspace)[0] += ((complex_float *)threading->workspace)[i]; - local_alpha = ((complex_float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_COMPLEX_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((complex_float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return global_alpha; - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((complex_float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return local_alpha; - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -double global_norm_double( vector_double x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _GIP, threading ); - - double local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - VECTOR_FOR( int i=thread_start, iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((double *)threading->workspace)[0] += ((double *)threading->workspace)[i]; - local_alpha = ((double *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_double_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_double, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_double.level_comm ); - PROF_double_STOP( _ALLR, 1 ); - ((double *)threading->workspace)[0] = global_alpha; 
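The deleted global_norm_* and global_inner_product_* routines all follow the same two-stage pattern: per-core partial sums are written into threading->workspace, the master core adds them up, and one MPI_Allreduce combines the ranks. A simplified sketch of that reduction, using an OpenMP reduction clause in place of the library's core barriers and shared workspace:

#include <math.h>
#include <mpi.h>

/* Two-stage norm: thread-local partial sums, then a single Allreduce.
 * MPI_Comm and the data layout are generic here, not the library's. */
static double global_norm_sketch(const double *x, long n, MPI_Comm comm) {
  double local = 0.0;
  #pragma omp parallel for reduction(+:local)
  for (long i = 0; i < n; i++)
    local += x[i] * x[i];

  double global = 0.0;
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, comm);
  return sqrt(global);
}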
- END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((double *)threading->workspace)[0]; - PROF_double_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (double)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_float -float global_norm_float( vector_float x, int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GIP, threading ); - - float local_alpha = 0, global_alpha = 0; - - int thread_start; - int thread_end; - compute_core_start_end(start, end, &thread_start, &thread_end, l, threading); - - SYNC_CORES(threading) - - __m128 alpha = _mm_setzero_ps(); - - if ( l->depth == 0 ) { - for( int i=thread_start; iworkspace)[threading->core] = local_alpha; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) - ((float *)threading->workspace)[0] += ((float *)threading->workspace)[i]; - local_alpha = ((float *)threading->workspace)[0]; - END_MASTER(threading) - - if ( g.num_processes > 1 ) { - START_MASTER(threading) - PROF_float_START( _ALLR ); - MPI_Allreduce( &local_alpha, &global_alpha, 1, MPI_float, MPI_SUM, (l->depth==0)?g.comm_cart:l->gs_float.level_comm ); - PROF_float_STOP( _ALLR, 1 ); - ((float *)threading->workspace)[0] = global_alpha; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - global_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)global_alpha); - } else { - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - local_alpha = ((float *)threading->workspace)[0]; - PROF_float_STOP( _GIP, (double)(end-start)/(double)l->inner_vector_size, threading ); - return (float)sqrt((double)local_alpha); - } -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void vector_double_multi_saxpy( vector_double z, vector_double *V, complex_double *alpha, - int sign, int count, int start, int end, level_struct *l ) { - - int thread = omp_get_thread_num(); - if (thread == 0 && start != end ) - PROF_double_START( _LA8 ); - - int flag = 0; - __m128d alpha_re[count]; __m128d alpha_im[count]; - for ( int c=0; c EPS_double || -cimag_double(alpha[c]) > EPS_double ) - flag = 1; - } - - if ( flag == 0 ) { - for ( int c=0; c EPS_float || -cimag_float(alpha[c]) > EPS_float ) - flag = 1; - } - - if ( l->depth == 0 ) { - if ( flag == 0 ) { - for ( int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(end-start)/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_float -void process_multi_inner_product_float( int count, complex_float *results, vector_float *phi, vector_float psi, - int start, int 
end, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_float **)threading->workspace)[0][c] += ((complex_float **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_float_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#ifdef OPTIMIZED_LINALG_double -void process_multi_inner_product_double( int count, complex_double *results, vector_double *phi, vector_double psi, - int start, int end, level_struct *l, struct Thread *threading ) { - - PROF_double_START( _PIP, threading ); - int i; - for(int c=0; cdepth == 0 ) { - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, 12); - for(int c=0; cworkspace)[threading->core] = results; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int c=0; cn_core; i++) - ((complex_double **)threading->workspace)[0][c] += ((complex_double **)threading->workspace)[i][c]; - END_MASTER(threading) - // all threads need the result of the norm - SYNC_MASTER_TO_ALL(threading) - for(int c=0; cworkspace)[0][c]; - - PROF_double_STOP( _PIP, (double)(count*(end-start))/(double)l->inner_vector_size, threading ); -} -#endif - -#endif // SSE - diff --git a/src/sse_linalg.h b/src/sse_linalg.h deleted file mode 100644 index cd88fad..0000000 --- a/src/sse_linalg.h +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
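The sse_aggregate_gram_schmidt_* kernels declared in the header removed below vectorize classical Gram-Schmidt applied aggregate by aggregate to the test vectors. A plain-C reference of that operation; the contiguous column layout used here is illustrative and not the library's packed SIMD layout.

#include <complex.h>
#include <math.h>

/* Orthonormalize num_vec test vectors restricted to one aggregate of
 * length agg_size; column j starts at V + j*agg_size. */
static void aggregate_gram_schmidt_ref(float complex *V, int num_vec, int agg_size) {
  for (int j = 0; j < num_vec; j++) {
    float complex *vj = V + (long)j * agg_size;
    /* remove components along the previously orthonormalized vectors */
    for (int k = 0; k < j; k++) {
      const float complex *vk = V + (long)k * agg_size;
      float complex dot = 0.0f;
      for (int i = 0; i < agg_size; i++) dot += conjf(vk[i]) * vj[i];
      for (int i = 0; i < agg_size; i++) vj[i] -= dot * vk[i];
    }
    /* normalize */
    float norm = 0.0f;
    for (int i = 0; i < agg_size; i++) norm += crealf(conjf(vj[i]) * vj[i]);
    norm = sqrtf(norm);
    for (int i = 0; i < agg_size; i++) vj[i] /= norm;
  }
}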
- * - */ - -#ifndef LINALG_SSE_H -#define LINALG_SSE_H -#ifdef SSE - - -// Standard Gram-Schmidt on aggregates -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, - level_struct *l, struct Thread *threading ); -// Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt -static inline void sse_aggregate_gram_schmidt_block_float( float *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_dot_block_float( float *S, float *U, float *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); -// used by Block-Gram-Schmidt -static inline void sse_aggregate_block_minus_block_times_dot_float( float *B, float *U, float *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - -static inline void sse_aggregate_gram_schmidt_double( complex_double *V, const int num_vec, - level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_gram_schmidt_block_double( double *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_dot_block_double( double *S, double *U, double *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} -static inline void sse_aggregate_block_minus_block_times_dot_double( double *B, double *U, double *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) {} - - -static inline void sse_aggregate_gram_schmidt_float( complex_float *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - PROF_float_START( _GRAM_SCHMIDT_ON_AGGREGATES, threading ); - SYNC_CORES(threading) - SYNC_HYPERTHREADS(threading) - long int i, j, k, k1, k2, k3, num_aggregates = l->s_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm1, norm2; - float next_norm1; - float next_norm2; - int ldv = SIMD_LENGTH_float; - int V_block_offset = 2*l->vector_size; - - for ( j=threading->n_thread*threading->core+threading->thread; jn_thread*threading->n_core ) { - - v_pt1 = (float *)V + 0 + j*aggregate_size*2*ldv; - - next_norm1 = 0.0; - next_norm2 = 0.0; - for ( i=0; is_float.num_aggregates, - aggregate_size = l->inner_vector_size / num_aggregates, offset = l->num_lattice_site_var/2; - - float *v_pt1; - float *v_pt2; - float norm; - float next_norm; - int ldv = leading_dimension; - //offset = 6; - - - // current thread chooses an aggregate - for ( int jp=threading->core; jp<2*num_aggregates; jp+=threading->n_core ) { - j = jp/2; - int component = jp%2; - - - v_pt1 = V + 2*component*offset*ldv + j*aggregate_size*2*ldv; - - next_norm = 0.0; - - // for the whole aggregate - for ( i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; is_float.num_aggregates; - int aggregate_size = l->inner_vector_size / 
num_aggregates; - int offset = l->num_lattice_site_var/2; - - for ( int jp=threading->core; jpn_core ) { - int j = jp/2; - int component = jp%2; - // factors 2 are for complex and spin01/23 aggregates - Up = U + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - Bp = B + 2*component*offset*leading_dimension + 2*2*j*aggregate_size*leading_dimension; - __m128 U_re; - __m128 U_im; - __m128 B_re; - __m128 B_im; - __m128 S_re[SIMD_LENGTH_float]; - __m128 S_im[SIMD_LENGTH_float]; - for( int i=0; ivector_size), - (PRECISION *)(V + j*l->vector_size), vecs, l, threading ); - aggregate_gram_schmidt_block_PRECISION( (PRECISION *)(V + i*l->vector_size), vecs, SIMD_LENGTH_PRECISION, l, threading ); - } - SYNC_CORES(threading) - PROF_PRECISION_STOP( _GRAM_SCHMIDT_ON_AGGREGATES, 1, threading ); -} - - -void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ) { - - // the block version has some optimizations which are correct only on the fine grid - if(l->depth == 0) - aggregate_block_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); - else - aggregate_gram_schmidt_PRECISION_vectorized(V, num_vec, l, threading); -} - - -void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, int num_vec, level_struct *l, struct Thread *threading ) { - START_NO_HYPERTHREADS(threading) - - PRECISION *S = NULL; - START_LOCKED_MASTER(threading) - // factors 2 are for complex and spin01/23 aggregates - MALLOC_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION, 64); - ((PRECISION **)threading->workspace)[0] = S; - END_LOCKED_MASTER(threading) - S = ((PRECISION **)threading->workspace)[0]; - - aggregate_block_dot_block_PRECISION(S, U, B, num_vec, SIMD_LENGTH_PRECISION, l , threading); - aggregate_block_minus_block_times_dot_PRECISION(B, U, S, num_vec, SIMD_LENGTH_PRECISION, l , threading); - - START_LOCKED_MASTER(threading) - FREE_HUGEPAGES(S, PRECISION, 2*2*l->s_PRECISION.num_aggregates*SIMD_LENGTH_PRECISION*SIMD_LENGTH_PRECISION); - END_LOCKED_MASTER(threading) - - END_NO_HYPERTHREADS(threading) -} - - -void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_dot_block_PRECISION( S, U, B, num_vec, leading_dimension, l, threading ); -} - - -void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ) { - sse_aggregate_block_minus_block_times_dot_PRECISION( B, U, S, num_vec, leading_dimension, l, threading ); -} - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - for(int i=0; i<2*offset; i++) - thread_buffer[i] = 0.0; - - SYNC_CORES(threading) - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 
1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_v_re = _mm_mul_ps(gamma5[m], v_re); - gamma5_v_im = _mm_mul_ps(gamma5[m], v_im); - - cfmadd_conj(vj_re, vj_im, v_re, v_im, dot_re+j, dot_im+j); - cfmadd_conj(vj_re, vj_im, gamma5_v_re, gamma5_v_im, dot_gamma5_re+j, dot_gamma5_im+j); - } - } - } - } - for ( int j=0; jworkspace)[threading->core] = thread_buffer; - END_NO_HYPERTHREADS(threading) - // master sums up all results - SYNC_CORES(threading) - START_MASTER(threading) - for(int i=1; in_core; i++) { - for(int j=0; jworkspace)[0][j] += ((complex_PRECISION **)threading->workspace)[i][j]; - ((complex_PRECISION **)threading->workspace)[0][j+offset] += ((complex_PRECISION **)threading->workspace)[i][j+offset]; - } - } - END_MASTER(threading) - // only master needs the result in this case (it will be distributed later) -} -#endif - -#ifdef GRAM_SCHMIDT_VECTORIZED_PRECISION -void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading) { - - int thread_start; - int thread_end; - int cache_block_size = 12*16; - - compute_core_start_end_custom(start, end, &thread_start, &thread_end, l, threading, cache_block_size); - - __m128 dot_re[count]; - __m128 dot_im[count]; - __m128 dot_gamma5_re[count]; - __m128 dot_gamma5_im[count]; - - for ( int j=0; j can use 3 pre-defined +/-1 patterns - __m128 gamma5[3]; - gamma5[0] = _mm_set_ps( -1.0,-1.0,-1.0,-1.0 ); - gamma5[1] = _mm_set_ps( 1.0, 1.0,-1.0,-1.0 ); - gamma5[2] = _mm_set_ps( 1.0, 1.0, 1.0, 1.0 ); - - for(int m=0; m<3; m++) { - - sse_complex_deinterleaved_load( (float*)(V[j]+i+k+4*m), &vj_re, &vj_im ); - sse_complex_deinterleaved_load( (float*)(V[count]+i+k+4*m), &v_re, &v_im ); - - gamma5_vj_re = _mm_mul_ps(gamma5[m], vj_re); - gamma5_vj_im = _mm_mul_ps(gamma5[m], vj_im); - - cfnmadd(vj_re, vj_im, dot_re[j], dot_im[j], &v_re, &v_im); - cfnmadd(gamma5_vj_re, gamma5_vj_im, dot_gamma5_re[j], dot_gamma5_im[j], &v_re, &v_im); - - sse_complex_interleaved_store(v_re, v_im, (float*)(V[count]+i+k+4*m) ); - } - } - } - } -} -#endif - -#endif diff --git a/src/sse_linalg_generic.h b/src/sse_linalg_generic.h deleted file mode 100644 index 00390d5..0000000 --- a/src/sse_linalg_generic.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
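The setup_gram_schmidt_*_compute_dots kernel above evaluates both <v_j, w> and <v_j, gamma5 w> in a single pass by multiplying with precomputed +/-1 patterns instead of reading w twice. A scalar sketch of the same single-sweep idea; the 6/6 sign split per 12-component site is a simplification of the per-lane pattern used in the SSE code.

#include <complex.h>

/* Accumulate <vj, w> and <vj, gamma5 w> in one sweep over num_sites sites
 * of 12 complex components each; gamma5 is modeled as -1 on the first six
 * components of a site and +1 on the second six. */
static void dots_with_gamma5(const float complex *vj, const float complex *w,
                             int num_sites,
                             float complex *dot, float complex *dot_g5) {
  *dot = 0.0f; *dot_g5 = 0.0f;
  for (int s = 0; s < num_sites; s++) {
    for (int c = 0; c < 12; c++) {
      float sign = (c < 6) ? -1.0f : 1.0f;                 /* gamma5 sign  */
      float complex prod = conjf(vj[12*s + c]) * w[12*s + c];
      *dot    += prod;
      *dot_g5 += sign * prod;
    }
  }
}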
- * - */ - -#ifndef SSE_LINALG_PRECISION_HEADER - #define SSE_LINALG_PRECISION_HEADER - #ifdef SSE - - void gram_schmidt_on_aggregates_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Block-Gram-Schmidt on aggregates - void aggregate_block_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - // Standard Gram-Schmidt on aggregates - void aggregate_gram_schmidt_PRECISION_vectorized( complex_PRECISION *V, const int num_vec, level_struct *l, struct Thread *threading ); - - // Gram-Schmidt on a block of vectors, used by Block-Gram-Schmidt - void aggregate_gram_schmidt_block_PRECISION( PRECISION *V, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_orthogonalize_block_wrt_orthonormal_block_PRECISION( PRECISION *B, PRECISION *U, - int num_vec, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_dot_block_PRECISION( PRECISION *S, PRECISION *U, PRECISION *B, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - // used by Block-Gram-Schmidt - void aggregate_block_minus_block_times_dot_PRECISION( PRECISION *B, PRECISION *U, PRECISION *S, - int num_vec, int leading_dimension, level_struct *l, struct Thread *threading ); - - void setup_gram_schmidt_PRECISION_compute_dots( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - - void setup_gram_schmidt_PRECISION_axpys( - complex_PRECISION *thread_buffer, vector_PRECISION *V, int count, int offset, - int start, int end, level_struct *l, struct Thread *threading); - -#endif -#endif \ No newline at end of file diff --git a/src/sse_oddeven_generic.c b/src/sse_oddeven_generic.c deleted file mode 100644 index 724d2ee..0000000 --- a/src/sse_oddeven_generic.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
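The odd-even routines removed below act on one site parity at a time (_EVEN_SITES / _ODD_SITES) and assume that even sites are stored before odd sites. A tiny self-contained example of the parity convention this relies on, with an illustrative 4^4 lattice:

#include <stdio.h>

/* A site is "even" when the sum of its coordinates is even (red-black split). */
static int site_parity(int t, int z, int y, int x) {
  return (t + z + y + x) & 1;   /* 0 = even site, 1 = odd site */
}

int main(void) {
  int dims[4] = {4, 4, 4, 4}, n_even = 0, n_odd = 0;
  for (int t = 0; t < dims[0]; t++)
    for (int z = 0; z < dims[1]; z++)
      for (int y = 0; y < dims[2]; y++)
        for (int x = 0; x < dims[3]; x++)
          site_parity(t, z, y, x) ? n_odd++ : n_even++;
  printf("even sites: %d, odd sites: %d\n", n_even, n_odd);   /* 128 / 128 */
  return 0;
}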
- * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void hopping_term_PRECISION( vector_PRECISION eta, vector_PRECISION phi, operator_PRECISION_struct *op, - const int amount, level_struct *l, struct Thread *threading ) { - - int start_even, end_even, start_odd, end_odd, n = l->num_inner_lattice_sites, - *neighbor = op->neighbor_table, start=0, plus_dir_param=_FULL_SYSTEM, minus_dir_param=_FULL_SYSTEM; - - SYNC_CORES(threading) - - if ( amount == _EVEN_SITES || amount == _ODD_SITES ) { - compute_core_start_end_custom(0, op->num_even_sites, &start_even, &end_even, l, threading, 1 ); - compute_core_start_end_custom(op->num_even_sites, op->num_even_sites+op->num_odd_sites, &start_odd, &end_odd, l, threading, 1 ); - } else { - compute_core_start_end_custom(0, l->num_inner_lattice_sites, &start, &n, l, threading, 1 ); - } - - if ( amount == _EVEN_SITES ) { - start = start_odd, n = end_odd; - minus_dir_param = _ODD_SITES; - plus_dir_param = _EVEN_SITES; - } else if ( amount == _ODD_SITES ) { - start = start_even, n = end_even; - minus_dir_param = _EVEN_SITES; - plus_dir_param = _ODD_SITES; - } - - complex_PRECISION *prn[4] = { op->prnT, op->prnZ, op->prnY, op->prnX }; - complex_PRECISION *prp[4] = { op->prpT, op->prpZ, op->prpY, op->prpX }; - - // project minus dir - prp_PRECISION( prn, phi, 12*start, 12*n ); - - // start communication in negative direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - - // project plus dir and multiply with U dagger - prn_su3_PRECISION( prp, phi, op, neighbor, 12*start, 12*n ); - - if ( amount == _EVEN_SITES ) { - start = start_even, n = end_even; - } else if ( amount == _ODD_SITES ) { - start = start_odd, n = end_odd; - } - // start communication in positive direction - START_LOCKED_MASTER(threading) - ghost_sendrecv_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_sendrecv_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - // wait for communication in negative direction - ghost_wait_PRECISION( op->prnT, T, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnZ, Z, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnY, Y, -1, &(op->c), minus_dir_param, l ); - ghost_wait_PRECISION( op->prnX, X, -1, &(op->c), minus_dir_param, l ); - END_LOCKED_MASTER(threading) - - // multiply with U and lift up minus dir - su3_pbp_PRECISION( eta, prn, op, neighbor, 12*start, 12*n ); - - // wait for communication in positive direction - START_LOCKED_MASTER(threading) - ghost_wait_PRECISION( op->prpT, T, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpZ, Z, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpY, Y, +1, &(op->c), plus_dir_param, l ); - ghost_wait_PRECISION( op->prpX, X, +1, &(op->c), plus_dir_param, l ); - END_LOCKED_MASTER(threading) - - // lift up plus dir - pbn_PRECISION( eta, prp, 12*start, 12*n ); - - SYNC_CORES(threading) -} -#endif - -// ---- block odd even --------------------------------------------------- - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION -void 
schwarz_PRECISION_oddeven_setup( operator_PRECISION_struct *op, level_struct *l ) { - - PRECISION *clover_pt = op->clover_vectorized, *oe_clover_pt = op->oe_clover_vectorized; - int mu, i, d0, c0, b0, a0, d1, c1, b1, a1, t, z, y, x, agg_split[4], block_split[4], block_size[4]; - - if ( g.csw ) { - for ( mu=0; mu<4; mu++ ) { - agg_split[mu] = l->local_lattice[mu]/l->coarsening[mu]; - block_split[mu] = l->coarsening[mu]/l->block_lattice[mu]; - block_size[mu] = l->block_lattice[mu]; - } - - for ( d0=0; d0oe_clover, op->clover, 0, l->inner_vector_size, l ); -#ifdef HAVE_TM - vector_PRECISION_plus( op->oe_clover, op->oe_clover, op->tm_term, 0, l->inner_vector_size, l ); -#endif - } -} -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION -void block_diag_ee_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - PRECISION *clover_vectorized = s->op.oe_clover_vectorized + (start/12)*144; - int i, n1 = s->num_block_even_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; - // diagonal blocks applied to the even sites of a block - if ( g.csw ) { - for ( i=0; inum_block_even_sites, n2 = s->num_block_odd_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; - // diagonal blocks applied to the odd sites of a block - if ( g.csw ) { - error0("block_diag_oo_PRECISION is not available when using SSE\n"); - } else { - leta += n1*12; lphi += n1*12; clover += n1*12; - for ( i=0; i<12*n2; i++ ) - leta[i] = lphi[i]*clover[i]; - } - - END_UNTHREADED_FUNCTION(threading) -} -#endif - -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION -void block_diag_oo_inv_PRECISION( vector_PRECISION eta, vector_PRECISION phi, - int start, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - PRECISION *clover_vectorized = s->op.oe_clover_vectorized + (start/12)*144; - int i, n1 = s->num_block_even_sites, n2 = s->num_block_odd_sites; - config_PRECISION clover = (g.csw==0.0)?s->op.oe_clover+start:s->op.oe_clover+(start/12)*42; - vector_PRECISION lphi = phi+start, leta = eta+start; - // inverted diagonal blocks applied to the odd sites of a block - if ( g.csw ) { - leta += n1*12; lphi += n1*12; clover_vectorized += n1*144; - for ( i=0; idir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *neighbor = s->op.neighbor_table; - PRECISION *Dplus = s->op.D_vectorized + (start/12)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/12)*96; - - for ( int mu=0; mu<4; mu++ ) { - int a1, a2, n1, n2; - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_plus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), - mu, a1, n1, index[mu], neighbor ); - block_oddeven_minus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), - mu, a2, n2, index[mu], neighbor ); - } - - END_UNTHREADED_FUNCTION(threading) -} -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void block_n_hopping_term_PRECISION( vector_PRECISION eta, 
vector_PRECISION phi, - int start, int amount, schwarz_PRECISION_struct *s, level_struct *l, struct Thread *threading ) { - - START_UNTHREADED_FUNCTION(threading) - - int *length_even = s->dir_length_even, *length_odd = s->dir_length_odd, - **index = s->oe_index, *neighbor = s->op.neighbor_table; - PRECISION *Dplus = s->op.D_vectorized + (start/12)*96; - PRECISION *Dminus = s->op.D_transformed_vectorized + (start/12)*96; - - for ( int mu=0; mu<4; mu++ ) { - int a1, a2, n1, n2; - if ( amount == _EVEN_SITES ) { - a1 = 0; n1 = length_even[mu]; - a2 = n1; n2 = a2 + length_odd[mu]; - } else if ( amount == _ODD_SITES ) { - a1 = length_even[mu]; n1 = a1 + length_odd[mu]; - a2 = 0; n2 = a1; - } else { - a1 = 0; n1 = length_even[mu]+length_odd[mu]; - a2 = 0; n2 = n1; - } - block_oddeven_nplus_coupling_PRECISION( (PRECISION*)(eta+start), Dplus, (PRECISION*)(phi+start), - mu, a1, n1, index[mu], neighbor ); - block_oddeven_nminus_coupling_PRECISION( (PRECISION*)(eta+start), Dminus, (PRECISION*)(phi+start), - mu, a2, n2, index[mu], neighbor ); - } - - END_UNTHREADED_FUNCTION(threading) -} -#endif - - -#endif // SSE - diff --git a/src/sse_schwarz_generic.c b/src/sse_schwarz_generic.c deleted file mode 100644 index 9ef98d7..0000000 --- a/src/sse_schwarz_generic.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. 
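schwarz_PRECISION_setup below re-packs every SU(3) link via set_PRECISION_D_vectorized (kept further down in the deleted header) into a padded split real/imaginary layout: each column is stored as three reals, a zero pad, three imaginaries, a zero pad. A plain-C reference for how a matrix-vector product consumes that layout, assuming the column-major ordering of the first output array:

#include <complex.h>

/* out = U * in, with U stored as 24 floats: column i occupies
 * [re0,re1,re2,pad, im0,im1,im2,pad] starting at U + 8*i. */
static void su3_mvm_padded(float complex out[3], const float *U,
                           const float complex in[3]) {
  for (int r = 0; r < 3; r++) out[r] = 0.0f;
  for (int i = 0; i < 3; i++) {                    /* loop over columns     */
    const float *col_re = U + 8*i;                 /* [re0,re1,re2,pad]     */
    const float *col_im = U + 8*i + 4;             /* [im0,im1,im2,pad]     */
    float complex xi = in[i];
    for (int r = 0; r < 3; r++)
      out[r] += (col_re[r] + I*col_im[r]) * xi;    /* out += U[:,i] * in[i] */
  }
}

The pad lanes carry zeros, so a 4-wide SIMD load of each column half is safe and the broadcast multiply-add over columns maps directly onto SSE registers.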
- * - */ - -#include "main.h" - -#ifdef SSE - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, - schwarz_PRECISION_struct *s, level_struct *l ) { - int *bbl = s->block_boundary_length; - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_plus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_minus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -} -#endif - - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void n_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, int k, - schwarz_PRECISION_struct *s, level_struct *l ) { - int *bbl = s->block_boundary_length; - PRECISION *Dplus = s->op.D_vectorized; - PRECISION *Dminus = s->op.D_transformed_vectorized; - - for ( int mu=0; mu<4; mu++ ) { - boundary_nplus_coupling_PRECISION( (PRECISION*)eta, Dplus, (PRECISION*)phi, - mu, bbl[2*mu], bbl[2*mu+1], s->block[k].bt, NULL ); - boundary_nminus_coupling_PRECISION( (PRECISION*)eta, Dminus, (PRECISION*)phi, - mu, bbl[2*mu+1], bbl[2*mu+2], s->block[k].bt, NULL ); - } -} -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, - int k, schwarz_PRECISION_struct *s, level_struct *l ) { - // k: number of current block - int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -} -#endif - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -void n_coarse_block_PRECISION_boundary_op( vector_PRECISION eta, vector_PRECISION phi, - int k, schwarz_PRECISION_struct *s, level_struct *l ) { - // k: number of current block - int *bbl = s->block_boundary_length, n = l->num_lattice_site_var; - int column_offset = SIMD_LENGTH_PRECISION*((l->num_lattice_site_var+SIMD_LENGTH_PRECISION-1)/SIMD_LENGTH_PRECISION); - int vectorized_link_offset = 2*l->num_lattice_site_var*column_offset; - - for ( int mu=0; mu<4; mu++ ) { - OPERATOR_TYPE_PRECISION *Dplus = s->op.D_vectorized + mu*vectorized_link_offset; - OPERATOR_TYPE_PRECISION *Dminus = s->op.D_transformed_vectorized + mu*vectorized_link_offset; - // plus mu direction - for ( int i=bbl[2*mu]; iblock[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + 
n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dplus + 4*vectorized_link_offset*index, l ); - } - // minus mu direction - for ( int i=bbl[2*mu+1]; i<bbl[2*mu+2]; i+=2 ) { - int index = s->block[k].bt[i]; - int neighbor_index = s->block[k].bt[i+1]; - vector_PRECISION phi_pt = phi + n*neighbor_index; - vector_PRECISION eta_pt = eta + n*index; - coarse_n_hopp_PRECISION_vectorized( eta_pt, phi_pt, Dminus + 4*vectorized_link_offset*neighbor_index, l ); - } - } -} -#endif - -#if defined(OPTIMIZED_NEIGHBOR_COUPLING_PRECISION) || defined(OPTIMIZED_SELF_COUPLING_PRECISION) -void schwarz_PRECISION_setup( schwarz_PRECISION_struct *s, operator_double_struct *op_in, level_struct *l ) { - -/********************************************************************************* -* Copies the Dirac operator and the clover term from op_in into the Schwarz -* struct (this function is depth 0 only). -* - operator_double_struct *op_in: Input operator. -*********************************************************************************/ - - int i, index, n = l->num_inner_lattice_sites, *tt = s->op.translation_table; - config_PRECISION D_out_pt, clover_out_pt; - config_double D_in_pt = op_in->D, clover_in_pt = op_in->clover; -#ifdef HAVE_TM - config_PRECISION tm_term_out_pt, odd_proj_out_pt; - config_double tm_term_in_pt = op_in->tm_term, odd_proj_in_pt = op_in->odd_proj; -#endif - - for ( i=0; i<n; i++ ) { - index = tt[i]; - D_out_pt = s->op.D + 36*index; - FOR36( *D_out_pt = (complex_PRECISION) *D_in_pt; D_out_pt++; D_in_pt++; ) - } - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - for ( i=0; i<n; i++ ) { - PRECISION *D_vectorized = s->op.D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = s->op.D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = s->op.D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif - - if ( g.csw != 0 ) { - for ( i=0; i<n; i++ ) { - index = tt[i]; - clover_out_pt = s->op.clover + 42*index; -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - PRECISION *clover_out_vectorized_pt = s->op.clover_vectorized + 144*index; - sse_set_clover_PRECISION( clover_out_vectorized_pt, clover_in_pt ); -#endif - FOR42( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) - } - } else { - for ( i=0; i<n; i++ ) { - index = tt[i]; - clover_out_pt = s->op.clover + 12*index; - FOR12( *clover_out_pt = (complex_PRECISION) *clover_in_pt; clover_out_pt++; clover_in_pt++; ) - } - } - -#ifdef HAVE_TM - for ( i=0; i<n; i++ ) { - index = tt[i]; - tm_term_out_pt = s->op.tm_term + 12*index; -#ifdef OPTIMIZED_SELF_COUPLING_PRECISION - if ( g.csw != 0 ) { - PRECISION *clover_out_vectorized_pt = s->op.clover_vectorized + 144*index; - sse_add_diagonal_clover_PRECISION( clover_out_vectorized_pt, tm_term_in_pt ); - } -#endif - FOR12( *tm_term_out_pt = (complex_PRECISION) *tm_term_in_pt; tm_term_out_pt++; tm_term_in_pt++; ) - } - - for ( i=0; i<n; i++ ) { - index = tt[i]; - odd_proj_out_pt = s->op.odd_proj + 12*index; - FOR12( *odd_proj_out_pt = (complex_PRECISION) *odd_proj_in_pt; odd_proj_out_pt++; odd_proj_in_pt++; ) - } -#endif - - if ( g.odd_even ) - schwarz_PRECISION_oddeven_setup( &(s->op), l ); - - schwarz_PRECISION_boundary_update( s, l ); - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION - int start = l->num_lattice_sites; - int end = 2*l->num_lattice_sites - l->num_inner_lattice_sites; - for ( i=start; i<end; i++ ) { - PRECISION *D_vectorized = s->op.D_vectorized + 96*i; - PRECISION *D_transformed_vectorized = s->op.D_transformed_vectorized + 96*i; - complex_PRECISION *D_out_pt = s->op.D + 36*i; - for ( int mu=0; mu<4; mu++ ) { - set_PRECISION_D_vectorized( D_vectorized+24*mu, D_transformed_vectorized+24*mu, D_out_pt+9*mu ); - } - } -#endif -} -#endif - -#endif // SSE diff --git a/src/sse_schwarz_generic.h 
b/src/sse_schwarz_generic.h deleted file mode 100644 index 5bd6218..0000000 --- a/src/sse_schwarz_generic.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2016, Matthias Rottmann, Artur Strebel, Simon Heybrock, Simone Bacchio, Bjoern Leder. - * - * This file is part of the DDalphaAMG solver library. - * - * The DDalphaAMG solver library is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * The DDalphaAMG solver library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * - * You should have received a copy of the GNU General Public License - * along with the DDalphaAMG solver library. If not, see http://www.gnu.org/licenses/. - * - */ - -#ifndef SSE_SCHWARZ_PRECISION_H -#define SSE_SCHWARZ_PRECISION_H -#ifdef SSE - -#ifdef OPTIMIZED_NEIGHBOR_COUPLING_PRECISION -static inline void set_PRECISION_D_vectorized( PRECISION *out1, PRECISION *out2, complex_PRECISION *in ) { - // out1: column major, out2: row major - for ( int i=0; i<3; i++ ) { // column - for ( int j=0; j<3; j++ ) { // row - out1[8*i +j] = creal_PRECISION(in[3*j+i]); - out1[8*i+4+j] = cimag_PRECISION(in[3*j+i]); - out2[8*i +j] = creal_PRECISION(in[j+3*i]); - out2[8*i+4+j] = cimag_PRECISION(in[j+3*i]); - } - out1[8*i+3] = 0.0; - out1[8*i+7] = 0.0; - out2[8*i+3] = 0.0; - out2[8*i+7] = 0.0; - } -} -#endif - -#endif // SSE -#endif diff --git a/src/threading.c b/src/threading.c index b78fa1f..e5d3f2c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -31,7 +31,9 @@ void no_hyperthread_barrier(void *barrier, int id) } void core_barrier(int core) { +#ifdef OPENMP #pragma omp barrier +#endif } void hyperthread_barrier(void *barrier, int hyperthead) { diff --git a/src/threading.h b/src/threading.h index 7aafaed..f2e1742 100644 --- a/src/threading.h +++ b/src/threading.h @@ -43,11 +43,11 @@ // nested omp: split into cores, each core splits into hyperthreads (like DD preconditioner) #define CORE_BARRIER(threading) \ do { \ - threading->barrier(threading->core); \ + threading->barrier(threading->core); \ } while(0) #define HYPERTHREAD_BARRIER(threading) \ do { \ - threading->thread_barrier(threading->thread_barrier_data, threading->thread); \ + threading->thread_barrier(threading->thread_barrier_data, threading->thread); \ } while(0) #endif @@ -77,8 +77,10 @@ if(threading->thread == 0) \ CORE_BARRIER(threading); +#define MASTER(threading) \ + if(threading->core + threading->thread == 0) #define START_MASTER(threading) \ - if(threading->core + threading->thread == 0) { + MASTER(threading) { #define END_MASTER(threading) \ } @@ -101,7 +103,10 @@ #ifdef OPENMP #include <omp.h> +#define DO_PRAGMA(EXP) _Pragma (#EXP) +#define THREADED(EXP) DO_PRAGMA ( omp parallel num_threads( EXP ) ) #else +#define THREADED(EXP) static inline int omp_get_thread_num( void ) { return 0; } @@ -112,7 +117,7 @@ static inline int omp_get_num_threads( void ) { struct level_struct; -struct common_thread_data +typedef struct common_thread_data { // barrier among cores void (*barrier)(int); @@ -121,7 +126,7 @@ struct common_thread_data // *common* workspace for *all* threads // sometimes threads need to exchange data, they can use this char *workspace; -}; +} common_thread_data; void 
init_common_thread_data(struct common_thread_data *common); diff --git a/src/top_level.c b/src/top_level.c index 1a135b0..354f170 100644 --- a/src/top_level.c +++ b/src/top_level.c @@ -27,16 +27,13 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) if(threading->thread != 0) return; - int start = threading->start_index[l->depth]; - int end = threading->end_index[l->depth]; - if ( g.rhs == 0 ) { - vector_double_define( rhs, 1, start, end, l ); + vector_double_define_real( rhs, 1, 0, l->inner_vector_size, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = ones\n"); END_MASTER(threading) } else if ( g.rhs == 1 ) { - vector_double_define( rhs, 0, start, end, l ); + vector_double_define_zero( rhs, 0, l->inner_vector_size, l, threading ); if ( g.my_rank == 0 ) { START_LOCKED_MASTER(threading) rhs[0] = 1.0; @@ -47,17 +44,16 @@ void rhs_define( vector_double rhs, level_struct *l, struct Thread *threading ) END_MASTER(threading) } else if ( g.rhs == 2 ) { // this would yield different results if we threaded it, so we don't - START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); - END_LOCKED_MASTER(threading) + vector_double_define_random( rhs, 0, l->inner_vector_size, l, threading ); START_MASTER(threading) if ( g.print > 0 ) printf0("rhs = random\n"); END_MASTER(threading) } else if ( g.rhs == 3 ) { - vector_double_define( rhs, 0, start, end, l ); + vector_double_define_zero( rhs, 0, l->inner_vector_size, l, threading ); } else { ASSERT( g.rhs >= 0 && g.rhs <= 4 ); } + } @@ -65,8 +61,8 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l int iter = 0, start = threading->start_index[l->depth], end = threading->end_index[l->depth]; - vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; - vector_double sol = g.mixed_precision==2?g.p_MP.dp.x:g.p.x; + vector_double rhs = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.b:g.p.b; + vector_double sol = (g.mixed_precision==2 && g.method >= 0)?g.p_MP.dp.x:g.p.x; #ifdef WILSON_BENCHMARK START_MASTER(threading) @@ -106,12 +102,11 @@ int wilson_driver( vector_double solution, vector_double source, level_struct *l void solve( vector_double solution, vector_double source, level_struct *l, struct Thread *threading ) { - vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; - if ( g.vt.evaluation ) { + vector_double rhs = g.mixed_precision==2?g.p_MP.dp.b:g.p.b; // this would yield different results if we threaded it, so we don't + vector_double_define_random( rhs, 0, l->inner_vector_size, l, threading ); START_LOCKED_MASTER(threading) - vector_double_define_random( rhs, 0, l->inner_vector_size, l ); scan_var( &(g.vt), l ); END_LOCKED_MASTER(threading) } else { @@ -123,24 +118,34 @@ void solve_driver( level_struct *l, struct Thread *threading ) { vector_double solution = NULL, source = NULL; - double minus_twisted_bc[4]; - - START_LOCKED_MASTER(threading) + double minus_twisted_bc[4], norm; + if(g.bc==2) for ( int i=0; i<4; i++ ) - minus_twisted_bc[i] = g.twisted_bc[i]; - END_LOCKED_MASTER(threading) + minus_twisted_bc[i] = -1*g.twisted_bc[i]; +#ifdef HAVE_TM1p1 + if( g.epsbar != 0 || g.epsbar_ig5_odd_shift != 0 || g.epsbar_ig5_even_shift != 0 ) { + data_layout_n_flavours( 2, l, threading ); + printf0("inverting doublet operator\n"); + } +#endif PUBLIC_MALLOC( solution, complex_double, l->inner_vector_size ); PUBLIC_MALLOC( source, 
complex_double, l->inner_vector_size ); - + rhs_define( source, l, threading ); if(g.bc==2) apply_twisted_bc_to_vector_double( source, source, g.twisted_bc, l); + norm = global_norm_double( source, 0, l->inner_vector_size, l, threading ); + printf0("source vector norm: %le\n",norm); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 1 ) +#endif #ifdef HAVE_TM - if (g.tm_mu + g.tm_mu_odd_shift != 0.0 || g.tm_mu + g.tm_mu_even_shift != 0.0 ) + if ( g.mu + g.mu_odd_shift != 0.0 || g.mu + g.mu_even_shift != 0.0 ) if(g.downprop) { START_MASTER(threading) @@ -150,16 +155,17 @@ void solve_driver( level_struct *l, struct Thread *threading ) { solve( solution, source, l, threading ); if(g.bc==2) - apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); + apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); START_LOCKED_MASTER(threading) printf0("\n\n+-------------------------- down --------------------------+\n\n"); - g.tm_mu*=-1; - g.tm_mu_odd_shift*=-1; - g.tm_mu_even_shift*=-1; + g.mu*=-1; + g.mu_odd_shift*=-1; + g.mu_even_shift*=-1; END_LOCKED_MASTER(threading) - - optimized_shift_update( l->dirac_shift, l, threading ); + + tm_term_update( g.mu, l, threading ); + finalize_operator_update( l, threading ); } #endif @@ -167,8 +173,16 @@ void solve_driver( level_struct *l, struct Thread *threading ) { if(g.bc==2) apply_twisted_bc_to_vector_double( solution, solution, minus_twisted_bc, l); - + + norm = global_norm_double( solution, 0, l->inner_vector_size, l, threading ); + printf0("solution vector norm: %le\n",norm); + PUBLIC_FREE( solution, complex_double, l->inner_vector_size ); PUBLIC_FREE( source, complex_double, l->inner_vector_size ); + +#ifdef HAVE_TM1p1 + if( g.n_flavours == 2 ) + data_layout_n_flavours( 1, l, threading ); +#endif } diff --git a/src/var_table.h b/src/var_table.h index 53e766e..8066522 100644 --- a/src/var_table.h +++ b/src/var_table.h @@ -54,9 +54,11 @@ for ( int i=0; i<g.vt.average_over; i++ ) { \ g.vt.p_end->values[_TRCKD_VAL] = *tmp_var; \ parameter_update( l ); \ - if ( g.vt.shift_update ) \ - shift_update( *tmp_var, l, no_threading ); \ - if ( g.vt.re_setup ) { \ + if ( g.vt.shift_update ) { \ + m0_update( *tmp_var, l, no_threading ); \ + g.m0 = *tmp_var; \ + } \ + if ( g.vt.re_setup ) { \ double t0, t1; \ t0 = MPI_Wtime(); \ method_re_setup( l, no_threading ); \ @@ -67,20 +69,20 @@ printf0("scanning variable \"%s\", value: %lf, run %d of %d\n", name, (double)(*tmp_var), i+1, g.vt.average_over ); \ if ( g.vt.track_error ) { \ apply_operator_double( b, v, &(g.p), l, no_threading ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading ); \ if ( g.vt.track_cgn_error ) { \ ASSERT( g.method >=0 && g.p.restart_length >= 4 ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading ); \ cgn_double( &(g.p), l, no_threading ); \ vector_double_minus( x, x, v, 0, l->inner_vector_size, l ); \ g.vt.p_end->values[_CGNR_ERR] += ( global_norm_double( x, 0, l->inner_vector_size, l, no_threading ) / norm_v ) / ((double)g.vt.average_over); \ printf0("CGN: error norm: %le\n", g.vt.p_end->values[_CGNR_ERR] ); \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading ); \ } \ } else {\ rhs_define( b, l, no_threading );\ } \ - vector_double_define( x, 0, 0, l->inner_vector_size, l ); \ + vector_double_define_zero( x, 0, l->inner_vector_size, l, no_threading 
); \ if (g.mixed_precision==2) fgmres_MP( &(g.p_MP), l, no_threading ); \ else fgmres_double( &(g.p), l, no_threading ); \ if ( i == g.vt.average_over-1 ) prof_print( l ); \ diff --git a/src/vcycle_generic.c b/src/vcycle_generic.c index be46be6..e20e094 100644 --- a/src/vcycle_generic.c +++ b/src/vcycle_generic.c @@ -23,7 +23,7 @@ #include "vcycle_PRECISION.h" void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, - int n, const int res, complex_PRECISION shift, level_struct *l, struct Thread *threading ) { + int n, const int res, level_struct *l, struct Thread *threading ) { ASSERT( phi != eta ); @@ -41,7 +41,6 @@ void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRE int start = threading->start_index[l->depth]; int end = threading->end_index[l->depth]; START_LOCKED_MASTER(threading) - l->sp_PRECISION.shift = shift; l->sp_PRECISION.initial_guess_zero = res; l->sp_PRECISION.num_restart = n; END_LOCKED_MASTER(threading) @@ -128,14 +127,15 @@ void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECI g.coarse_time += MPI_Wtime(); END_MASTER(threading) } - if( i == 0 && res == _NO_RES ) - interpolate3_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); - else + if( i == 0 && res == _NO_RES ) { + vector_PRECISION_define_zero( phi, 0, l->inner_vector_size, l, threading ); interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); - smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, _NO_SHIFT, l, threading ); + } else + interpolate_PRECISION( phi, l->next_level->p_PRECISION.x, l, threading ); + smoother_PRECISION( phi, Dphi, eta, l->post_smooth_iter, _RES, l, threading ); res = _RES; } } else { - smoother_PRECISION( phi, Dphi, eta, (l->depth==0)?l->n_cy:l->post_smooth_iter, res, _NO_SHIFT, l, threading ); + smoother_PRECISION( phi, Dphi, eta, (l->depth==0)?l->n_cy:l->post_smooth_iter, res, l, threading ); } } diff --git a/src/vcycle_generic.h b/src/vcycle_generic.h index d8a5033..5e54a74 100644 --- a/src/vcycle_generic.h +++ b/src/vcycle_generic.h @@ -33,7 +33,7 @@ #include "solver_analysis.h" void smoother_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, - int n, const int res, complex_PRECISION shift, level_struct *l, struct Thread *threading ); + int n, const int res, level_struct *l, struct Thread *threading ); void vcycle_PRECISION( vector_PRECISION phi, vector_PRECISION Dphi, vector_PRECISION eta, int res, level_struct *l, struct Thread *threading ); diff --git a/src/vectorization_control.h b/src/vectorization_control.h index 772063b..abd5f3e 100644 --- a/src/vectorization_control.h +++ b/src/vectorization_control.h @@ -24,18 +24,16 @@ #ifdef SSE -#define SIMD_LENGTH_float 4 +#define SIMD_LENGTH_float 4 #define SIMD_LENGTH_double 2 -#ifndef HAVE_TM // TODO: make it work for TM -#define INTERPOLATION_OPERATOR_LAYOUT_OPTIMIZED_float -#define INTERPOLATION_SETUP_LAYOUT_OPTIMIZED_float -#endif -#define VECTORIZE_COARSE_OPERATOR_float -#define GRAM_SCHMIDT_VECTORIZED_float +#define OPTIMIZED_COARSE_NEIGHBOR_COUPLING_float +#define OPTIMIZED_COARSE_SELF_COUPLING_float +#define OPTIMIZED_INTERPOLATION_OPERATOR_float +#define OPTIMIZED_INTERPOLATION_SETUP_float +#define OPTIMIZED_NEIGHBOR_COUPLING_double #define OPTIMIZED_NEIGHBOR_COUPLING_float #define OPTIMIZED_SELF_COUPLING_float -#define OPTIMIZED_NEIGHBOR_COUPLING_double #define OPTIMIZED_LINALG_float #define OPTIMIZED_LINALG_double diff --git a/src/vectorization_dirac_generic.c 
b/src/vectorization_dirac_generic.c index a07919a..9ea2b3e 100644 --- a/src/vectorization_dirac_generic.c +++ b/src/vectorization_dirac_generic.c @@ -40,8 +40,8 @@ void d_plus_clover_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, comp config_PRECISION D = s->op.D; // add clover term/shift - spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+l->dirac_shift, offset ); - spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+l->dirac_shift, offset ); + spin0and1_site_clover_PRECISION_vectorized( eta1, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); + spin2and3_site_clover_PRECISION_vectorized( eta2, phi+site_offset*site, s->op.clover+42*site, 4+s->op.m0, offset ); index_out = site; @@ -102,3 +102,15 @@ void d_neighbor_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex twospin2_p_PRECISION_vectorized_simd_length( eta1, eta2, buffer, mu ); } #endif + +#ifdef SSE +void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, + int site ) { + + int offset = SIMD_LENGTH_PRECISION; + int site_offset = 12*offset; + + sse_diagonal_aggregate_PRECISION( eta1, eta2, phi+site_offset*site, s->op.odd_proj+12*site, offset ); +} +#endif diff --git a/src/vectorization_dirac_generic.h b/src/vectorization_dirac_generic.h index 3831b5d..5b8f02c 100644 --- a/src/vectorization_dirac_generic.h +++ b/src/vectorization_dirac_generic.h @@ -35,6 +35,10 @@ complex_PRECISION *phi, schwarz_PRECISION_struct *s, level_struct *l, int site, int *direction_flags ); + void diagonal_aggregate_PRECISION_vectorized( complex_PRECISION *eta1, complex_PRECISION *eta2, + complex_PRECISION *phi, schwarz_PRECISION_struct *s, + level_struct *l, int site ); + // spinors are vectorized, gauge is same for all (use for multiple rhs) static inline void mvm_PRECISION_vectorized_simd_length( const complex_PRECISION *eta, const complex_PRECISION *D, const complex_PRECISION *phi ) { @@ -129,4 +133,4 @@ #endif } -#endif \ No newline at end of file +#endif
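
Note on the THREADED(EXP) macro added to src/threading.h above: it expands via _Pragma to an "omp parallel num_threads(EXP)" region when OPENMP is defined, and to nothing otherwise, so the compound statement that follows runs either in parallel or serially. The snippet below is a minimal, self-contained usage sketch and is not part of the library: the macro bodies and the serial omp_get_* fallbacks are copied from the threading.h hunk, while main() and the thread count of 4 are illustrative assumptions.

/* build with e.g. "mpiicc -qopenmp -DOPENMP demo.c" or, serially, "mpiicc demo.c" */
#include <stdio.h>

#ifdef OPENMP
#include <omp.h>
#define DO_PRAGMA(EXP) _Pragma (#EXP)
#define THREADED(EXP) DO_PRAGMA ( omp parallel num_threads( EXP ) )
#else
#define THREADED(EXP)
static inline int omp_get_thread_num( void ) { return 0; }
static inline int omp_get_num_threads( void ) { return 1; }
#endif

int main( void ) {
  THREADED(4)   /* parallel region with 4 threads; a plain serial block without OPENMP */
  {
    printf( "thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads() );
  }
  return 0;
}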